1/* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "config.h" 29#include "HTMLTokenizer.h" 30 31#include "HTMLEntityParser.h" 32#include "HTMLTreeBuilder.h" 33#include "MarkupTokenizerInlines.h" 34#include "NotImplemented.h" 35#include <wtf/ASCIICType.h> 36#include <wtf/CurrentTime.h> 37#include <wtf/text/CString.h> 38 39using namespace WTF; 40 41namespace WebCore { 42 43using namespace HTMLNames; 44 45// This has to go in a .cpp file, as the linker doesn't like it being included more than once. 46// We don't have an HTMLToken.cpp though, so this is the next best place. 47QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const 48{ 49 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom); 50} 51 52bool AtomicHTMLToken::usesName() const 53{ 54 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 55} 56 57bool AtomicHTMLToken::usesAttributes() const 58{ 59 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 60} 61 62static inline UChar toLowerCase(UChar cc) 63{ 64 ASSERT(isASCIIUpper(cc)); 65 const int lowerCaseOffset = 0x20; 66 return cc + lowerCaseOffset; 67} 68 69static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string) 70{ 71 if (vector.size() != string.length()) 72 return false; 73 74 if (!string.length()) 75 return true; 76 77 return equal(string.impl(), vector.data(), vector.size()); 78} 79 80static inline bool isEndTagBufferingState(HTMLTokenizer::State state) 81{ 82 switch (state) { 83 case HTMLTokenizer::RCDATAEndTagOpenState: 84 case HTMLTokenizer::RCDATAEndTagNameState: 85 case HTMLTokenizer::RAWTEXTEndTagOpenState: 86 case HTMLTokenizer::RAWTEXTEndTagNameState: 87 case HTMLTokenizer::ScriptDataEndTagOpenState: 88 case HTMLTokenizer::ScriptDataEndTagNameState: 89 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 90 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 91 return true; 92 default: 93 return false; 94 } 95} 96 97#define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) 98#define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) 99#define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) 100#define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) 101 102HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options) 103 : m_inputStreamPreprocessor(this) 104 , m_options(options) 105{ 106 reset(); 107} 108 109HTMLTokenizer::~HTMLTokenizer() 110{ 111} 112 113void HTMLTokenizer::reset() 114{ 115 m_state = HTMLTokenizer::DataState; 116 m_token = 0; 117 m_forceNullCharacterReplacement = false; 118 m_shouldAllowCDATA = false; 119 m_additionalAllowedCharacter = '\0'; 120} 121 122inline bool HTMLTokenizer::processEntity(SegmentedString& source) 123{ 124 bool notEnoughCharacters = false; 125 StringBuilder decodedEntity; 126 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 127 if (notEnoughCharacters) 128 return false; 129 if (!success) { 130 ASSERT(decodedEntity.isEmpty()); 131 bufferCharacter('&'); 132 } else { 133 for (unsigned i = 0; i < decodedEntity.length(); ++i) 134 bufferCharacter(decodedEntity[i]); 135 } 136 return true; 137} 138 139bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 140{ 141 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 142 source.advanceAndUpdateLineNumber(); 143 if (m_token->type() == HTMLToken::Character) 144 return true; 145 m_token->beginEndTag(m_bufferedEndTagName); 146 m_bufferedEndTagName.clear(); 147 m_appropriateEndTagName.clear(); 148 m_temporaryBuffer.clear(); 149 return false; 150} 151 152#define FLUSH_AND_ADVANCE_TO(stateName) \ 153 do { \ 154 m_state = HTMLTokenizer::stateName; \ 155 if (flushBufferedEndTag(source)) \ 156 return true; \ 157 if (source.isEmpty() \ 158 || !m_inputStreamPreprocessor.peek(source)) \ 159 return haveBufferedCharacterToken(); \ 160 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 161 goto stateName; \ 162 } while (false) 163 164bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 165{ 166 m_state = state; 167 flushBufferedEndTag(source); 168 return true; 169} 170 171bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 172{ 173 // If we have a token in progress, then we're supposed to be called back 174 // with the same token so we can finish it. 175 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 176 m_token = &token; 177 178 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 179 // FIXME: This should call flushBufferedEndTag(). 180 // We started an end tag during our last iteration. 181 m_token->beginEndTag(m_bufferedEndTagName); 182 m_bufferedEndTagName.clear(); 183 m_appropriateEndTagName.clear(); 184 m_temporaryBuffer.clear(); 185 if (m_state == HTMLTokenizer::DataState) { 186 // We're back in the data state, so we must be done with the tag. 187 return true; 188 } 189 } 190 191 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 192 return haveBufferedCharacterToken(); 193 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 194 195 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 196 switch (m_state) { 197 HTML_BEGIN_STATE(DataState) { 198 if (cc == '&') 199 HTML_ADVANCE_TO(CharacterReferenceInDataState); 200 else if (cc == '<') { 201 if (m_token->type() == HTMLToken::Character) { 202 // We have a bunch of character tokens queued up that we 203 // are emitting lazily here. 204 return true; 205 } 206 HTML_ADVANCE_TO(TagOpenState); 207 } else if (cc == kEndOfFileMarker) 208 return emitEndOfFile(source); 209 else { 210 bufferCharacter(cc); 211 HTML_ADVANCE_TO(DataState); 212 } 213 } 214 END_STATE() 215 216 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 217 if (!processEntity(source)) 218 return haveBufferedCharacterToken(); 219 HTML_SWITCH_TO(DataState); 220 } 221 END_STATE() 222 223 HTML_BEGIN_STATE(RCDATAState) { 224 if (cc == '&') 225 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState); 226 else if (cc == '<') 227 HTML_ADVANCE_TO(RCDATALessThanSignState); 228 else if (cc == kEndOfFileMarker) 229 return emitEndOfFile(source); 230 else { 231 bufferCharacter(cc); 232 HTML_ADVANCE_TO(RCDATAState); 233 } 234 } 235 END_STATE() 236 237 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) { 238 if (!processEntity(source)) 239 return haveBufferedCharacterToken(); 240 HTML_SWITCH_TO(RCDATAState); 241 } 242 END_STATE() 243 244 HTML_BEGIN_STATE(RAWTEXTState) { 245 if (cc == '<') 246 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 247 else if (cc == kEndOfFileMarker) 248 return emitEndOfFile(source); 249 else { 250 bufferCharacter(cc); 251 HTML_ADVANCE_TO(RAWTEXTState); 252 } 253 } 254 END_STATE() 255 256 HTML_BEGIN_STATE(ScriptDataState) { 257 if (cc == '<') 258 HTML_ADVANCE_TO(ScriptDataLessThanSignState); 259 else if (cc == kEndOfFileMarker) 260 return emitEndOfFile(source); 261 else { 262 bufferCharacter(cc); 263 HTML_ADVANCE_TO(ScriptDataState); 264 } 265 } 266 END_STATE() 267 268 HTML_BEGIN_STATE(PLAINTEXTState) { 269 if (cc == kEndOfFileMarker) 270 return emitEndOfFile(source); 271 bufferCharacter(cc); 272 HTML_ADVANCE_TO(PLAINTEXTState); 273 } 274 END_STATE() 275 276 HTML_BEGIN_STATE(TagOpenState) { 277 if (cc == '!') 278 HTML_ADVANCE_TO(MarkupDeclarationOpenState); 279 else if (cc == '/') 280 HTML_ADVANCE_TO(EndTagOpenState); 281 else if (isASCIIUpper(cc)) { 282 m_token->beginStartTag(toLowerCase(cc)); 283 HTML_ADVANCE_TO(TagNameState); 284 } else if (isASCIILower(cc)) { 285 m_token->beginStartTag(cc); 286 HTML_ADVANCE_TO(TagNameState); 287 } else if (cc == '?') { 288 parseError(); 289 // The spec consumes the current character before switching 290 // to the bogus comment state, but it's easier to implement 291 // if we reconsume the current character. 292 HTML_RECONSUME_IN(BogusCommentState); 293 } else { 294 parseError(); 295 bufferCharacter('<'); 296 HTML_RECONSUME_IN(DataState); 297 } 298 } 299 END_STATE() 300 301 HTML_BEGIN_STATE(EndTagOpenState) { 302 if (isASCIIUpper(cc)) { 303 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); 304 m_appropriateEndTagName.clear(); 305 HTML_ADVANCE_TO(TagNameState); 306 } else if (isASCIILower(cc)) { 307 m_token->beginEndTag(static_cast<LChar>(cc)); 308 m_appropriateEndTagName.clear(); 309 HTML_ADVANCE_TO(TagNameState); 310 } else if (cc == '>') { 311 parseError(); 312 HTML_ADVANCE_TO(DataState); 313 } else if (cc == kEndOfFileMarker) { 314 parseError(); 315 bufferCharacter('<'); 316 bufferCharacter('/'); 317 HTML_RECONSUME_IN(DataState); 318 } else { 319 parseError(); 320 HTML_RECONSUME_IN(BogusCommentState); 321 } 322 } 323 END_STATE() 324 325 HTML_BEGIN_STATE(TagNameState) { 326 if (isTokenizerWhitespace(cc)) 327 HTML_ADVANCE_TO(BeforeAttributeNameState); 328 else if (cc == '/') 329 HTML_ADVANCE_TO(SelfClosingStartTagState); 330 else if (cc == '>') 331 return emitAndResumeIn(source, HTMLTokenizer::DataState); 332 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 333 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 334 else if (isASCIIUpper(cc)) { 335 m_token->appendToName(toLowerCase(cc)); 336 HTML_ADVANCE_TO(TagNameState); 337 } else if (cc == kEndOfFileMarker) { 338 parseError(); 339 HTML_RECONSUME_IN(DataState); 340 } else { 341 m_token->appendToName(cc); 342 HTML_ADVANCE_TO(TagNameState); 343 } 344 } 345 END_STATE() 346 347 HTML_BEGIN_STATE(RCDATALessThanSignState) { 348 if (cc == '/') { 349 m_temporaryBuffer.clear(); 350 ASSERT(m_bufferedEndTagName.isEmpty()); 351 HTML_ADVANCE_TO(RCDATAEndTagOpenState); 352 } else { 353 bufferCharacter('<'); 354 HTML_RECONSUME_IN(RCDATAState); 355 } 356 } 357 END_STATE() 358 359 HTML_BEGIN_STATE(RCDATAEndTagOpenState) { 360 if (isASCIIUpper(cc)) { 361 m_temporaryBuffer.append(static_cast<LChar>(cc)); 362 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 363 HTML_ADVANCE_TO(RCDATAEndTagNameState); 364 } else if (isASCIILower(cc)) { 365 m_temporaryBuffer.append(static_cast<LChar>(cc)); 366 addToPossibleEndTag(static_cast<LChar>(cc)); 367 HTML_ADVANCE_TO(RCDATAEndTagNameState); 368 } else { 369 bufferCharacter('<'); 370 bufferCharacter('/'); 371 HTML_RECONSUME_IN(RCDATAState); 372 } 373 } 374 END_STATE() 375 376 HTML_BEGIN_STATE(RCDATAEndTagNameState) { 377 if (isASCIIUpper(cc)) { 378 m_temporaryBuffer.append(static_cast<LChar>(cc)); 379 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 380 HTML_ADVANCE_TO(RCDATAEndTagNameState); 381 } else if (isASCIILower(cc)) { 382 m_temporaryBuffer.append(static_cast<LChar>(cc)); 383 addToPossibleEndTag(static_cast<LChar>(cc)); 384 HTML_ADVANCE_TO(RCDATAEndTagNameState); 385 } else { 386 if (isTokenizerWhitespace(cc)) { 387 if (isAppropriateEndTag()) { 388 m_temporaryBuffer.append(static_cast<LChar>(cc)); 389 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 390 } 391 } else if (cc == '/') { 392 if (isAppropriateEndTag()) { 393 m_temporaryBuffer.append(static_cast<LChar>(cc)); 394 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 395 } 396 } else if (cc == '>') { 397 if (isAppropriateEndTag()) { 398 m_temporaryBuffer.append(static_cast<LChar>(cc)); 399 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 400 } 401 } 402 bufferCharacter('<'); 403 bufferCharacter('/'); 404 m_token->appendToCharacter(m_temporaryBuffer); 405 m_bufferedEndTagName.clear(); 406 m_temporaryBuffer.clear(); 407 HTML_RECONSUME_IN(RCDATAState); 408 } 409 } 410 END_STATE() 411 412 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) { 413 if (cc == '/') { 414 m_temporaryBuffer.clear(); 415 ASSERT(m_bufferedEndTagName.isEmpty()); 416 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState); 417 } else { 418 bufferCharacter('<'); 419 HTML_RECONSUME_IN(RAWTEXTState); 420 } 421 } 422 END_STATE() 423 424 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) { 425 if (isASCIIUpper(cc)) { 426 m_temporaryBuffer.append(static_cast<LChar>(cc)); 427 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 428 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 429 } else if (isASCIILower(cc)) { 430 m_temporaryBuffer.append(static_cast<LChar>(cc)); 431 addToPossibleEndTag(static_cast<LChar>(cc)); 432 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 433 } else { 434 bufferCharacter('<'); 435 bufferCharacter('/'); 436 HTML_RECONSUME_IN(RAWTEXTState); 437 } 438 } 439 END_STATE() 440 441 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) { 442 if (isASCIIUpper(cc)) { 443 m_temporaryBuffer.append(static_cast<LChar>(cc)); 444 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 445 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 446 } else if (isASCIILower(cc)) { 447 m_temporaryBuffer.append(static_cast<LChar>(cc)); 448 addToPossibleEndTag(static_cast<LChar>(cc)); 449 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 450 } else { 451 if (isTokenizerWhitespace(cc)) { 452 if (isAppropriateEndTag()) { 453 m_temporaryBuffer.append(static_cast<LChar>(cc)); 454 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 455 } 456 } else if (cc == '/') { 457 if (isAppropriateEndTag()) { 458 m_temporaryBuffer.append(static_cast<LChar>(cc)); 459 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 460 } 461 } else if (cc == '>') { 462 if (isAppropriateEndTag()) { 463 m_temporaryBuffer.append(static_cast<LChar>(cc)); 464 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 465 } 466 } 467 bufferCharacter('<'); 468 bufferCharacter('/'); 469 m_token->appendToCharacter(m_temporaryBuffer); 470 m_bufferedEndTagName.clear(); 471 m_temporaryBuffer.clear(); 472 HTML_RECONSUME_IN(RAWTEXTState); 473 } 474 } 475 END_STATE() 476 477 HTML_BEGIN_STATE(ScriptDataLessThanSignState) { 478 if (cc == '/') { 479 m_temporaryBuffer.clear(); 480 ASSERT(m_bufferedEndTagName.isEmpty()); 481 HTML_ADVANCE_TO(ScriptDataEndTagOpenState); 482 } else if (cc == '!') { 483 bufferCharacter('<'); 484 bufferCharacter('!'); 485 HTML_ADVANCE_TO(ScriptDataEscapeStartState); 486 } else { 487 bufferCharacter('<'); 488 HTML_RECONSUME_IN(ScriptDataState); 489 } 490 } 491 END_STATE() 492 493 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) { 494 if (isASCIIUpper(cc)) { 495 m_temporaryBuffer.append(static_cast<LChar>(cc)); 496 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 497 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 498 } else if (isASCIILower(cc)) { 499 m_temporaryBuffer.append(static_cast<LChar>(cc)); 500 addToPossibleEndTag(static_cast<LChar>(cc)); 501 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 502 } else { 503 bufferCharacter('<'); 504 bufferCharacter('/'); 505 HTML_RECONSUME_IN(ScriptDataState); 506 } 507 } 508 END_STATE() 509 510 HTML_BEGIN_STATE(ScriptDataEndTagNameState) { 511 if (isASCIIUpper(cc)) { 512 m_temporaryBuffer.append(static_cast<LChar>(cc)); 513 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 514 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 515 } else if (isASCIILower(cc)) { 516 m_temporaryBuffer.append(static_cast<LChar>(cc)); 517 addToPossibleEndTag(static_cast<LChar>(cc)); 518 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 519 } else { 520 if (isTokenizerWhitespace(cc)) { 521 if (isAppropriateEndTag()) { 522 m_temporaryBuffer.append(static_cast<LChar>(cc)); 523 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 524 } 525 } else if (cc == '/') { 526 if (isAppropriateEndTag()) { 527 m_temporaryBuffer.append(static_cast<LChar>(cc)); 528 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 529 } 530 } else if (cc == '>') { 531 if (isAppropriateEndTag()) { 532 m_temporaryBuffer.append(static_cast<LChar>(cc)); 533 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 534 } 535 } 536 bufferCharacter('<'); 537 bufferCharacter('/'); 538 m_token->appendToCharacter(m_temporaryBuffer); 539 m_bufferedEndTagName.clear(); 540 m_temporaryBuffer.clear(); 541 HTML_RECONSUME_IN(ScriptDataState); 542 } 543 } 544 END_STATE() 545 546 HTML_BEGIN_STATE(ScriptDataEscapeStartState) { 547 if (cc == '-') { 548 bufferCharacter(cc); 549 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState); 550 } else 551 HTML_RECONSUME_IN(ScriptDataState); 552 } 553 END_STATE() 554 555 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) { 556 if (cc == '-') { 557 bufferCharacter(cc); 558 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 559 } else 560 HTML_RECONSUME_IN(ScriptDataState); 561 } 562 END_STATE() 563 564 HTML_BEGIN_STATE(ScriptDataEscapedState) { 565 if (cc == '-') { 566 bufferCharacter(cc); 567 HTML_ADVANCE_TO(ScriptDataEscapedDashState); 568 } else if (cc == '<') 569 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 570 else if (cc == kEndOfFileMarker) { 571 parseError(); 572 HTML_RECONSUME_IN(DataState); 573 } else { 574 bufferCharacter(cc); 575 HTML_ADVANCE_TO(ScriptDataEscapedState); 576 } 577 } 578 END_STATE() 579 580 HTML_BEGIN_STATE(ScriptDataEscapedDashState) { 581 if (cc == '-') { 582 bufferCharacter(cc); 583 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 584 } else if (cc == '<') 585 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 586 else if (cc == kEndOfFileMarker) { 587 parseError(); 588 HTML_RECONSUME_IN(DataState); 589 } else { 590 bufferCharacter(cc); 591 HTML_ADVANCE_TO(ScriptDataEscapedState); 592 } 593 } 594 END_STATE() 595 596 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) { 597 if (cc == '-') { 598 bufferCharacter(cc); 599 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 600 } else if (cc == '<') 601 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 602 else if (cc == '>') { 603 bufferCharacter(cc); 604 HTML_ADVANCE_TO(ScriptDataState); 605 } else if (cc == kEndOfFileMarker) { 606 parseError(); 607 HTML_RECONSUME_IN(DataState); 608 } else { 609 bufferCharacter(cc); 610 HTML_ADVANCE_TO(ScriptDataEscapedState); 611 } 612 } 613 END_STATE() 614 615 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 616 if (cc == '/') { 617 m_temporaryBuffer.clear(); 618 ASSERT(m_bufferedEndTagName.isEmpty()); 619 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 620 } else if (isASCIIUpper(cc)) { 621 bufferCharacter('<'); 622 bufferCharacter(cc); 623 m_temporaryBuffer.clear(); 624 m_temporaryBuffer.append(toLowerCase(cc)); 625 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 626 } else if (isASCIILower(cc)) { 627 bufferCharacter('<'); 628 bufferCharacter(cc); 629 m_temporaryBuffer.clear(); 630 m_temporaryBuffer.append(static_cast<LChar>(cc)); 631 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 632 } else { 633 bufferCharacter('<'); 634 HTML_RECONSUME_IN(ScriptDataEscapedState); 635 } 636 } 637 END_STATE() 638 639 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 640 if (isASCIIUpper(cc)) { 641 m_temporaryBuffer.append(static_cast<LChar>(cc)); 642 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 643 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 644 } else if (isASCIILower(cc)) { 645 m_temporaryBuffer.append(static_cast<LChar>(cc)); 646 addToPossibleEndTag(static_cast<LChar>(cc)); 647 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 648 } else { 649 bufferCharacter('<'); 650 bufferCharacter('/'); 651 HTML_RECONSUME_IN(ScriptDataEscapedState); 652 } 653 } 654 END_STATE() 655 656 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 657 if (isASCIIUpper(cc)) { 658 m_temporaryBuffer.append(static_cast<LChar>(cc)); 659 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 660 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 661 } else if (isASCIILower(cc)) { 662 m_temporaryBuffer.append(static_cast<LChar>(cc)); 663 addToPossibleEndTag(static_cast<LChar>(cc)); 664 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 665 } else { 666 if (isTokenizerWhitespace(cc)) { 667 if (isAppropriateEndTag()) { 668 m_temporaryBuffer.append(static_cast<LChar>(cc)); 669 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 670 } 671 } else if (cc == '/') { 672 if (isAppropriateEndTag()) { 673 m_temporaryBuffer.append(static_cast<LChar>(cc)); 674 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 675 } 676 } else if (cc == '>') { 677 if (isAppropriateEndTag()) { 678 m_temporaryBuffer.append(static_cast<LChar>(cc)); 679 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 680 } 681 } 682 bufferCharacter('<'); 683 bufferCharacter('/'); 684 m_token->appendToCharacter(m_temporaryBuffer); 685 m_bufferedEndTagName.clear(); 686 m_temporaryBuffer.clear(); 687 HTML_RECONSUME_IN(ScriptDataEscapedState); 688 } 689 } 690 END_STATE() 691 692 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 693 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 694 bufferCharacter(cc); 695 if (temporaryBufferIs(scriptTag.localName())) 696 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 697 else 698 HTML_ADVANCE_TO(ScriptDataEscapedState); 699 } else if (isASCIIUpper(cc)) { 700 bufferCharacter(cc); 701 m_temporaryBuffer.append(toLowerCase(cc)); 702 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 703 } else if (isASCIILower(cc)) { 704 bufferCharacter(cc); 705 m_temporaryBuffer.append(static_cast<LChar>(cc)); 706 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 707 } else 708 HTML_RECONSUME_IN(ScriptDataEscapedState); 709 } 710 END_STATE() 711 712 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) { 713 if (cc == '-') { 714 bufferCharacter(cc); 715 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState); 716 } else if (cc == '<') { 717 bufferCharacter(cc); 718 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 719 } else if (cc == kEndOfFileMarker) { 720 parseError(); 721 HTML_RECONSUME_IN(DataState); 722 } else { 723 bufferCharacter(cc); 724 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 725 } 726 } 727 END_STATE() 728 729 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 730 if (cc == '-') { 731 bufferCharacter(cc); 732 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 733 } else if (cc == '<') { 734 bufferCharacter(cc); 735 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 736 } else if (cc == kEndOfFileMarker) { 737 parseError(); 738 HTML_RECONSUME_IN(DataState); 739 } else { 740 bufferCharacter(cc); 741 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 742 } 743 } 744 END_STATE() 745 746 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 747 if (cc == '-') { 748 bufferCharacter(cc); 749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 750 } else if (cc == '<') { 751 bufferCharacter(cc); 752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 753 } else if (cc == '>') { 754 bufferCharacter(cc); 755 HTML_ADVANCE_TO(ScriptDataState); 756 } else if (cc == kEndOfFileMarker) { 757 parseError(); 758 HTML_RECONSUME_IN(DataState); 759 } else { 760 bufferCharacter(cc); 761 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 762 } 763 } 764 END_STATE() 765 766 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 767 if (cc == '/') { 768 bufferCharacter(cc); 769 m_temporaryBuffer.clear(); 770 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 771 } else 772 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 773 } 774 END_STATE() 775 776 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 777 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 778 bufferCharacter(cc); 779 if (temporaryBufferIs(scriptTag.localName())) 780 HTML_ADVANCE_TO(ScriptDataEscapedState); 781 else 782 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 783 } else if (isASCIIUpper(cc)) { 784 bufferCharacter(cc); 785 m_temporaryBuffer.append(toLowerCase(cc)); 786 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 787 } else if (isASCIILower(cc)) { 788 bufferCharacter(cc); 789 m_temporaryBuffer.append(static_cast<LChar>(cc)); 790 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 791 } else 792 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 793 } 794 END_STATE() 795 796 HTML_BEGIN_STATE(BeforeAttributeNameState) { 797 if (isTokenizerWhitespace(cc)) 798 HTML_ADVANCE_TO(BeforeAttributeNameState); 799 else if (cc == '/') 800 HTML_ADVANCE_TO(SelfClosingStartTagState); 801 else if (cc == '>') 802 return emitAndResumeIn(source, HTMLTokenizer::DataState); 803 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 804 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 805 else if (isASCIIUpper(cc)) { 806 m_token->addNewAttribute(); 807 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 808 m_token->appendToAttributeName(toLowerCase(cc)); 809 HTML_ADVANCE_TO(AttributeNameState); 810 } else if (cc == kEndOfFileMarker) { 811 parseError(); 812 HTML_RECONSUME_IN(DataState); 813 } else { 814 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 815 parseError(); 816 m_token->addNewAttribute(); 817 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 818 m_token->appendToAttributeName(cc); 819 HTML_ADVANCE_TO(AttributeNameState); 820 } 821 } 822 END_STATE() 823 824 HTML_BEGIN_STATE(AttributeNameState) { 825 if (isTokenizerWhitespace(cc)) { 826 m_token->endAttributeName(source.numberOfCharactersConsumed()); 827 HTML_ADVANCE_TO(AfterAttributeNameState); 828 } else if (cc == '/') { 829 m_token->endAttributeName(source.numberOfCharactersConsumed()); 830 HTML_ADVANCE_TO(SelfClosingStartTagState); 831 } else if (cc == '=') { 832 m_token->endAttributeName(source.numberOfCharactersConsumed()); 833 HTML_ADVANCE_TO(BeforeAttributeValueState); 834 } else if (cc == '>') { 835 m_token->endAttributeName(source.numberOfCharactersConsumed()); 836 return emitAndResumeIn(source, HTMLTokenizer::DataState); 837 } else if (m_options.usePreHTML5ParserQuirks && cc == '<') { 838 m_token->endAttributeName(source.numberOfCharactersConsumed()); 839 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 840 } else if (isASCIIUpper(cc)) { 841 m_token->appendToAttributeName(toLowerCase(cc)); 842 HTML_ADVANCE_TO(AttributeNameState); 843 } else if (cc == kEndOfFileMarker) { 844 parseError(); 845 m_token->endAttributeName(source.numberOfCharactersConsumed()); 846 HTML_RECONSUME_IN(DataState); 847 } else { 848 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 849 parseError(); 850 m_token->appendToAttributeName(cc); 851 HTML_ADVANCE_TO(AttributeNameState); 852 } 853 } 854 END_STATE() 855 856 HTML_BEGIN_STATE(AfterAttributeNameState) { 857 if (isTokenizerWhitespace(cc)) 858 HTML_ADVANCE_TO(AfterAttributeNameState); 859 else if (cc == '/') 860 HTML_ADVANCE_TO(SelfClosingStartTagState); 861 else if (cc == '=') 862 HTML_ADVANCE_TO(BeforeAttributeValueState); 863 else if (cc == '>') 864 return emitAndResumeIn(source, HTMLTokenizer::DataState); 865 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 866 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 867 else if (isASCIIUpper(cc)) { 868 m_token->addNewAttribute(); 869 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 870 m_token->appendToAttributeName(toLowerCase(cc)); 871 HTML_ADVANCE_TO(AttributeNameState); 872 } else if (cc == kEndOfFileMarker) { 873 parseError(); 874 HTML_RECONSUME_IN(DataState); 875 } else { 876 if (cc == '"' || cc == '\'' || cc == '<') 877 parseError(); 878 m_token->addNewAttribute(); 879 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 880 m_token->appendToAttributeName(cc); 881 HTML_ADVANCE_TO(AttributeNameState); 882 } 883 } 884 END_STATE() 885 886 HTML_BEGIN_STATE(BeforeAttributeValueState) { 887 if (isTokenizerWhitespace(cc)) 888 HTML_ADVANCE_TO(BeforeAttributeValueState); 889 else if (cc == '"') { 890 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 891 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 892 } else if (cc == '&') { 893 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 894 HTML_RECONSUME_IN(AttributeValueUnquotedState); 895 } else if (cc == '\'') { 896 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 897 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 898 } else if (cc == '>') { 899 parseError(); 900 return emitAndResumeIn(source, HTMLTokenizer::DataState); 901 } else if (cc == kEndOfFileMarker) { 902 parseError(); 903 HTML_RECONSUME_IN(DataState); 904 } else { 905 if (cc == '<' || cc == '=' || cc == '`') 906 parseError(); 907 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 908 m_token->appendToAttributeValue(cc); 909 HTML_ADVANCE_TO(AttributeValueUnquotedState); 910 } 911 } 912 END_STATE() 913 914 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 915 if (cc == '"') { 916 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 917 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 918 } else if (cc == '&') { 919 m_additionalAllowedCharacter = '"'; 920 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 921 } else if (cc == kEndOfFileMarker) { 922 parseError(); 923 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 924 HTML_RECONSUME_IN(DataState); 925 } else { 926 m_token->appendToAttributeValue(cc); 927 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 928 } 929 } 930 END_STATE() 931 932 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 933 if (cc == '\'') { 934 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 935 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 936 } else if (cc == '&') { 937 m_additionalAllowedCharacter = '\''; 938 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 939 } else if (cc == kEndOfFileMarker) { 940 parseError(); 941 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 942 HTML_RECONSUME_IN(DataState); 943 } else { 944 m_token->appendToAttributeValue(cc); 945 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 946 } 947 } 948 END_STATE() 949 950 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 951 if (isTokenizerWhitespace(cc)) { 952 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 953 HTML_ADVANCE_TO(BeforeAttributeNameState); 954 } else if (cc == '&') { 955 m_additionalAllowedCharacter = '>'; 956 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 957 } else if (cc == '>') { 958 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 959 return emitAndResumeIn(source, HTMLTokenizer::DataState); 960 } else if (cc == kEndOfFileMarker) { 961 parseError(); 962 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 963 HTML_RECONSUME_IN(DataState); 964 } else { 965 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 966 parseError(); 967 m_token->appendToAttributeValue(cc); 968 HTML_ADVANCE_TO(AttributeValueUnquotedState); 969 } 970 } 971 END_STATE() 972 973 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { 974 bool notEnoughCharacters = false; 975 StringBuilder decodedEntity; 976 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 977 if (notEnoughCharacters) 978 return haveBufferedCharacterToken(); 979 if (!success) { 980 ASSERT(decodedEntity.isEmpty()); 981 m_token->appendToAttributeValue('&'); 982 } else { 983 for (unsigned i = 0; i < decodedEntity.length(); ++i) 984 m_token->appendToAttributeValue(decodedEntity[i]); 985 } 986 // We're supposed to switch back to the attribute value state that 987 // we were in when we were switched into this state. Rather than 988 // keeping track of this explictly, we observe that the previous 989 // state can be determined by m_additionalAllowedCharacter. 990 if (m_additionalAllowedCharacter == '"') 991 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); 992 else if (m_additionalAllowedCharacter == '\'') 993 HTML_SWITCH_TO(AttributeValueSingleQuotedState); 994 else if (m_additionalAllowedCharacter == '>') 995 HTML_SWITCH_TO(AttributeValueUnquotedState); 996 else 997 ASSERT_NOT_REACHED(); 998 } 999 END_STATE() 1000 1001 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 1002 if (isTokenizerWhitespace(cc)) 1003 HTML_ADVANCE_TO(BeforeAttributeNameState); 1004 else if (cc == '/') 1005 HTML_ADVANCE_TO(SelfClosingStartTagState); 1006 else if (cc == '>') 1007 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1008 else if (m_options.usePreHTML5ParserQuirks && cc == '<') 1009 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1010 else if (cc == kEndOfFileMarker) { 1011 parseError(); 1012 HTML_RECONSUME_IN(DataState); 1013 } else { 1014 parseError(); 1015 HTML_RECONSUME_IN(BeforeAttributeNameState); 1016 } 1017 } 1018 END_STATE() 1019 1020 HTML_BEGIN_STATE(SelfClosingStartTagState) { 1021 if (cc == '>') { 1022 m_token->setSelfClosing(); 1023 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1024 } else if (cc == kEndOfFileMarker) { 1025 parseError(); 1026 HTML_RECONSUME_IN(DataState); 1027 } else { 1028 parseError(); 1029 HTML_RECONSUME_IN(BeforeAttributeNameState); 1030 } 1031 } 1032 END_STATE() 1033 1034 HTML_BEGIN_STATE(BogusCommentState) { 1035 m_token->beginComment(); 1036 HTML_RECONSUME_IN(ContinueBogusCommentState); 1037 } 1038 END_STATE() 1039 1040 HTML_BEGIN_STATE(ContinueBogusCommentState) { 1041 if (cc == '>') 1042 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1043 else if (cc == kEndOfFileMarker) 1044 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1045 else { 1046 m_token->appendToComment(cc); 1047 HTML_ADVANCE_TO(ContinueBogusCommentState); 1048 } 1049 } 1050 END_STATE() 1051 1052 HTML_BEGIN_STATE(MarkupDeclarationOpenState) { 1053 DEPRECATED_DEFINE_STATIC_LOCAL(String, dashDashString, (ASCIILiteral("--"))); 1054 DEPRECATED_DEFINE_STATIC_LOCAL(String, doctypeString, (ASCIILiteral("doctype"))); 1055 DEPRECATED_DEFINE_STATIC_LOCAL(String, cdataString, (ASCIILiteral("[CDATA["))); 1056 if (cc == '-') { 1057 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); 1058 if (result == SegmentedString::DidMatch) { 1059 source.advanceAndASSERT('-'); 1060 source.advanceAndASSERT('-'); 1061 m_token->beginComment(); 1062 HTML_SWITCH_TO(CommentStartState); 1063 } else if (result == SegmentedString::NotEnoughCharacters) 1064 return haveBufferedCharacterToken(); 1065 } else if (cc == 'D' || cc == 'd') { 1066 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); 1067 if (result == SegmentedString::DidMatch) { 1068 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1069 HTML_SWITCH_TO(DOCTYPEState); 1070 } else if (result == SegmentedString::NotEnoughCharacters) 1071 return haveBufferedCharacterToken(); 1072 } else if (cc == '[' && shouldAllowCDATA()) { 1073 SegmentedString::LookAheadResult result = source.lookAhead(cdataString); 1074 if (result == SegmentedString::DidMatch) { 1075 advanceStringAndASSERT(source, "[CDATA["); 1076 HTML_SWITCH_TO(CDATASectionState); 1077 } else if (result == SegmentedString::NotEnoughCharacters) 1078 return haveBufferedCharacterToken(); 1079 } 1080 parseError(); 1081 HTML_RECONSUME_IN(BogusCommentState); 1082 } 1083 END_STATE() 1084 1085 HTML_BEGIN_STATE(CommentStartState) { 1086 if (cc == '-') 1087 HTML_ADVANCE_TO(CommentStartDashState); 1088 else if (cc == '>') { 1089 parseError(); 1090 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1091 } else if (cc == kEndOfFileMarker) { 1092 parseError(); 1093 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1094 } else { 1095 m_token->appendToComment(cc); 1096 HTML_ADVANCE_TO(CommentState); 1097 } 1098 } 1099 END_STATE() 1100 1101 HTML_BEGIN_STATE(CommentStartDashState) { 1102 if (cc == '-') 1103 HTML_ADVANCE_TO(CommentEndState); 1104 else if (cc == '>') { 1105 parseError(); 1106 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1107 } else if (cc == kEndOfFileMarker) { 1108 parseError(); 1109 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1110 } else { 1111 m_token->appendToComment('-'); 1112 m_token->appendToComment(cc); 1113 HTML_ADVANCE_TO(CommentState); 1114 } 1115 } 1116 END_STATE() 1117 1118 HTML_BEGIN_STATE(CommentState) { 1119 if (cc == '-') 1120 HTML_ADVANCE_TO(CommentEndDashState); 1121 else if (cc == kEndOfFileMarker) { 1122 parseError(); 1123 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1124 } else { 1125 m_token->appendToComment(cc); 1126 HTML_ADVANCE_TO(CommentState); 1127 } 1128 } 1129 END_STATE() 1130 1131 HTML_BEGIN_STATE(CommentEndDashState) { 1132 if (cc == '-') 1133 HTML_ADVANCE_TO(CommentEndState); 1134 else if (cc == kEndOfFileMarker) { 1135 parseError(); 1136 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1137 } else { 1138 m_token->appendToComment('-'); 1139 m_token->appendToComment(cc); 1140 HTML_ADVANCE_TO(CommentState); 1141 } 1142 } 1143 END_STATE() 1144 1145 HTML_BEGIN_STATE(CommentEndState) { 1146 if (cc == '>') 1147 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1148 else if (cc == '!') { 1149 parseError(); 1150 HTML_ADVANCE_TO(CommentEndBangState); 1151 } else if (cc == '-') { 1152 parseError(); 1153 m_token->appendToComment('-'); 1154 HTML_ADVANCE_TO(CommentEndState); 1155 } else if (cc == kEndOfFileMarker) { 1156 parseError(); 1157 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1158 } else { 1159 parseError(); 1160 m_token->appendToComment('-'); 1161 m_token->appendToComment('-'); 1162 m_token->appendToComment(cc); 1163 HTML_ADVANCE_TO(CommentState); 1164 } 1165 } 1166 END_STATE() 1167 1168 HTML_BEGIN_STATE(CommentEndBangState) { 1169 if (cc == '-') { 1170 m_token->appendToComment('-'); 1171 m_token->appendToComment('-'); 1172 m_token->appendToComment('!'); 1173 HTML_ADVANCE_TO(CommentEndDashState); 1174 } else if (cc == '>') 1175 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1176 else if (cc == kEndOfFileMarker) { 1177 parseError(); 1178 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1179 } else { 1180 m_token->appendToComment('-'); 1181 m_token->appendToComment('-'); 1182 m_token->appendToComment('!'); 1183 m_token->appendToComment(cc); 1184 HTML_ADVANCE_TO(CommentState); 1185 } 1186 } 1187 END_STATE() 1188 1189 HTML_BEGIN_STATE(DOCTYPEState) { 1190 if (isTokenizerWhitespace(cc)) 1191 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1192 else if (cc == kEndOfFileMarker) { 1193 parseError(); 1194 m_token->beginDOCTYPE(); 1195 m_token->setForceQuirks(); 1196 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1197 } else { 1198 parseError(); 1199 HTML_RECONSUME_IN(BeforeDOCTYPENameState); 1200 } 1201 } 1202 END_STATE() 1203 1204 HTML_BEGIN_STATE(BeforeDOCTYPENameState) { 1205 if (isTokenizerWhitespace(cc)) 1206 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1207 else if (isASCIIUpper(cc)) { 1208 m_token->beginDOCTYPE(toLowerCase(cc)); 1209 HTML_ADVANCE_TO(DOCTYPENameState); 1210 } else if (cc == '>') { 1211 parseError(); 1212 m_token->beginDOCTYPE(); 1213 m_token->setForceQuirks(); 1214 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1215 } else if (cc == kEndOfFileMarker) { 1216 parseError(); 1217 m_token->beginDOCTYPE(); 1218 m_token->setForceQuirks(); 1219 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1220 } else { 1221 m_token->beginDOCTYPE(cc); 1222 HTML_ADVANCE_TO(DOCTYPENameState); 1223 } 1224 } 1225 END_STATE() 1226 1227 HTML_BEGIN_STATE(DOCTYPENameState) { 1228 if (isTokenizerWhitespace(cc)) 1229 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1230 else if (cc == '>') 1231 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1232 else if (isASCIIUpper(cc)) { 1233 m_token->appendToName(toLowerCase(cc)); 1234 HTML_ADVANCE_TO(DOCTYPENameState); 1235 } else if (cc == kEndOfFileMarker) { 1236 parseError(); 1237 m_token->setForceQuirks(); 1238 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1239 } else { 1240 m_token->appendToName(cc); 1241 HTML_ADVANCE_TO(DOCTYPENameState); 1242 } 1243 } 1244 END_STATE() 1245 1246 HTML_BEGIN_STATE(AfterDOCTYPENameState) { 1247 if (isTokenizerWhitespace(cc)) 1248 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1249 if (cc == '>') 1250 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1251 else if (cc == kEndOfFileMarker) { 1252 parseError(); 1253 m_token->setForceQuirks(); 1254 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1255 } else { 1256 DEPRECATED_DEFINE_STATIC_LOCAL(String, publicString, (ASCIILiteral("public"))); 1257 DEPRECATED_DEFINE_STATIC_LOCAL(String, systemString, (ASCIILiteral("system"))); 1258 if (cc == 'P' || cc == 'p') { 1259 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); 1260 if (result == SegmentedString::DidMatch) { 1261 advanceStringAndASSERTIgnoringCase(source, "public"); 1262 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1263 } else if (result == SegmentedString::NotEnoughCharacters) 1264 return haveBufferedCharacterToken(); 1265 } else if (cc == 'S' || cc == 's') { 1266 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); 1267 if (result == SegmentedString::DidMatch) { 1268 advanceStringAndASSERTIgnoringCase(source, "system"); 1269 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState); 1270 } else if (result == SegmentedString::NotEnoughCharacters) 1271 return haveBufferedCharacterToken(); 1272 } 1273 parseError(); 1274 m_token->setForceQuirks(); 1275 HTML_ADVANCE_TO(BogusDOCTYPEState); 1276 } 1277 } 1278 END_STATE() 1279 1280 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1281 if (isTokenizerWhitespace(cc)) 1282 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1283 else if (cc == '"') { 1284 parseError(); 1285 m_token->setPublicIdentifierToEmptyString(); 1286 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1287 } else if (cc == '\'') { 1288 parseError(); 1289 m_token->setPublicIdentifierToEmptyString(); 1290 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1291 } else if (cc == '>') { 1292 parseError(); 1293 m_token->setForceQuirks(); 1294 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1295 } else if (cc == kEndOfFileMarker) { 1296 parseError(); 1297 m_token->setForceQuirks(); 1298 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1299 } else { 1300 parseError(); 1301 m_token->setForceQuirks(); 1302 HTML_ADVANCE_TO(BogusDOCTYPEState); 1303 } 1304 } 1305 END_STATE() 1306 1307 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1308 if (isTokenizerWhitespace(cc)) 1309 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1310 else if (cc == '"') { 1311 m_token->setPublicIdentifierToEmptyString(); 1312 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1313 } else if (cc == '\'') { 1314 m_token->setPublicIdentifierToEmptyString(); 1315 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1316 } else if (cc == '>') { 1317 parseError(); 1318 m_token->setForceQuirks(); 1319 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1320 } else if (cc == kEndOfFileMarker) { 1321 parseError(); 1322 m_token->setForceQuirks(); 1323 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1324 } else { 1325 parseError(); 1326 m_token->setForceQuirks(); 1327 HTML_ADVANCE_TO(BogusDOCTYPEState); 1328 } 1329 } 1330 END_STATE() 1331 1332 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1333 if (cc == '"') 1334 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1335 else if (cc == '>') { 1336 parseError(); 1337 m_token->setForceQuirks(); 1338 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1339 } else if (cc == kEndOfFileMarker) { 1340 parseError(); 1341 m_token->setForceQuirks(); 1342 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1343 } else { 1344 m_token->appendToPublicIdentifier(cc); 1345 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1346 } 1347 } 1348 END_STATE() 1349 1350 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1351 if (cc == '\'') 1352 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1353 else if (cc == '>') { 1354 parseError(); 1355 m_token->setForceQuirks(); 1356 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1357 } else if (cc == kEndOfFileMarker) { 1358 parseError(); 1359 m_token->setForceQuirks(); 1360 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1361 } else { 1362 m_token->appendToPublicIdentifier(cc); 1363 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1364 } 1365 } 1366 END_STATE() 1367 1368 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1369 if (isTokenizerWhitespace(cc)) 1370 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1371 else if (cc == '>') 1372 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1373 else if (cc == '"') { 1374 parseError(); 1375 m_token->setSystemIdentifierToEmptyString(); 1376 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1377 } else if (cc == '\'') { 1378 parseError(); 1379 m_token->setSystemIdentifierToEmptyString(); 1380 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1381 } else if (cc == kEndOfFileMarker) { 1382 parseError(); 1383 m_token->setForceQuirks(); 1384 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1385 } else { 1386 parseError(); 1387 m_token->setForceQuirks(); 1388 HTML_ADVANCE_TO(BogusDOCTYPEState); 1389 } 1390 } 1391 END_STATE() 1392 1393 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1394 if (isTokenizerWhitespace(cc)) 1395 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1396 else if (cc == '>') 1397 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1398 else if (cc == '"') { 1399 m_token->setSystemIdentifierToEmptyString(); 1400 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1401 } else if (cc == '\'') { 1402 m_token->setSystemIdentifierToEmptyString(); 1403 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1404 } else if (cc == kEndOfFileMarker) { 1405 parseError(); 1406 m_token->setForceQuirks(); 1407 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1408 } else { 1409 parseError(); 1410 m_token->setForceQuirks(); 1411 HTML_ADVANCE_TO(BogusDOCTYPEState); 1412 } 1413 } 1414 END_STATE() 1415 1416 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1417 if (isTokenizerWhitespace(cc)) 1418 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1419 else if (cc == '"') { 1420 parseError(); 1421 m_token->setSystemIdentifierToEmptyString(); 1422 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1423 } else if (cc == '\'') { 1424 parseError(); 1425 m_token->setSystemIdentifierToEmptyString(); 1426 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1427 } else if (cc == '>') { 1428 parseError(); 1429 m_token->setForceQuirks(); 1430 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1431 } else if (cc == kEndOfFileMarker) { 1432 parseError(); 1433 m_token->setForceQuirks(); 1434 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1435 } else { 1436 parseError(); 1437 m_token->setForceQuirks(); 1438 HTML_ADVANCE_TO(BogusDOCTYPEState); 1439 } 1440 } 1441 END_STATE() 1442 1443 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1444 if (isTokenizerWhitespace(cc)) 1445 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1446 if (cc == '"') { 1447 m_token->setSystemIdentifierToEmptyString(); 1448 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1449 } else if (cc == '\'') { 1450 m_token->setSystemIdentifierToEmptyString(); 1451 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1452 } else if (cc == '>') { 1453 parseError(); 1454 m_token->setForceQuirks(); 1455 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1456 } else if (cc == kEndOfFileMarker) { 1457 parseError(); 1458 m_token->setForceQuirks(); 1459 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1460 } else { 1461 parseError(); 1462 m_token->setForceQuirks(); 1463 HTML_ADVANCE_TO(BogusDOCTYPEState); 1464 } 1465 } 1466 END_STATE() 1467 1468 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1469 if (cc == '"') 1470 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1471 else if (cc == '>') { 1472 parseError(); 1473 m_token->setForceQuirks(); 1474 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1475 } else if (cc == kEndOfFileMarker) { 1476 parseError(); 1477 m_token->setForceQuirks(); 1478 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1479 } else { 1480 m_token->appendToSystemIdentifier(cc); 1481 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1482 } 1483 } 1484 END_STATE() 1485 1486 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1487 if (cc == '\'') 1488 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1489 else if (cc == '>') { 1490 parseError(); 1491 m_token->setForceQuirks(); 1492 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1493 } else if (cc == kEndOfFileMarker) { 1494 parseError(); 1495 m_token->setForceQuirks(); 1496 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1497 } else { 1498 m_token->appendToSystemIdentifier(cc); 1499 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1500 } 1501 } 1502 END_STATE() 1503 1504 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1505 if (isTokenizerWhitespace(cc)) 1506 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1507 else if (cc == '>') 1508 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1509 else if (cc == kEndOfFileMarker) { 1510 parseError(); 1511 m_token->setForceQuirks(); 1512 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1513 } else { 1514 parseError(); 1515 HTML_ADVANCE_TO(BogusDOCTYPEState); 1516 } 1517 } 1518 END_STATE() 1519 1520 HTML_BEGIN_STATE(BogusDOCTYPEState) { 1521 if (cc == '>') 1522 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1523 else if (cc == kEndOfFileMarker) 1524 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1525 HTML_ADVANCE_TO(BogusDOCTYPEState); 1526 } 1527 END_STATE() 1528 1529 HTML_BEGIN_STATE(CDATASectionState) { 1530 if (cc == ']') 1531 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState); 1532 else if (cc == kEndOfFileMarker) 1533 HTML_RECONSUME_IN(DataState); 1534 else { 1535 bufferCharacter(cc); 1536 HTML_ADVANCE_TO(CDATASectionState); 1537 } 1538 } 1539 END_STATE() 1540 1541 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) { 1542 if (cc == ']') 1543 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1544 else { 1545 bufferCharacter(']'); 1546 HTML_RECONSUME_IN(CDATASectionState); 1547 } 1548 } 1549 1550 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1551 if (cc == '>') 1552 HTML_ADVANCE_TO(DataState); 1553 else { 1554 bufferCharacter(']'); 1555 bufferCharacter(']'); 1556 HTML_RECONSUME_IN(CDATASectionState); 1557 } 1558 } 1559 END_STATE() 1560 1561 } 1562 1563 ASSERT_NOT_REACHED(); 1564 return false; 1565} 1566 1567String HTMLTokenizer::bufferedCharacters() const 1568{ 1569 // FIXME: Add an assert about m_state. 1570 StringBuilder characters; 1571 characters.reserveCapacity(numberOfBufferedCharacters()); 1572 characters.append('<'); 1573 characters.append('/'); 1574 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size()); 1575 return characters.toString(); 1576} 1577 1578void HTMLTokenizer::updateStateFor(const AtomicString& tagName) 1579{ 1580 if (tagName == textareaTag || tagName == titleTag) 1581 setState(HTMLTokenizer::RCDATAState); 1582 else if (tagName == plaintextTag) 1583 setState(HTMLTokenizer::PLAINTEXTState); 1584 else if (tagName == scriptTag) 1585 setState(HTMLTokenizer::ScriptDataState); 1586 else if (tagName == styleTag 1587 || tagName == iframeTag 1588 || tagName == xmpTag 1589 || (tagName == noembedTag && m_options.pluginsEnabled) 1590 || tagName == noframesTag 1591 || (tagName == noscriptTag && m_options.scriptEnabled)) 1592 setState(HTMLTokenizer::RAWTEXTState); 1593} 1594 1595inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1596{ 1597 return vectorEqualsString(m_temporaryBuffer, expectedString); 1598} 1599 1600inline void HTMLTokenizer::addToPossibleEndTag(LChar cc) 1601{ 1602 ASSERT(isEndTagBufferingState(m_state)); 1603 m_bufferedEndTagName.append(cc); 1604} 1605 1606inline bool HTMLTokenizer::isAppropriateEndTag() 1607{ 1608 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size()) 1609 return false; 1610 1611 size_t numCharacters = m_bufferedEndTagName.size(); 1612 1613 for (size_t i = 0; i < numCharacters; i++) { 1614 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i]) 1615 return false; 1616 } 1617 1618 return true; 1619} 1620 1621inline void HTMLTokenizer::parseError() 1622{ 1623 notImplemented(); 1624} 1625 1626} 1627