1/* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#ifndef HTMLTokenizer_h 28#define HTMLTokenizer_h 29 30#include "HTMLParserOptions.h" 31#include "HTMLToken.h" 32#include "InputStreamPreprocessor.h" 33#include "SegmentedString.h" 34 35namespace WebCore { 36 37class HTMLTokenizer { 38 WTF_MAKE_NONCOPYABLE(HTMLTokenizer); 39 WTF_MAKE_FAST_ALLOCATED; 40public: 41 explicit HTMLTokenizer(const HTMLParserOptions&); 42 ~HTMLTokenizer(); 43 44 void reset(); 45 46 enum State { 47 DataState, 48 CharacterReferenceInDataState, 49 RCDATAState, 50 CharacterReferenceInRCDATAState, 51 RAWTEXTState, 52 ScriptDataState, 53 PLAINTEXTState, 54 TagOpenState, 55 EndTagOpenState, 56 TagNameState, 57 RCDATALessThanSignState, 58 RCDATAEndTagOpenState, 59 RCDATAEndTagNameState, 60 RAWTEXTLessThanSignState, 61 RAWTEXTEndTagOpenState, 62 RAWTEXTEndTagNameState, 63 ScriptDataLessThanSignState, 64 ScriptDataEndTagOpenState, 65 ScriptDataEndTagNameState, 66 ScriptDataEscapeStartState, 67 ScriptDataEscapeStartDashState, 68 ScriptDataEscapedState, 69 ScriptDataEscapedDashState, 70 ScriptDataEscapedDashDashState, 71 ScriptDataEscapedLessThanSignState, 72 ScriptDataEscapedEndTagOpenState, 73 ScriptDataEscapedEndTagNameState, 74 ScriptDataDoubleEscapeStartState, 75 ScriptDataDoubleEscapedState, 76 ScriptDataDoubleEscapedDashState, 77 ScriptDataDoubleEscapedDashDashState, 78 ScriptDataDoubleEscapedLessThanSignState, 79 ScriptDataDoubleEscapeEndState, 80 BeforeAttributeNameState, 81 AttributeNameState, 82 AfterAttributeNameState, 83 BeforeAttributeValueState, 84 AttributeValueDoubleQuotedState, 85 AttributeValueSingleQuotedState, 86 AttributeValueUnquotedState, 87 CharacterReferenceInAttributeValueState, 88 AfterAttributeValueQuotedState, 89 SelfClosingStartTagState, 90 BogusCommentState, 91 // The ContinueBogusCommentState is not in the HTML5 spec, but we use 92 // it internally to keep track of whether we've started the bogus 93 // comment token yet. 94 ContinueBogusCommentState, 95 MarkupDeclarationOpenState, 96 CommentStartState, 97 CommentStartDashState, 98 CommentState, 99 CommentEndDashState, 100 CommentEndState, 101 CommentEndBangState, 102 DOCTYPEState, 103 BeforeDOCTYPENameState, 104 DOCTYPENameState, 105 AfterDOCTYPENameState, 106 AfterDOCTYPEPublicKeywordState, 107 BeforeDOCTYPEPublicIdentifierState, 108 DOCTYPEPublicIdentifierDoubleQuotedState, 109 DOCTYPEPublicIdentifierSingleQuotedState, 110 AfterDOCTYPEPublicIdentifierState, 111 BetweenDOCTYPEPublicAndSystemIdentifiersState, 112 AfterDOCTYPESystemKeywordState, 113 BeforeDOCTYPESystemIdentifierState, 114 DOCTYPESystemIdentifierDoubleQuotedState, 115 DOCTYPESystemIdentifierSingleQuotedState, 116 AfterDOCTYPESystemIdentifierState, 117 BogusDOCTYPEState, 118 CDATASectionState, 119 // These CDATA states are not in the HTML5 spec, but we use them internally. 120 CDATASectionRightSquareBracketState, 121 CDATASectionDoubleRightSquareBracketState, 122 }; 123 124 // This function returns true if it emits a token. Otherwise, callers 125 // must provide the same (in progress) token on the next call (unless 126 // they call reset() first). 127 bool nextToken(SegmentedString&, HTMLToken&); 128 129 // Returns a copy of any characters buffered internally by the tokenizer. 130 // The tokenizer buffers characters when searching for the </script> token 131 // that terminates a script element. 132 String bufferedCharacters() const; 133 134 size_t numberOfBufferedCharacters() const 135 { 136 // Notice that we add 2 to the length of the m_temporaryBuffer to 137 // account for the "</" characters, which are effecitvely buffered in 138 // the tokenizer's state machine. 139 return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0; 140 } 141 142 // Updates the tokenizer's state according to the given tag name. This is 143 // an approximation of how the tree builder would update the tokenizer's 144 // state. This method is useful for approximating HTML tokenization. To 145 // get exactly the correct tokenization, you need the real tree builder. 146 // 147 // The main failures in the approximation are as follows: 148 // 149 // * The first set of character tokens emitted for a <pre> element might 150 // contain an extra leading newline. 151 // * The replacement of U+0000 with U+FFFD will not be sensitive to the 152 // tree builder's insertion mode. 153 // * CDATA sections in foreign content will be tokenized as bogus comments 154 // instead of as character tokens. 155 // 156 void updateStateFor(const AtomicString& tagName); 157 158 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } 159 void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } 160 161 bool shouldAllowCDATA() const { return m_shouldAllowCDATA; } 162 void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; } 163 164 State state() const { return m_state; } 165 void setState(State state) { m_state = state; } 166 167 inline bool shouldSkipNullCharacters() const 168 { 169 return !m_forceNullCharacterReplacement 170 && (m_state == HTMLTokenizer::DataState 171 || m_state == HTMLTokenizer::RCDATAState 172 || m_state == HTMLTokenizer::RAWTEXTState); 173 } 174 175private: 176 inline bool processEntity(SegmentedString&); 177 178 inline void parseError(); 179 180 inline void bufferCharacter(UChar character) 181 { 182 ASSERT(character != kEndOfFileMarker); 183 m_token->ensureIsCharacterToken(); 184 m_token->appendToCharacter(character); 185 } 186 187 inline bool emitAndResumeIn(SegmentedString& source, State state) 188 { 189 saveEndTagNameIfNeeded(); 190 m_state = state; 191 source.advanceAndUpdateLineNumber(); 192 return true; 193 } 194 195 inline bool emitAndReconsumeIn(SegmentedString&, State state) 196 { 197 saveEndTagNameIfNeeded(); 198 m_state = state; 199 return true; 200 } 201 202 inline bool emitEndOfFile(SegmentedString& source) 203 { 204 if (haveBufferedCharacterToken()) 205 return true; 206 m_state = HTMLTokenizer::DataState; 207 source.advanceAndUpdateLineNumber(); 208 m_token->clear(); 209 m_token->makeEndOfFile(); 210 return true; 211 } 212 213 inline bool flushEmitAndResumeIn(SegmentedString&, State); 214 215 // Return whether we need to emit a character token before dealing with 216 // the buffered end tag. 217 inline bool flushBufferedEndTag(SegmentedString&); 218 inline bool temporaryBufferIs(const String&); 219 220 // Sometimes we speculatively consume input characters and we don't 221 // know whether they represent end tags or RCDATA, etc. These 222 // functions help manage these state. 223 inline void addToPossibleEndTag(LChar cc); 224 225 inline void saveEndTagNameIfNeeded() 226 { 227 ASSERT(m_token->type() != HTMLToken::Uninitialized); 228 if (m_token->type() == HTMLToken::StartTag) 229 m_appropriateEndTagName = m_token->name(); 230 } 231 inline bool isAppropriateEndTag(); 232 233 234 inline bool haveBufferedCharacterToken() 235 { 236 return m_token->type() == HTMLToken::Character; 237 } 238 239 State m_state; 240 bool m_forceNullCharacterReplacement; 241 bool m_shouldAllowCDATA; 242 243 // m_token is owned by the caller. If nextToken is not on the stack, 244 // this member might be pointing to unallocated memory. 245 HTMLToken* m_token; 246 247 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character 248 UChar m_additionalAllowedCharacter; 249 250 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 251 InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor; 252 253 Vector<UChar, 32> m_appropriateEndTagName; 254 255 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer 256 Vector<LChar, 32> m_temporaryBuffer; 257 258 // We occationally want to emit both a character token and an end tag 259 // token (e.g., when lexing script). We buffer the name of the end tag 260 // token here so we remember it next time we re-enter the tokenizer. 261 Vector<LChar, 32> m_bufferedEndTagName; 262 263 HTMLParserOptions m_options; 264}; 265 266} 267 268#endif 269