1/* 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#include "config.h" 27#include "HTMLMetaCharsetParser.h" 28 29#include "HTMLNames.h" 30#include "HTMLParserIdioms.h" 31#include "HTMLParserOptions.h" 32#include "HTMLTokenizer.h" 33#include "TextCodec.h" 34#include "TextEncodingRegistry.h" 35#include <wtf/text/WTFString.h> 36 37using namespace WTF; 38 39namespace WebCore { 40 41using namespace HTMLNames; 42 43HTMLMetaCharsetParser::HTMLMetaCharsetParser() 44 : m_tokenizer(HTMLTokenizer::create(HTMLParserOptions(0))) 45 , m_assumedCodec(newTextCodec(Latin1Encoding())) 46 , m_inHeadSection(true) 47 , m_doneChecking(false) 48{ 49} 50 51HTMLMetaCharsetParser::~HTMLMetaCharsetParser() 52{ 53} 54 55static const char charsetString[] = "charset"; 56static const size_t charsetLength = sizeof("charset") - 1; 57 58String HTMLMetaCharsetParser::extractCharset(const String& value) 59{ 60 size_t pos = 0; 61 unsigned length = value.length(); 62 63 while (pos < length) { 64 pos = value.find(charsetString, pos, false); 65 if (pos == notFound) 66 break; 67 68 pos += charsetLength; 69 70 // Skip whitespace. 71 while (pos < length && value[pos] <= ' ') 72 ++pos; 73 74 if (value[pos] != '=') 75 continue; 76 77 ++pos; 78 79 while (pos < length && value[pos] <= ' ') 80 ++pos; 81 82 char quoteMark = 0; 83 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { 84 quoteMark = static_cast<char>(value[pos++]); 85 ASSERT(!(quoteMark & 0x80)); 86 } 87 88 if (pos == length) 89 break; 90 91 unsigned end = pos; 92 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';'))) 93 ++end; 94 95 if (quoteMark && (end == length)) 96 break; // Close quote not found. 97 98 return value.substring(pos, end - pos); 99 } 100 101 return ""; 102} 103 104bool HTMLMetaCharsetParser::processMeta() 105{ 106 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); 107 AttributeList attributes; 108 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) { 109 String attributeName = StringImpl::create8BitIfPossible(iter->name); 110 String attributeValue = StringImpl::create8BitIfPossible(iter->value); 111 attributes.append(std::make_pair(attributeName, attributeValue)); 112 } 113 114 m_encoding = encodingFromMetaAttributes(attributes); 115 return m_encoding.isValid(); 116} 117 118TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeList& attributes) 119{ 120 bool gotPragma = false; 121 Mode mode = None; 122 String charset; 123 124 for (AttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) { 125 const AtomicString& attributeName = iter->first; 126 const String& attributeValue = iter->second; 127 128 if (attributeName == http_equivAttr) { 129 if (equalIgnoringCase(attributeValue, "content-type")) 130 gotPragma = true; 131 } else if (charset.isEmpty()) { 132 if (attributeName == charsetAttr) { 133 charset = attributeValue; 134 mode = Charset; 135 } else if (attributeName == contentAttr) { 136 charset = extractCharset(attributeValue); 137 if (charset.length()) 138 mode = Pragma; 139 } 140 } 141 } 142 143 if (mode == Charset || (mode == Pragma && gotPragma)) 144 return TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); 145 146 return TextEncoding(); 147} 148 149static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. 150 151bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) 152{ 153 if (m_doneChecking) 154 return true; 155 156 ASSERT(!m_encoding.isValid()); 157 158 // We still don't have an encoding, and are in the head. 159 // The following tags are allowed in <head>: 160 // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE 161 162 // We stop scanning when a tag that is not permitted in <head> 163 // is seen, rather when </head> is seen, because that more closely 164 // matches behavior in other browsers; more details in 165 // <http://bugs.webkit.org/show_bug.cgi?id=3590>. 166 167 // Additionally, we ignore things that looks like tags in <title>, <script> 168 // and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>, 169 // <http://bugs.webkit.org/show_bug.cgi?id=12165> and 170 // <http://bugs.webkit.org/show_bug.cgi?id=12389>. 171 172 // Since many sites have charset declarations after <body> or other tags 173 // that are disallowed in <head>, we don't bail out until we've checked at 174 // least bytesToCheckUnconditionally bytes of input. 175 176 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); 177 178 while (m_tokenizer->nextToken(m_input, m_token)) { 179 bool end = m_token.type() == HTMLToken::EndTag; 180 if (end || m_token.type() == HTMLToken::StartTag) { 181 AtomicString tagName(m_token.name()); 182 if (!end) { 183 m_tokenizer->updateStateFor(tagName); 184 if (tagName == metaTag && processMeta()) { 185 m_doneChecking = true; 186 return true; 187 } 188 } 189 190 if (tagName != scriptTag && tagName != noscriptTag 191 && tagName != styleTag && tagName != linkTag 192 && tagName != metaTag && tagName != objectTag 193 && tagName != titleTag && tagName != baseTag 194 && (end || tagName != htmlTag) && (end || tagName != headTag)) { 195 m_inHeadSection = false; 196 } 197 } 198 199 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { 200 m_doneChecking = true; 201 return true; 202 } 203 204 m_token.clear(); 205 } 206 207 return false; 208} 209 210} 211