1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32#include "PageSerializer.h" 33 34#include "CSSImageValue.h" 35#include "CSSImportRule.h" 36#include "CSSStyleRule.h" 37#include "CachedImage.h" 38#include "Document.h" 39#include "Element.h" 40#include "Frame.h" 41#include "HTMLFrameOwnerElement.h" 42#include "HTMLHeadElement.h" 43#include "HTMLImageElement.h" 44#include "HTMLLinkElement.h" 45#include "HTMLMetaCharsetParser.h" 46#include "HTMLNames.h" 47#include "HTMLStyleElement.h" 48#include "HTTPParsers.h" 49#include "Image.h" 50#include "MIMETypeRegistry.h" 51#include "MarkupAccumulator.h" 52#include "Page.h" 53#include "StyleCachedImage.h" 54#include "StyleImage.h" 55#include "StylePropertySet.h" 56#include "StyleRule.h" 57#include "StyleSheetContents.h" 58#include "Text.h" 59#include "TextEncoding.h" 60#include <wtf/text/CString.h> 61#include <wtf/text/StringBuilder.h> 62#include <wtf/text/WTFString.h> 63 64namespace WebCore { 65 66static bool isCharsetSpecifyingNode(Node* node) 67{ 68 if (!node->isHTMLElement()) 69 return false; 70 71 HTMLElement* element = toHTMLElement(node); 72 if (!element->hasTagName(HTMLNames::metaTag)) 73 return false; 74 HTMLMetaCharsetParser::AttributeList attributes; 75 if (element->hasAttributes()) { 76 for (unsigned i = 0; i < element->attributeCount(); ++i) { 77 const Attribute* attribute = element->attributeItem(i); 78 // FIXME: We should deal appropriately with the attribute if they have a namespace. 79 attributes.append(std::make_pair(attribute->name().toString(), attribute->value().string())); 80 } 81 } 82 TextEncoding textEncoding = HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes); 83 return textEncoding.isValid(); 84} 85 86static bool shouldIgnoreElement(Element* element) 87{ 88 return element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element); 89} 90 91static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner) 92{ 93 // FIXME: We should support all frame owners including applets. 94 return frameOwner.hasTagName(HTMLNames::objectTag) ? HTMLNames::dataAttr : HTMLNames::srcAttr; 95} 96 97class SerializerMarkupAccumulator : public WebCore::MarkupAccumulator { 98public: 99 SerializerMarkupAccumulator(PageSerializer*, Document*, Vector<Node*>*); 100 virtual ~SerializerMarkupAccumulator(); 101 102protected: 103 virtual void appendText(StringBuilder& out, Text*); 104 virtual void appendElement(StringBuilder& out, Element*, Namespaces*); 105 virtual void appendCustomAttributes(StringBuilder& out, Element*, Namespaces*); 106 virtual void appendEndTag(Node*); 107 108private: 109 PageSerializer* m_serializer; 110 Document* m_document; 111}; 112 113SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, Document* document, Vector<Node*>* nodes) 114 : MarkupAccumulator(nodes, ResolveAllURLs) 115 , m_serializer(serializer) 116 , m_document(document) 117{ 118 // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified. 119 if (m_document->isXHTMLDocument() || m_document->xmlStandalone() || m_document->isSVGDocument()) 120 appendString("<?xml version=\"" + m_document->xmlVersion() + "\" encoding=\"" + m_document->charset() + "\"?>"); 121} 122 123SerializerMarkupAccumulator::~SerializerMarkupAccumulator() 124{ 125} 126 127void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text* text) 128{ 129 Element* parent = text->parentElement(); 130 if (parent && !shouldIgnoreElement(parent)) 131 MarkupAccumulator::appendText(out, text); 132} 133 134void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element* element, Namespaces* namespaces) 135{ 136 if (!shouldIgnoreElement(element)) 137 MarkupAccumulator::appendElement(out, element, namespaces); 138 139 if (element->hasTagName(HTMLNames::headTag)) { 140 out.append("<meta charset=\""); 141 out.append(m_document->charset()); 142 out.append("\">"); 143 } 144 145 // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents. 146} 147 148void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, Element* element, Namespaces* namespaces) 149{ 150 if (!element->isFrameOwnerElement()) 151 return; 152 153 HTMLFrameOwnerElement* frameOwner = toFrameOwnerElement(element); 154 Frame* frame = frameOwner->contentFrame(); 155 if (!frame) 156 return; 157 158 KURL url = frame->document()->url(); 159 if (url.isValid() && !url.isBlankURL()) 160 return; 161 162 // We need to give a fake location to blank frames so they can be referenced by the serialized frame. 163 url = m_serializer->urlForBlankFrame(frame); 164 appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(*frameOwner), url.string()), namespaces); 165} 166 167void SerializerMarkupAccumulator::appendEndTag(Node* node) 168{ 169 if (node->isElementNode() && !shouldIgnoreElement(toElement(node))) 170 MarkupAccumulator::appendEndTag(node); 171} 172 173PageSerializer::Resource::Resource() 174{ 175} 176 177PageSerializer::Resource::Resource(const KURL& url, const String& mimeType, PassRefPtr<SharedBuffer> data) 178 : url(url) 179 , mimeType(mimeType) 180 , data(data) 181{ 182} 183 184PageSerializer::PageSerializer(Vector<PageSerializer::Resource>* resources) 185 : m_resources(resources) 186 , m_blankFrameCounter(0) 187{ 188} 189 190void PageSerializer::serialize(Page* page) 191{ 192 serializeFrame(page->mainFrame()); 193} 194 195void PageSerializer::serializeFrame(Frame* frame) 196{ 197 Document* document = frame->document(); 198 KURL url = document->url(); 199 if (!url.isValid() || url.isBlankURL()) { 200 // For blank frames we generate a fake URL so they can be referenced by their containing frame. 201 url = urlForBlankFrame(frame); 202 } 203 204 if (m_resourceURLs.contains(url)) { 205 // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now 206 // different content. So we should serialize both and somehow rename the frame src in the containing 207 // frame. Arg! 208 return; 209 } 210 211 Vector<Node*> nodes; 212 SerializerMarkupAccumulator accumulator(this, document, &nodes); 213 TextEncoding textEncoding(document->charset()); 214 CString data; 215 if (!textEncoding.isValid()) { 216 // FIXME: iframes used as images trigger this. We should deal with them correctly. 217 return; 218 } 219 String text = accumulator.serializeNodes(document->documentElement(), 0, IncludeNode); 220 CString frameHTML = textEncoding.encode(text.characters(), text.length(), EntitiesForUnencodables); 221 m_resources->append(Resource(url, document->suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length()))); 222 m_resourceURLs.add(url); 223 224 for (Vector<Node*>::iterator iter = nodes.begin(); iter != nodes.end(); ++iter) { 225 Node* node = *iter; 226 if (!node->isElementNode()) 227 continue; 228 229 Element* element = toElement(node); 230 // We have to process in-line style as it might contain some resources (typically background images). 231 if (element->isStyledElement()) 232 retrieveResourcesForProperties(static_cast<StyledElement*>(element)->inlineStyle(), document); 233 234 if (element->hasTagName(HTMLNames::imgTag)) { 235 HTMLImageElement* imageElement = static_cast<HTMLImageElement*>(element); 236 KURL url = document->completeURL(imageElement->getAttribute(HTMLNames::srcAttr)); 237 CachedImage* cachedImage = imageElement->cachedImage(); 238 addImageToResources(cachedImage, imageElement->renderer(), url); 239 } else if (element->hasTagName(HTMLNames::linkTag)) { 240 HTMLLinkElement* linkElement = static_cast<HTMLLinkElement*>(element); 241 if (CSSStyleSheet* sheet = linkElement->sheet()) { 242 KURL url = document->completeURL(linkElement->getAttribute(HTMLNames::hrefAttr)); 243 serializeCSSStyleSheet(sheet, url); 244 ASSERT(m_resourceURLs.contains(url)); 245 } 246 } else if (element->hasTagName(HTMLNames::styleTag)) { 247 HTMLStyleElement* styleElement = static_cast<HTMLStyleElement*>(element); 248 if (CSSStyleSheet* sheet = styleElement->sheet()) 249 serializeCSSStyleSheet(sheet, KURL()); 250 } 251 } 252 253 for (Frame* childFrame = frame->tree()->firstChild(); childFrame; childFrame = childFrame->tree()->nextSibling()) 254 serializeFrame(childFrame); 255} 256 257void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const KURL& url) 258{ 259 StringBuilder cssText; 260 for (unsigned i = 0; i < styleSheet->length(); ++i) { 261 CSSRule* rule = styleSheet->item(i); 262 String itemText = rule->cssText(); 263 if (!itemText.isEmpty()) { 264 cssText.append(itemText); 265 if (i < styleSheet->length() - 1) 266 cssText.append("\n\n"); 267 } 268 Document* document = styleSheet->ownerDocument(); 269 // Some rules have resources associated with them that we need to retrieve. 270 if (rule->type() == CSSRule::IMPORT_RULE) { 271 CSSImportRule* importRule = static_cast<CSSImportRule*>(rule); 272 KURL importURL = document->completeURL(importRule->href()); 273 if (m_resourceURLs.contains(importURL)) 274 continue; 275 serializeCSSStyleSheet(importRule->styleSheet(), importURL); 276 } else if (rule->type() == CSSRule::FONT_FACE_RULE) { 277 // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can 278 // be retrieved from the CSSFontFaceRule object. 279 } else if (rule->type() == CSSRule::STYLE_RULE) 280 retrieveResourcesForRule(static_cast<CSSStyleRule*>(rule)->styleRule(), document); 281 } 282 283 if (url.isValid() && !m_resourceURLs.contains(url)) { 284 // FIXME: We should check whether a charset has been specified and if none was found add one. 285 TextEncoding textEncoding(styleSheet->contents()->charset()); 286 ASSERT(textEncoding.isValid()); 287 String textString = cssText.toString(); 288 CString text = textEncoding.encode(textString.characters(), textString.length(), EntitiesForUnencodables); 289 m_resources->append(Resource(url, String("text/css"), SharedBuffer::create(text.data(), text.length()))); 290 m_resourceURLs.add(url); 291 } 292} 293 294void PageSerializer::addImageToResources(CachedImage* image, RenderObject* imageRenderer, const KURL& url) 295{ 296 if (!url.isValid() || m_resourceURLs.contains(url)) 297 return; 298 299 if (!image || image->image() == Image::nullImage()) 300 return; 301 302 RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0; 303 if (!data) 304 data = image->image()->data(); 305 306 if (!data) { 307 LOG_ERROR("No data for image %s", url.string().utf8().data()); 308 return; 309 } 310 311 String mimeType = image->response().mimeType(); 312 m_resources->append(Resource(url, mimeType, data)); 313 m_resourceURLs.add(url); 314} 315 316void PageSerializer::retrieveResourcesForRule(StyleRule* rule, Document* document) 317{ 318 retrieveResourcesForProperties(rule->properties(), document); 319} 320 321void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document* document) 322{ 323 if (!styleDeclaration) 324 return; 325 326 // The background-image and list-style-image (for ul or ol) are the CSS properties 327 // that make use of images. We iterate to make sure we include any other 328 // image properties there might be. 329 unsigned propertyCount = styleDeclaration->propertyCount(); 330 for (unsigned i = 0; i < propertyCount; ++i) { 331 RefPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value(); 332 if (!cssValue->isImageValue()) 333 continue; 334 335 CSSImageValue* imageValue = static_cast<CSSImageValue*>(cssValue.get()); 336 StyleImage* styleImage = imageValue->cachedOrPendingImage(); 337 // Non cached-images are just place-holders and do not contain data. 338 if (!styleImage || !styleImage->isCachedImage()) 339 continue; 340 341 CachedImage* image = static_cast<StyleCachedImage*>(styleImage)->cachedImage(); 342 343 KURL url = document->completeURL(image->url()); 344 addImageToResources(image, 0, url); 345 } 346} 347 348KURL PageSerializer::urlForBlankFrame(Frame* frame) 349{ 350 HashMap<Frame*, KURL>::iterator iter = m_blankFrameURLs.find(frame); 351 if (iter != m_blankFrameURLs.end()) 352 return iter->value; 353 String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++); 354 KURL fakeURL(ParsedURLString, url); 355 m_blankFrameURLs.add(frame, fakeURL); 356 357 return fakeURL; 358} 359 360} 361