1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32#include "PageSerializer.h"
33
34#include "CSSImageValue.h"
35#include "CSSImportRule.h"
36#include "CSSStyleRule.h"
37#include "CachedImage.h"
38#include "Document.h"
39#include "Element.h"
40#include "Frame.h"
41#include "HTMLFrameOwnerElement.h"
42#include "HTMLHeadElement.h"
43#include "HTMLImageElement.h"
44#include "HTMLLinkElement.h"
45#include "HTMLMetaCharsetParser.h"
46#include "HTMLNames.h"
47#include "HTMLStyleElement.h"
48#include "HTTPParsers.h"
49#include "Image.h"
50#include "MIMETypeRegistry.h"
51#include "MarkupAccumulator.h"
52#include "Page.h"
53#include "StyleCachedImage.h"
54#include "StyleImage.h"
55#include "StylePropertySet.h"
56#include "StyleRule.h"
57#include "StyleSheetContents.h"
58#include "Text.h"
59#include "TextEncoding.h"
60#include <wtf/text/CString.h>
61#include <wtf/text/StringBuilder.h>
62#include <wtf/text/WTFString.h>
63
64namespace WebCore {
65
66static bool isCharsetSpecifyingNode(Node* node)
67{
68    if (!node->isHTMLElement())
69        return false;
70
71    HTMLElement* element = toHTMLElement(node);
72    if (!element->hasTagName(HTMLNames::metaTag))
73        return false;
74    HTMLMetaCharsetParser::AttributeList attributes;
75    if (element->hasAttributes()) {
76        for (unsigned i = 0; i < element->attributeCount(); ++i) {
77            const Attribute* attribute = element->attributeItem(i);
78            // FIXME: We should deal appropriately with the attribute if they have a namespace.
79            attributes.append(std::make_pair(attribute->name().toString(), attribute->value().string()));
80        }
81    }
82    TextEncoding textEncoding = HTMLMetaCharsetParser::encodingFromMetaAttributes(attributes);
83    return textEncoding.isValid();
84}
85
86static bool shouldIgnoreElement(Element* element)
87{
88    return element->hasTagName(HTMLNames::scriptTag) || element->hasTagName(HTMLNames::noscriptTag) || isCharsetSpecifyingNode(element);
89}
90
91static const QualifiedName& frameOwnerURLAttributeName(const HTMLFrameOwnerElement& frameOwner)
92{
93    // FIXME: We should support all frame owners including applets.
94    return frameOwner.hasTagName(HTMLNames::objectTag) ? HTMLNames::dataAttr : HTMLNames::srcAttr;
95}
96
97class SerializerMarkupAccumulator : public WebCore::MarkupAccumulator {
98public:
99    SerializerMarkupAccumulator(PageSerializer*, Document*, Vector<Node*>*);
100    virtual ~SerializerMarkupAccumulator();
101
102protected:
103    virtual void appendText(StringBuilder& out, Text*);
104    virtual void appendElement(StringBuilder& out, Element*, Namespaces*);
105    virtual void appendCustomAttributes(StringBuilder& out, Element*, Namespaces*);
106    virtual void appendEndTag(Node*);
107
108private:
109    PageSerializer* m_serializer;
110    Document* m_document;
111};
112
113SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer* serializer, Document* document, Vector<Node*>* nodes)
114    : MarkupAccumulator(nodes, ResolveAllURLs)
115    , m_serializer(serializer)
116    , m_document(document)
117{
118    // MarkupAccumulator does not serialize the <?xml ... line, so we add it explicitely to ensure the right encoding is specified.
119    if (m_document->isXHTMLDocument() || m_document->xmlStandalone() || m_document->isSVGDocument())
120        appendString("<?xml version=\"" + m_document->xmlVersion() + "\" encoding=\"" + m_document->charset() + "\"?>");
121}
122
// Nothing to release: the class only declares raw pointer members and does not delete them.
SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
{
}
126
127void SerializerMarkupAccumulator::appendText(StringBuilder& out, Text* text)
128{
129    Element* parent = text->parentElement();
130    if (parent && !shouldIgnoreElement(parent))
131        MarkupAccumulator::appendText(out, text);
132}
133
134void SerializerMarkupAccumulator::appendElement(StringBuilder& out, Element* element, Namespaces* namespaces)
135{
136    if (!shouldIgnoreElement(element))
137        MarkupAccumulator::appendElement(out, element, namespaces);
138
139    if (element->hasTagName(HTMLNames::headTag)) {
140        out.append("<meta charset=\"");
141        out.append(m_document->charset());
142        out.append("\">");
143    }
144
145    // FIXME: For object (plugins) tags and video tag we could replace them by an image of their current contents.
146}
147
148void SerializerMarkupAccumulator::appendCustomAttributes(StringBuilder& out, Element* element, Namespaces* namespaces)
149{
150    if (!element->isFrameOwnerElement())
151        return;
152
153    HTMLFrameOwnerElement* frameOwner = toFrameOwnerElement(element);
154    Frame* frame = frameOwner->contentFrame();
155    if (!frame)
156        return;
157
158    KURL url = frame->document()->url();
159    if (url.isValid() && !url.isBlankURL())
160        return;
161
162    // We need to give a fake location to blank frames so they can be referenced by the serialized frame.
163    url = m_serializer->urlForBlankFrame(frame);
164    appendAttribute(out, element, Attribute(frameOwnerURLAttributeName(*frameOwner), url.string()), namespaces);
165}
166
167void SerializerMarkupAccumulator::appendEndTag(Node* node)
168{
169    if (node->isElementNode() && !shouldIgnoreElement(toElement(node)))
170        MarkupAccumulator::appendEndTag(node);
171}
172
// Default constructor: every member is default-initialized.
PageSerializer::Resource::Resource()
{
}
176
// Builds a resource entry from its URL, MIME type and content buffer.
PageSerializer::Resource::Resource(const KURL& url, const String& mimeType, PassRefPtr<SharedBuffer> data)
    : url(url)
    , mimeType(mimeType)
    , data(data)
{
}
183
// |resources| is the caller-provided vector that serialized resources are appended to.
// The counter used to number blank-frame URLs starts at zero.
PageSerializer::PageSerializer(Vector<PageSerializer::Resource>* resources)
    : m_resources(resources)
    , m_blankFrameCounter(0)
{
}
189
190void PageSerializer::serialize(Page* page)
191{
192    serializeFrame(page->mainFrame());
193}
194
195void PageSerializer::serializeFrame(Frame* frame)
196{
197    Document* document = frame->document();
198    KURL url = document->url();
199    if (!url.isValid() || url.isBlankURL()) {
200        // For blank frames we generate a fake URL so they can be referenced by their containing frame.
201        url = urlForBlankFrame(frame);
202    }
203
204    if (m_resourceURLs.contains(url)) {
205        // FIXME: We could have 2 frame with the same URL but which were dynamically changed and have now
206        // different content. So we should serialize both and somehow rename the frame src in the containing
207        // frame. Arg!
208        return;
209    }
210
211    Vector<Node*> nodes;
212    SerializerMarkupAccumulator accumulator(this, document, &nodes);
213    TextEncoding textEncoding(document->charset());
214    CString data;
215    if (!textEncoding.isValid()) {
216        // FIXME: iframes used as images trigger this. We should deal with them correctly.
217        return;
218    }
219    String text = accumulator.serializeNodes(document->documentElement(), 0, IncludeNode);
220    CString frameHTML = textEncoding.encode(text.characters(), text.length(), EntitiesForUnencodables);
221    m_resources->append(Resource(url, document->suggestedMIMEType(), SharedBuffer::create(frameHTML.data(), frameHTML.length())));
222    m_resourceURLs.add(url);
223
224    for (Vector<Node*>::iterator iter = nodes.begin(); iter != nodes.end(); ++iter) {
225        Node* node = *iter;
226        if (!node->isElementNode())
227            continue;
228
229        Element* element = toElement(node);
230        // We have to process in-line style as it might contain some resources (typically background images).
231        if (element->isStyledElement())
232            retrieveResourcesForProperties(static_cast<StyledElement*>(element)->inlineStyle(), document);
233
234        if (element->hasTagName(HTMLNames::imgTag)) {
235            HTMLImageElement* imageElement = static_cast<HTMLImageElement*>(element);
236            KURL url = document->completeURL(imageElement->getAttribute(HTMLNames::srcAttr));
237            CachedImage* cachedImage = imageElement->cachedImage();
238            addImageToResources(cachedImage, imageElement->renderer(), url);
239        } else if (element->hasTagName(HTMLNames::linkTag)) {
240            HTMLLinkElement* linkElement = static_cast<HTMLLinkElement*>(element);
241            if (CSSStyleSheet* sheet = linkElement->sheet()) {
242                KURL url = document->completeURL(linkElement->getAttribute(HTMLNames::hrefAttr));
243                serializeCSSStyleSheet(sheet, url);
244                ASSERT(m_resourceURLs.contains(url));
245            }
246        } else if (element->hasTagName(HTMLNames::styleTag)) {
247            HTMLStyleElement* styleElement = static_cast<HTMLStyleElement*>(element);
248            if (CSSStyleSheet* sheet = styleElement->sheet())
249                serializeCSSStyleSheet(sheet, KURL());
250        }
251    }
252
253    for (Frame* childFrame = frame->tree()->firstChild(); childFrame; childFrame = childFrame->tree()->nextSibling())
254        serializeFrame(childFrame);
255}
256
257void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet* styleSheet, const KURL& url)
258{
259    StringBuilder cssText;
260    for (unsigned i = 0; i < styleSheet->length(); ++i) {
261        CSSRule* rule = styleSheet->item(i);
262        String itemText = rule->cssText();
263        if (!itemText.isEmpty()) {
264            cssText.append(itemText);
265            if (i < styleSheet->length() - 1)
266                cssText.append("\n\n");
267        }
268        Document* document = styleSheet->ownerDocument();
269        // Some rules have resources associated with them that we need to retrieve.
270        if (rule->type() == CSSRule::IMPORT_RULE) {
271            CSSImportRule* importRule = static_cast<CSSImportRule*>(rule);
272            KURL importURL = document->completeURL(importRule->href());
273            if (m_resourceURLs.contains(importURL))
274                continue;
275            serializeCSSStyleSheet(importRule->styleSheet(), importURL);
276        } else if (rule->type() == CSSRule::FONT_FACE_RULE) {
277            // FIXME: Add support for font face rule. It is not clear to me at this point if the actual otf/eot file can
278            // be retrieved from the CSSFontFaceRule object.
279        } else if (rule->type() == CSSRule::STYLE_RULE)
280            retrieveResourcesForRule(static_cast<CSSStyleRule*>(rule)->styleRule(), document);
281    }
282
283    if (url.isValid() && !m_resourceURLs.contains(url)) {
284        // FIXME: We should check whether a charset has been specified and if none was found add one.
285        TextEncoding textEncoding(styleSheet->contents()->charset());
286        ASSERT(textEncoding.isValid());
287        String textString = cssText.toString();
288        CString text = textEncoding.encode(textString.characters(), textString.length(), EntitiesForUnencodables);
289        m_resources->append(Resource(url, String("text/css"), SharedBuffer::create(text.data(), text.length())));
290        m_resourceURLs.add(url);
291    }
292}
293
294void PageSerializer::addImageToResources(CachedImage* image, RenderObject* imageRenderer, const KURL& url)
295{
296    if (!url.isValid() || m_resourceURLs.contains(url))
297        return;
298
299    if (!image || image->image() == Image::nullImage())
300        return;
301
302    RefPtr<SharedBuffer> data = imageRenderer ? image->imageForRenderer(imageRenderer)->data() : 0;
303    if (!data)
304        data = image->image()->data();
305
306    if (!data) {
307        LOG_ERROR("No data for image %s", url.string().utf8().data());
308        return;
309    }
310
311    String mimeType = image->response().mimeType();
312    m_resources->append(Resource(url, mimeType, data));
313    m_resourceURLs.add(url);
314}
315
316void PageSerializer::retrieveResourcesForRule(StyleRule* rule, Document* document)
317{
318    retrieveResourcesForProperties(rule->properties(), document);
319}
320
321void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styleDeclaration, Document* document)
322{
323    if (!styleDeclaration)
324        return;
325
326    // The background-image and list-style-image (for ul or ol) are the CSS properties
327    // that make use of images. We iterate to make sure we include any other
328    // image properties there might be.
329    unsigned propertyCount = styleDeclaration->propertyCount();
330    for (unsigned i = 0; i < propertyCount; ++i) {
331        RefPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).value();
332        if (!cssValue->isImageValue())
333            continue;
334
335        CSSImageValue* imageValue = static_cast<CSSImageValue*>(cssValue.get());
336        StyleImage* styleImage = imageValue->cachedOrPendingImage();
337        // Non cached-images are just place-holders and do not contain data.
338        if (!styleImage || !styleImage->isCachedImage())
339            continue;
340
341        CachedImage* image = static_cast<StyleCachedImage*>(styleImage)->cachedImage();
342
343        KURL url = document->completeURL(image->url());
344        addImageToResources(image, 0, url);
345    }
346}
347
348KURL PageSerializer::urlForBlankFrame(Frame* frame)
349{
350    HashMap<Frame*, KURL>::iterator iter = m_blankFrameURLs.find(frame);
351    if (iter != m_blankFrameURLs.end())
352        return iter->value;
353    String url = "wyciwyg://frame/" + String::number(m_blankFrameCounter++);
354    KURL fakeURL(ParsedURLString, url);
355    m_blankFrameURLs.add(frame, fakeURL);
356
357    return fakeURL;
358}
359
360}
361