1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#if ENABLE(MHTML)
34#include "MHTMLArchive.h"
35
36#include "Document.h"
37#include "Frame.h"
38#include "MHTMLParser.h"
39#include "MIMETypeRegistry.h"
40#include "Page.h"
41#include "PageSerializer.h"
42#include "QuotedPrintable.h"
43#include "SchemeRegistry.h"
44#include "SharedBuffer.h"
45
46#include <wtf/CryptographicallyRandomNumber.h>
47#include <wtf/DateMath.h>
48#include <wtf/GregorianDateTime.h>
49#include <wtf/StdLibExtras.h>
50#include <wtf/text/Base64.h>
51#include <wtf/text/StringBuilder.h>
52
53#if HAVE(SYS_TIME_H)
54#include <sys/time.h>
55#endif
56#include <time.h>
57
58
59namespace WebCore {
60
// Values written after "Content-Transfer-Encoding: " for each serialized resource.
const char quotedPrintable[] = "quoted-printable";
const char base64[] = "base64";
const char binary[] = "binary";
64
65static String generateRandomBoundary()
66{
67    // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0).
68    const size_t randomValuesLength = 10;
69    char randomValues[randomValuesLength];
70    cryptographicallyRandomValues(&randomValues, randomValuesLength);
71    StringBuilder stringBuilder;
72    stringBuilder.append("----=_NextPart_000_");
73    for (size_t i = 0; i < randomValuesLength; ++i) {
74        if (i == 2)
75            stringBuilder.append('_');
76        else if (i == 6)
77            stringBuilder.append('.');
78        stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i]));
79        stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i]));
80    }
81    return stringBuilder.toString();
82}
83
84static String replaceNonPrintableCharacters(const String& text)
85{
86    StringBuilder stringBuilder;
87    for (size_t i = 0; i < text.length(); ++i) {
88        if (isASCIIPrintable(text[i]))
89            stringBuilder.append(text[i]);
90        else
91            stringBuilder.append('?');
92    }
93    return stringBuilder.toString();
94}
95
96MHTMLArchive::MHTMLArchive()
97{
98}
99
100MHTMLArchive::~MHTMLArchive()
101{
102    // Because all frames know about each other we need to perform a deep clearing of the archives graph.
103    clearAllSubframeArchives();
104}
105
106PassRefPtr<MHTMLArchive> MHTMLArchive::create()
107{
108    return adoptRef(new MHTMLArchive);
109}
110
111PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data)
112{
113    // For security reasons we only load MHTML pages from local URLs.
114    if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol()))
115        return 0;
116
117    MHTMLParser parser(data);
118    RefPtr<MHTMLArchive> mainArchive = parser.parseArchive();
119    if (!mainArchive)
120        return 0; // Invalid MHTML file.
121
122    // Since MHTML is a flat format, we need to make all frames aware of all resources.
123    for (size_t i = 0; i < parser.frameCount(); ++i) {
124        RefPtr<MHTMLArchive> archive = parser.frameAt(i);
125        for (size_t j = 1; j < parser.frameCount(); ++j) {
126            if (i != j)
127                archive->addSubframeArchive(parser.frameAt(j));
128        }
129        for (size_t j = 0; j < parser.subResourceCount(); ++j)
130            archive->addSubresource(parser.subResourceAt(j));
131    }
132    return mainArchive.release();
133}
134
135PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page)
136{
137    return generateMHTMLData(page, false);
138}
139
140PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(Page* page)
141{
142    return generateMHTMLData(page, true);
143}
144
145PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page, bool useBinaryEncoding)
146{
147    Vector<PageSerializer::Resource> resources;
148    PageSerializer pageSerializer(&resources);
149    pageSerializer.serialize(page);
150
151    String boundary = generateRandomBoundary();
152    String endOfResourceBoundary = makeString("--", boundary, "\r\n");
153
154    GregorianDateTime now;
155    now.setToCurrentLocalTime();
156    String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60);
157
158    StringBuilder stringBuilder;
159    stringBuilder.append("From: <Saved by WebKit>\r\n");
160    stringBuilder.append("Subject: ");
161    // We replace non ASCII characters with '?' characters to match IE's behavior.
162    stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame()->document()->title()));
163    stringBuilder.append("\r\nDate: ");
164    stringBuilder.append(dateString);
165    stringBuilder.append("\r\nMIME-Version: 1.0\r\n");
166    stringBuilder.append("Content-Type: multipart/related;\r\n");
167    stringBuilder.append("\ttype=\"");
168    stringBuilder.append(page->mainFrame()->document()->suggestedMIMEType());
169    stringBuilder.append("\";\r\n");
170    stringBuilder.append("\tboundary=\"");
171    stringBuilder.append(boundary);
172    stringBuilder.append("\"\r\n\r\n");
173
174    // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it).
175    ASSERT(stringBuilder.toString().containsOnlyASCII());
176    CString asciiString = stringBuilder.toString().utf8();
177    RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create();
178    mhtmlData->append(asciiString.data(), asciiString.length());
179
180    for (size_t i = 0; i < resources.size(); ++i) {
181        const PageSerializer::Resource& resource = resources[i];
182
183        stringBuilder.clear();
184        stringBuilder.append(endOfResourceBoundary);
185        stringBuilder.append("Content-Type: ");
186        stringBuilder.append(resource.mimeType);
187
188        const char* contentEncoding = 0;
189        if (useBinaryEncoding)
190            contentEncoding = binary;
191        else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType))
192            contentEncoding = quotedPrintable;
193        else
194            contentEncoding = base64;
195
196        stringBuilder.append("\r\nContent-Transfer-Encoding: ");
197        stringBuilder.append(contentEncoding);
198        stringBuilder.append("\r\nContent-Location: ");
199        stringBuilder.append(resource.url);
200        stringBuilder.append("\r\n\r\n");
201
202        asciiString = stringBuilder.toString().utf8();
203        mhtmlData->append(asciiString.data(), asciiString.length());
204
205        if (!strcmp(contentEncoding, binary)) {
206            const char* data;
207            size_t position = 0;
208            while (size_t length = resource.data->getSomeData(data, position)) {
209                mhtmlData->append(data, length);
210                position += length;
211            }
212        } else {
213            // FIXME: ideally we would encode the content as a stream without having to fetch it all.
214            const char* data = resource.data->data();
215            size_t dataLength = resource.data->size();
216            Vector<char> encodedData;
217            if (!strcmp(contentEncoding, quotedPrintable)) {
218                quotedPrintableEncode(data, dataLength, encodedData);
219                mhtmlData->append(encodedData.data(), encodedData.size());
220                mhtmlData->append("\r\n", 2);
221            } else {
222                ASSERT(!strcmp(contentEncoding, base64));
223                // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs.
224                base64Encode(data, dataLength, encodedData);
225                const size_t maximumLineLength = 76;
226                size_t index = 0;
227                size_t encodedDataLength = encodedData.size();
228                do {
229                    size_t lineLength = std::min(encodedDataLength - index, maximumLineLength);
230                    mhtmlData->append(encodedData.data() + index, lineLength);
231                    mhtmlData->append("\r\n", 2);
232                    index += maximumLineLength;
233                } while (index < encodedDataLength);
234            }
235        }
236    }
237
238    asciiString = makeString("--", boundary, "--\r\n").utf8();
239    mhtmlData->append(asciiString.data(), asciiString.length());
240
241    return mhtmlData.release();
242}
243
244}
245#endif
246