1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33#if ENABLE(MHTML) 34#include "MHTMLArchive.h" 35 36#include "Document.h" 37#include "Frame.h" 38#include "MHTMLParser.h" 39#include "MIMETypeRegistry.h" 40#include "Page.h" 41#include "PageSerializer.h" 42#include "QuotedPrintable.h" 43#include "SchemeRegistry.h" 44#include "SharedBuffer.h" 45 46#include <wtf/CryptographicallyRandomNumber.h> 47#include <wtf/DateMath.h> 48#include <wtf/GregorianDateTime.h> 49#include <wtf/StdLibExtras.h> 50#include <wtf/text/Base64.h> 51#include <wtf/text/StringBuilder.h> 52 53#if HAVE(SYS_TIME_H) 54#include <sys/time.h> 55#endif 56#include <time.h> 57 58 59namespace WebCore { 60 61const char* const quotedPrintable = "quoted-printable"; 62const char* const base64 = "base64"; 63const char* const binary = "binary"; 64 65static String generateRandomBoundary() 66{ 67 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). 68 const size_t randomValuesLength = 10; 69 char randomValues[randomValuesLength]; 70 cryptographicallyRandomValues(&randomValues, randomValuesLength); 71 StringBuilder stringBuilder; 72 stringBuilder.append("----=_NextPart_000_"); 73 for (size_t i = 0; i < randomValuesLength; ++i) { 74 if (i == 2) 75 stringBuilder.append('_'); 76 else if (i == 6) 77 stringBuilder.append('.'); 78 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); 79 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); 80 } 81 return stringBuilder.toString(); 82} 83 84static String replaceNonPrintableCharacters(const String& text) 85{ 86 StringBuilder stringBuilder; 87 for (size_t i = 0; i < text.length(); ++i) { 88 if (isASCIIPrintable(text[i])) 89 stringBuilder.append(text[i]); 90 else 91 stringBuilder.append('?'); 92 } 93 return stringBuilder.toString(); 94} 95 96MHTMLArchive::MHTMLArchive() 97{ 98} 99 100MHTMLArchive::~MHTMLArchive() 101{ 102 // Because all frames know about each other we need to perform a deep clearing of the archives graph. 103 clearAllSubframeArchives(); 104} 105 106PassRefPtr<MHTMLArchive> MHTMLArchive::create() 107{ 108 return adoptRef(new MHTMLArchive); 109} 110 111PassRefPtr<MHTMLArchive> MHTMLArchive::create(const KURL& url, SharedBuffer* data) 112{ 113 // For security reasons we only load MHTML pages from local URLs. 114 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol())) 115 return 0; 116 117 MHTMLParser parser(data); 118 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); 119 if (!mainArchive) 120 return 0; // Invalid MHTML file. 121 122 // Since MHTML is a flat format, we need to make all frames aware of all resources. 123 for (size_t i = 0; i < parser.frameCount(); ++i) { 124 RefPtr<MHTMLArchive> archive = parser.frameAt(i); 125 for (size_t j = 1; j < parser.frameCount(); ++j) { 126 if (i != j) 127 archive->addSubframeArchive(parser.frameAt(j)); 128 } 129 for (size_t j = 0; j < parser.subResourceCount(); ++j) 130 archive->addSubresource(parser.subResourceAt(j)); 131 } 132 return mainArchive.release(); 133} 134 135PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page) 136{ 137 return generateMHTMLData(page, false); 138} 139 140PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(Page* page) 141{ 142 return generateMHTMLData(page, true); 143} 144 145PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page, bool useBinaryEncoding) 146{ 147 Vector<PageSerializer::Resource> resources; 148 PageSerializer pageSerializer(&resources); 149 pageSerializer.serialize(page); 150 151 String boundary = generateRandomBoundary(); 152 String endOfResourceBoundary = makeString("--", boundary, "\r\n"); 153 154 GregorianDateTime now; 155 now.setToCurrentLocalTime(); 156 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); 157 158 StringBuilder stringBuilder; 159 stringBuilder.append("From: <Saved by WebKit>\r\n"); 160 stringBuilder.append("Subject: "); 161 // We replace non ASCII characters with '?' characters to match IE's behavior. 162 stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame()->document()->title())); 163 stringBuilder.append("\r\nDate: "); 164 stringBuilder.append(dateString); 165 stringBuilder.append("\r\nMIME-Version: 1.0\r\n"); 166 stringBuilder.append("Content-Type: multipart/related;\r\n"); 167 stringBuilder.append("\ttype=\""); 168 stringBuilder.append(page->mainFrame()->document()->suggestedMIMEType()); 169 stringBuilder.append("\";\r\n"); 170 stringBuilder.append("\tboundary=\""); 171 stringBuilder.append(boundary); 172 stringBuilder.append("\"\r\n\r\n"); 173 174 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). 175 ASSERT(stringBuilder.toString().containsOnlyASCII()); 176 CString asciiString = stringBuilder.toString().utf8(); 177 RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create(); 178 mhtmlData->append(asciiString.data(), asciiString.length()); 179 180 for (size_t i = 0; i < resources.size(); ++i) { 181 const PageSerializer::Resource& resource = resources[i]; 182 183 stringBuilder.clear(); 184 stringBuilder.append(endOfResourceBoundary); 185 stringBuilder.append("Content-Type: "); 186 stringBuilder.append(resource.mimeType); 187 188 const char* contentEncoding = 0; 189 if (useBinaryEncoding) 190 contentEncoding = binary; 191 else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) 192 contentEncoding = quotedPrintable; 193 else 194 contentEncoding = base64; 195 196 stringBuilder.append("\r\nContent-Transfer-Encoding: "); 197 stringBuilder.append(contentEncoding); 198 stringBuilder.append("\r\nContent-Location: "); 199 stringBuilder.append(resource.url); 200 stringBuilder.append("\r\n\r\n"); 201 202 asciiString = stringBuilder.toString().utf8(); 203 mhtmlData->append(asciiString.data(), asciiString.length()); 204 205 if (!strcmp(contentEncoding, binary)) { 206 const char* data; 207 size_t position = 0; 208 while (size_t length = resource.data->getSomeData(data, position)) { 209 mhtmlData->append(data, length); 210 position += length; 211 } 212 } else { 213 // FIXME: ideally we would encode the content as a stream without having to fetch it all. 214 const char* data = resource.data->data(); 215 size_t dataLength = resource.data->size(); 216 Vector<char> encodedData; 217 if (!strcmp(contentEncoding, quotedPrintable)) { 218 quotedPrintableEncode(data, dataLength, encodedData); 219 mhtmlData->append(encodedData.data(), encodedData.size()); 220 mhtmlData->append("\r\n", 2); 221 } else { 222 ASSERT(!strcmp(contentEncoding, base64)); 223 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. 224 base64Encode(data, dataLength, encodedData); 225 const size_t maximumLineLength = 76; 226 size_t index = 0; 227 size_t encodedDataLength = encodedData.size(); 228 do { 229 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); 230 mhtmlData->append(encodedData.data() + index, lineLength); 231 mhtmlData->append("\r\n", 2); 232 index += maximumLineLength; 233 } while (index < encodedDataLength); 234 } 235 } 236 } 237 238 asciiString = makeString("--", boundary, "--\r\n").utf8(); 239 mhtmlData->append(asciiString.data(), asciiString.length()); 240 241 return mhtmlData.release(); 242} 243 244} 245#endif 246