1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33#if ENABLE(MHTML) 34 35#include "MHTMLArchive.h" 36 37#include "Document.h" 38#include "MHTMLParser.h" 39#include "MIMETypeRegistry.h" 40#include "MainFrame.h" 41#include "Page.h" 42#include "PageSerializer.h" 43#include "QuotedPrintable.h" 44#include "SchemeRegistry.h" 45#include "SharedBuffer.h" 46#include <time.h> 47#include <wtf/CryptographicallyRandomNumber.h> 48#include <wtf/DateMath.h> 49#include <wtf/GregorianDateTime.h> 50#include <wtf/StdLibExtras.h> 51#include <wtf/text/Base64.h> 52#include <wtf/text/StringBuilder.h> 53 54#if HAVE(SYS_TIME_H) 55#include <sys/time.h> 56#endif 57 58namespace WebCore { 59 60const char* const quotedPrintable = "quoted-printable"; 61const char* const base64 = "base64"; 62const char* const binary = "binary"; 63 64static String generateRandomBoundary() 65{ 66 // Trying to generate random boundaries similar to IE/UnMHT (ex: ----=_NextPart_000_001B_01CC157B.96F808A0). 67 const size_t randomValuesLength = 10; 68 char randomValues[randomValuesLength]; 69 cryptographicallyRandomValues(&randomValues, randomValuesLength); 70 StringBuilder stringBuilder; 71 stringBuilder.append("----=_NextPart_000_"); 72 for (size_t i = 0; i < randomValuesLength; ++i) { 73 if (i == 2) 74 stringBuilder.append('_'); 75 else if (i == 6) 76 stringBuilder.append('.'); 77 stringBuilder.append(lowerNibbleToASCIIHexDigit(randomValues[i])); 78 stringBuilder.append(upperNibbleToASCIIHexDigit(randomValues[i])); 79 } 80 return stringBuilder.toString(); 81} 82 83static String replaceNonPrintableCharacters(const String& text) 84{ 85 StringBuilder stringBuilder; 86 for (size_t i = 0; i < text.length(); ++i) { 87 if (isASCIIPrintable(text[i])) 88 stringBuilder.append(text[i]); 89 else 90 stringBuilder.append('?'); 91 } 92 return stringBuilder.toString(); 93} 94 95MHTMLArchive::MHTMLArchive() 96{ 97} 98 99MHTMLArchive::~MHTMLArchive() 100{ 101 // Because all frames know about each other we need to perform a deep clearing of the archives graph. 102 clearAllSubframeArchives(); 103} 104 105PassRefPtr<MHTMLArchive> MHTMLArchive::create() 106{ 107 return adoptRef(new MHTMLArchive); 108} 109 110PassRefPtr<MHTMLArchive> MHTMLArchive::create(const URL& url, SharedBuffer* data) 111{ 112 // For security reasons we only load MHTML pages from local URLs. 113 if (!SchemeRegistry::shouldTreatURLSchemeAsLocal(url.protocol())) 114 return 0; 115 116 MHTMLParser parser(data); 117 RefPtr<MHTMLArchive> mainArchive = parser.parseArchive(); 118 if (!mainArchive) 119 return 0; // Invalid MHTML file. 120 121 // Since MHTML is a flat format, we need to make all frames aware of all resources. 122 for (size_t i = 0; i < parser.frameCount(); ++i) { 123 RefPtr<MHTMLArchive> archive = parser.frameAt(i); 124 for (size_t j = 1; j < parser.frameCount(); ++j) { 125 if (i != j) 126 archive->addSubframeArchive(parser.frameAt(j)); 127 } 128 for (size_t j = 0; j < parser.subResourceCount(); ++j) 129 archive->addSubresource(parser.subResourceAt(j)); 130 } 131 return mainArchive.release(); 132} 133 134PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page) 135{ 136 return generateMHTMLData(page, false); 137} 138 139PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLDataUsingBinaryEncoding(Page* page) 140{ 141 return generateMHTMLData(page, true); 142} 143 144PassRefPtr<SharedBuffer> MHTMLArchive::generateMHTMLData(Page* page, bool useBinaryEncoding) 145{ 146 Vector<PageSerializer::Resource> resources; 147 PageSerializer pageSerializer(&resources); 148 pageSerializer.serialize(page); 149 150 String boundary = generateRandomBoundary(); 151 String endOfResourceBoundary = makeString("--", boundary, "\r\n"); 152 153 GregorianDateTime now; 154 now.setToCurrentLocalTime(); 155 String dateString = makeRFC2822DateString(now.weekDay(), now.monthDay(), now.month(), now.year(), now.hour(), now.minute(), now.second(), now.utcOffset() / 60); 156 157 StringBuilder stringBuilder; 158 stringBuilder.append("From: <Saved by WebKit>\r\n"); 159 stringBuilder.append("Subject: "); 160 // We replace non ASCII characters with '?' characters to match IE's behavior. 161 stringBuilder.append(replaceNonPrintableCharacters(page->mainFrame().document()->title())); 162 stringBuilder.append("\r\nDate: "); 163 stringBuilder.append(dateString); 164 stringBuilder.append("\r\nMIME-Version: 1.0\r\n"); 165 stringBuilder.append("Content-Type: multipart/related;\r\n"); 166 stringBuilder.append("\ttype=\""); 167 stringBuilder.append(page->mainFrame().document()->suggestedMIMEType()); 168 stringBuilder.append("\";\r\n"); 169 stringBuilder.append("\tboundary=\""); 170 stringBuilder.append(boundary); 171 stringBuilder.append("\"\r\n\r\n"); 172 173 // We use utf8() below instead of ascii() as ascii() replaces CRLFs with ?? (we still only have put ASCII characters in it). 174 ASSERT(stringBuilder.toString().containsOnlyASCII()); 175 CString asciiString = stringBuilder.toString().utf8(); 176 RefPtr<SharedBuffer> mhtmlData = SharedBuffer::create(); 177 mhtmlData->append(asciiString.data(), asciiString.length()); 178 179 for (size_t i = 0; i < resources.size(); ++i) { 180 const PageSerializer::Resource& resource = resources[i]; 181 182 stringBuilder.clear(); 183 stringBuilder.append(endOfResourceBoundary); 184 stringBuilder.append("Content-Type: "); 185 stringBuilder.append(resource.mimeType); 186 187 const char* contentEncoding = 0; 188 if (useBinaryEncoding) 189 contentEncoding = binary; 190 else if (MIMETypeRegistry::isSupportedJavaScriptMIMEType(resource.mimeType) || MIMETypeRegistry::isSupportedNonImageMIMEType(resource.mimeType)) 191 contentEncoding = quotedPrintable; 192 else 193 contentEncoding = base64; 194 195 stringBuilder.append("\r\nContent-Transfer-Encoding: "); 196 stringBuilder.append(contentEncoding); 197 stringBuilder.append("\r\nContent-Location: "); 198 stringBuilder.append(resource.url); 199 stringBuilder.append("\r\n\r\n"); 200 201 asciiString = stringBuilder.toString().utf8(); 202 mhtmlData->append(asciiString.data(), asciiString.length()); 203 204 if (!strcmp(contentEncoding, binary)) { 205 const char* data; 206 size_t position = 0; 207 while (size_t length = resource.data->getSomeData(data, position)) { 208 mhtmlData->append(data, length); 209 position += length; 210 } 211 } else { 212 // FIXME: ideally we would encode the content as a stream without having to fetch it all. 213 const char* data = resource.data->data(); 214 size_t dataLength = resource.data->size(); 215 Vector<char> encodedData; 216 if (!strcmp(contentEncoding, quotedPrintable)) { 217 quotedPrintableEncode(data, dataLength, encodedData); 218 mhtmlData->append(encodedData.data(), encodedData.size()); 219 mhtmlData->append("\r\n", 2); 220 } else { 221 ASSERT(!strcmp(contentEncoding, base64)); 222 // We are not specifying insertLFs = true below as it would cut the lines with LFs and MHTML requires CRLFs. 223 base64Encode(data, dataLength, encodedData); 224 const size_t maximumLineLength = 76; 225 size_t index = 0; 226 size_t encodedDataLength = encodedData.size(); 227 do { 228 size_t lineLength = std::min(encodedDataLength - index, maximumLineLength); 229 mhtmlData->append(encodedData.data() + index, lineLength); 230 mhtmlData->append("\r\n", 2); 231 index += maximumLineLength; 232 } while (index < encodedDataLength); 233 } 234 } 235 } 236 237 asciiString = makeString("--", boundary, "--\r\n").utf8(); 238 mhtmlData->append(asciiString.data(), asciiString.length()); 239 240 return mhtmlData.release(); 241} 242 243} 244 245#endif 246