1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions are 6 * met: 7 * 8 * * Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * * Redistributions in binary form must reproduce the above 11 * copyright notice, this list of conditions and the following disclaimer 12 * in the documentation and/or other materials provided with the 13 * distribution. 14 * * Neither the name of Google Inc. nor the names of its 15 * contributors may be used to endorse or promote products derived from 16 * this software without specific prior written permission. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 */ 30 31#include "config.h" 32 33#if ENABLE(MHTML) 34#include "MHTMLParser.h" 35 36#include "MHTMLArchive.h" 37#include "MIMEHeader.h" 38#include "MIMETypeRegistry.h" 39#include "QuotedPrintable.h" 40#include <wtf/HashMap.h> 41#include <wtf/text/Base64.h> 42 43namespace WebCore { 44 45static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary) 46{ 47 String line; 48 while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 49 if (line == boundary) 50 return true; 51 } 52 return false; 53} 54 55MHTMLParser::MHTMLParser(SharedBuffer* data) 56 : m_lineReader(data, "\r\n") 57{ 58} 59 60PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive() 61{ 62 RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader); 63 return parseArchiveWithHeader(header.get()); 64} 65 66PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header) 67{ 68 if (!header) { 69 LOG_ERROR("Failed to parse MHTML part: no header."); 70 return 0; 71 } 72 73 RefPtr<MHTMLArchive> archive = MHTMLArchive::create(); 74 if (!header->isMultipart()) { 75 // With IE a page with no resource is not multi-part. 76 bool endOfArchiveReached = false; 77 RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached); 78 if (!resource) 79 return 0; 80 archive->setMainResource(resource); 81 return archive; 82 } 83 84 // Skip the message content (it's a generic browser specific message). 85 skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 86 87 bool endOfArchive = false; 88 while (!endOfArchive) { 89 RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader); 90 if (!resourceHeader) { 91 LOG_ERROR("Failed to parse MHTML, invalid MIME header."); 92 return 0; 93 } 94 if (resourceHeader->contentType() == "multipart/alternative") { 95 // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames). 96 RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get()); 97 if (!subframeArchive) { 98 LOG_ERROR("Failed to parse MHTML subframe."); 99 return 0; 100 } 101 bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary()); 102 ASSERT_UNUSED(endOfPartReached, endOfPartReached); 103 // The top-frame is the first frame found, regardless of the nesting level. 104 if (subframeArchive->mainResource()) 105 addResourceToArchive(subframeArchive->mainResource(), archive.get()); 106 archive->addSubframeArchive(subframeArchive); 107 continue; 108 } 109 110 RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive); 111 if (!resource) { 112 LOG_ERROR("Failed to parse MHTML part."); 113 return 0; 114 } 115 addResourceToArchive(resource.get(), archive.get()); 116 } 117 118 return archive.release(); 119} 120 121void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive) 122{ 123 const String& mimeType = resource->mimeType(); 124 if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") { 125 m_resources.append(resource); 126 return; 127 } 128 129 // The first document suitable resource is the main frame. 130 if (!archive->mainResource()) { 131 archive->setMainResource(resource); 132 m_frames.append(archive); 133 return; 134 } 135 136 RefPtr<MHTMLArchive> subframe = MHTMLArchive::create(); 137 subframe->setMainResource(resource); 138 m_frames.append(subframe); 139} 140 141PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached) 142{ 143 ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty()); 144 145 RefPtr<SharedBuffer> content = SharedBuffer::create(); 146 const bool checkBoundary = !endOfPartBoundary.isEmpty(); 147 bool endOfPartReached = false; 148 if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) { 149 if (!checkBoundary) { 150 LOG_ERROR("Binary contents requires end of part"); 151 return 0; 152 } 153 m_lineReader.setSeparator(endOfPartBoundary.utf8().data()); 154 Vector<char> part; 155 if (!m_lineReader.nextChunk(part)) { 156 LOG_ERROR("Binary contents requires end of part"); 157 return 0; 158 } 159 content->append(part); 160 m_lineReader.setSeparator("\r\n"); 161 Vector<char> nextChars; 162 if (m_lineReader.peek(nextChars, 2) != 2) { 163 LOG_ERROR("Invalid seperator."); 164 return 0; 165 } 166 endOfPartReached = true; 167 ASSERT(nextChars.size() == 2); 168 endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-'); 169 if (!endOfArchiveReached) { 170 String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback(); 171 if (!line.isEmpty()) { 172 LOG_ERROR("No CRLF at end of binary section."); 173 return 0; 174 } 175 } 176 } else { 177 String line; 178 while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) { 179 endOfArchiveReached = (line == endOfDocumentBoundary); 180 if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) { 181 endOfPartReached = true; 182 break; 183 } 184 // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'. 185 content->append(line.utf8().data(), line.length()); 186 if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) { 187 // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines. 188 content->append("\r\n", 2); 189 } 190 } 191 } 192 if (!endOfPartReached && checkBoundary) { 193 LOG_ERROR("No bounday found for MHTML part."); 194 return 0; 195 } 196 197 Vector<char> data; 198 switch (mimeHeader.contentTransferEncoding()) { 199 case MIMEHeader::Base64: 200 if (!base64Decode(content->data(), content->size(), data)) { 201 LOG_ERROR("Invalid base64 content for MHTML part."); 202 return 0; 203 } 204 break; 205 case MIMEHeader::QuotedPrintable: 206 quotedPrintableDecode(content->data(), content->size(), data); 207 break; 208 case MIMEHeader::SevenBit: 209 case MIMEHeader::Binary: 210 data.append(content->data(), content->size()); 211 break; 212 default: 213 LOG_ERROR("Invalid encoding for MHTML part."); 214 return 0; 215 } 216 RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data); 217 // FIXME: the URL in the MIME header could be relative, we should resolve it if it is. 218 // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5 219 // IE and Firefox (UNMht) seem to generate only absolute URLs. 220 URL location = URL(URL(), mimeHeader.contentLocation()); 221 return ArchiveResource::create(contentBuffer, location, mimeHeader.contentType(), mimeHeader.charset(), String()); 222} 223 224size_t MHTMLParser::frameCount() const 225{ 226 return m_frames.size(); 227} 228 229MHTMLArchive* MHTMLParser::frameAt(size_t index) const 230{ 231 return m_frames[index].get(); 232} 233 234size_t MHTMLParser::subResourceCount() const 235{ 236 return m_resources.size(); 237} 238 239ArchiveResource* MHTMLParser::subResourceAt(size_t index) const 240{ 241 return m_resources[index].get(); 242} 243 244} 245#endif 246