1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 *     * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 *     * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 *     * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31#include "config.h"
32
33#if ENABLE(MHTML)
34#include "MHTMLParser.h"
35
36#include "MHTMLArchive.h"
37#include "MIMEHeader.h"
38#include "MIMETypeRegistry.h"
39#include "QuotedPrintable.h"
40#include <wtf/HashMap.h>
41#include <wtf/text/Base64.h>
42
43namespace WebCore {
44
45static bool skipLinesUntilBoundaryFound(SharedBufferChunkReader& lineReader, const String& boundary)
46{
47    String line;
48    while (!(line = lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
49        if (line == boundary)
50            return true;
51    }
52    return false;
53}
54
55MHTMLParser::MHTMLParser(SharedBuffer* data)
56    : m_lineReader(data, "\r\n")
57{
58}
59
60PassRefPtr<MHTMLArchive> MHTMLParser::parseArchive()
61{
62    RefPtr<MIMEHeader> header = MIMEHeader::parseHeader(&m_lineReader);
63    return parseArchiveWithHeader(header.get());
64}
65
66PassRefPtr<MHTMLArchive> MHTMLParser::parseArchiveWithHeader(MIMEHeader* header)
67{
68    if (!header) {
69        LOG_ERROR("Failed to parse MHTML part: no header.");
70        return 0;
71    }
72
73    RefPtr<MHTMLArchive> archive = MHTMLArchive::create();
74    if (!header->isMultipart()) {
75        // With IE a page with no resource is not multi-part.
76        bool endOfArchiveReached = false;
77        RefPtr<ArchiveResource> resource = parseNextPart(*header, String(), String(), endOfArchiveReached);
78        if (!resource)
79            return 0;
80        archive->setMainResource(resource);
81        return archive;
82    }
83
84    // Skip the message content (it's a generic browser specific message).
85    skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
86
87    bool endOfArchive = false;
88    while (!endOfArchive) {
89        RefPtr<MIMEHeader> resourceHeader = MIMEHeader::parseHeader(&m_lineReader);
90        if (!resourceHeader) {
91            LOG_ERROR("Failed to parse MHTML, invalid MIME header.");
92            return 0;
93        }
94        if (resourceHeader->contentType() == "multipart/alternative") {
95            // Ignore IE nesting which makes little sense (IE seems to nest only some of the frames).
96            RefPtr<MHTMLArchive> subframeArchive = parseArchiveWithHeader(resourceHeader.get());
97            if (!subframeArchive) {
98                LOG_ERROR("Failed to parse MHTML subframe.");
99                return 0;
100            }
101            bool endOfPartReached = skipLinesUntilBoundaryFound(m_lineReader, header->endOfPartBoundary());
102            ASSERT_UNUSED(endOfPartReached, endOfPartReached);
103            // The top-frame is the first frame found, regardless of the nesting level.
104            if (subframeArchive->mainResource())
105                addResourceToArchive(subframeArchive->mainResource(), archive.get());
106            archive->addSubframeArchive(subframeArchive);
107            continue;
108        }
109
110        RefPtr<ArchiveResource> resource = parseNextPart(*resourceHeader, header->endOfPartBoundary(), header->endOfDocumentBoundary(), endOfArchive);
111        if (!resource) {
112            LOG_ERROR("Failed to parse MHTML part.");
113            return 0;
114        }
115        addResourceToArchive(resource.get(), archive.get());
116    }
117
118    return archive.release();
119}
120
121void MHTMLParser::addResourceToArchive(ArchiveResource* resource, MHTMLArchive* archive)
122{
123    const String& mimeType = resource->mimeType();
124    if (!MIMETypeRegistry::isSupportedNonImageMIMEType(mimeType) || MIMETypeRegistry::isSupportedJavaScriptMIMEType(mimeType) || mimeType == "text/css") {
125        m_resources.append(resource);
126        return;
127    }
128
129    // The first document suitable resource is the main frame.
130    if (!archive->mainResource()) {
131        archive->setMainResource(resource);
132        m_frames.append(archive);
133        return;
134    }
135
136    RefPtr<MHTMLArchive> subframe = MHTMLArchive::create();
137    subframe->setMainResource(resource);
138    m_frames.append(subframe);
139}
140
141PassRefPtr<ArchiveResource> MHTMLParser::parseNextPart(const MIMEHeader& mimeHeader, const String& endOfPartBoundary, const String& endOfDocumentBoundary, bool& endOfArchiveReached)
142{
143    ASSERT(endOfPartBoundary.isEmpty() == endOfDocumentBoundary.isEmpty());
144
145    RefPtr<SharedBuffer> content = SharedBuffer::create();
146    const bool checkBoundary = !endOfPartBoundary.isEmpty();
147    bool endOfPartReached = false;
148    if (mimeHeader.contentTransferEncoding() == MIMEHeader::Binary) {
149        if (!checkBoundary) {
150            LOG_ERROR("Binary contents requires end of part");
151            return 0;
152        }
153        m_lineReader.setSeparator(endOfPartBoundary.utf8().data());
154        Vector<char> part;
155        if (!m_lineReader.nextChunk(part)) {
156            LOG_ERROR("Binary contents requires end of part");
157            return 0;
158         }
159         content->append(part);
160         m_lineReader.setSeparator("\r\n");
161         Vector<char> nextChars;
162         if (m_lineReader.peek(nextChars, 2) != 2) {
163             LOG_ERROR("Invalid seperator.");
164             return 0;
165         }
166         endOfPartReached = true;
167         ASSERT(nextChars.size() == 2);
168         endOfArchiveReached = (nextChars[0] == '-' && nextChars[1] == '-');
169         if (!endOfArchiveReached) {
170             String line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback();
171             if (!line.isEmpty()) {
172                 LOG_ERROR("No CRLF at end of binary section.");
173                 return 0;
174             }
175         }
176    } else {
177        String line;
178        while (!(line = m_lineReader.nextChunkAsUTF8StringWithLatin1Fallback()).isNull()) {
179            endOfArchiveReached = (line == endOfDocumentBoundary);
180            if (checkBoundary && (line == endOfPartBoundary || endOfArchiveReached)) {
181                endOfPartReached = true;
182                break;
183            }
184            // Note that we use line.utf8() and not line.ascii() as ascii turns special characters (such as tab, line-feed...) into '?'.
185            content->append(line.utf8().data(), line.length());
186            if (mimeHeader.contentTransferEncoding() == MIMEHeader::QuotedPrintable) {
187                // The line reader removes the \r\n, but we need them for the content in this case as the QuotedPrintable decoder expects CR-LF terminated lines.
188                content->append("\r\n", 2);
189            }
190        }
191    }
192    if (!endOfPartReached && checkBoundary) {
193        LOG_ERROR("No bounday found for MHTML part.");
194        return 0;
195    }
196
197    Vector<char> data;
198    switch (mimeHeader.contentTransferEncoding()) {
199    case MIMEHeader::Base64:
200        if (!base64Decode(content->data(), content->size(), data)) {
201            LOG_ERROR("Invalid base64 content for MHTML part.");
202            return 0;
203        }
204        break;
205    case MIMEHeader::QuotedPrintable:
206        quotedPrintableDecode(content->data(), content->size(), data);
207        break;
208    case MIMEHeader::SevenBit:
209    case MIMEHeader::Binary:
210        data.append(content->data(), content->size());
211        break;
212    default:
213        LOG_ERROR("Invalid encoding for MHTML part.");
214        return 0;
215    }
216    RefPtr<SharedBuffer> contentBuffer = SharedBuffer::adoptVector(data);
217    // FIXME: the URL in the MIME header could be relative, we should resolve it if it is.
218    // The specs mentions 5 ways to resolve a URL: http://tools.ietf.org/html/rfc2557#section-5
219    // IE and Firefox (UNMht) seem to generate only absolute URLs.
220    URL location = URL(URL(), mimeHeader.contentLocation());
221    return ArchiveResource::create(contentBuffer, location, mimeHeader.contentType(), mimeHeader.charset(), String());
222}
223
224size_t MHTMLParser::frameCount() const
225{
226    return m_frames.size();
227}
228
229MHTMLArchive* MHTMLParser::frameAt(size_t index) const
230{
231    return m_frames[index].get();
232}
233
234size_t MHTMLParser::subResourceCount() const
235{
236    return m_resources.size();
237}
238
239ArchiveResource* MHTMLParser::subResourceAt(size_t index) const
240{
241    return m_resources[index].get();
242}
243
244}
245#endif
246