1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27
28#if ENABLE(THREADED_HTML_PARSER)
29
30#include "BackgroundHTMLParser.h"
31
32#include "HTMLDocumentParser.h"
33#include "HTMLParserIdioms.h"
34#include "HTMLParserThread.h"
35#include "HTMLTokenizer.h"
36#include "XSSAuditor.h"
37#include <wtf/MainThread.h>
38#include <wtf/text/TextPosition.h>
39
40namespace WebCore {
41
// Throttling of speculative tokenization.
//
// On a network with high latency and high bandwidth, using a device with a
// fast CPU, the background parser could end up speculatively tokenizing the
// whole document well ahead of when the main thread actually needs it.
// That wastes memory (and potentially time, if the speculation fails), so
// the number of outstanding speculations is limited, arbitrarily, to 10.
//
// The maximal memory spent speculating is approximately:
//     outstandingCheckpointLimit * pendingTokenLimit * sizeof(CompactToken)
// At time of writing, this is 10 * 1000 * 28 bytes = approx 280kb of memory.
//
// We use a separate low and high water mark to avoid constantly topping
// off the main thread's token buffer.
// These numbers have not been tuned.
static const size_t outstandingCheckpointLimit = 10;

// Chunks are limited to 1000 tokens, to make sure the main thread is never
// left waiting on the parser thread for tokens.
// This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408.
static const size_t pendingTokenLimit = 1000;
59
60using namespace HTMLNames;
61
62#ifndef NDEBUG
63
64static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens)
65{
66    for (size_t i = 0; i < tokens->size(); ++i)
67        ASSERT(tokens->at(i).isSafeToSendToAnotherThread());
68}
69
70static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads)
71{
72    for (size_t i = 0; i < preloads.size(); ++i)
73        ASSERT(preloads[i]->isSafeToSendToAnotherThread());
74}
75
76#endif
77
78BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
79    : m_weakFactory(reference, this)
80    , m_token(adoptPtr(new HTMLToken))
81    , m_tokenizer(HTMLTokenizer::create(config->options))
82    , m_treeBuilderSimulator(config->options)
83    , m_options(config->options)
84    , m_parser(config->parser)
85    , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream))
86    , m_xssAuditor(config->xssAuditor.release())
87    , m_preloadScanner(config->preloadScanner.release())
88{
89}
90
91void BackgroundHTMLParser::append(const String& input)
92{
93    ASSERT(!m_input.current().isClosed());
94    m_input.append(input);
95    pumpTokenizer();
96}
97
98void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint)
99{
100    m_parser = checkpoint->parser;
101    m_token = checkpoint->token.release();
102    m_tokenizer = checkpoint->tokenizer.release();
103    m_treeBuilderSimulator.setState(checkpoint->treeBuilderState);
104    m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput);
105    m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint);
106    pumpTokenizer();
107}
108
109void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint)
110{
111    // Note, we should not have to worry about the index being invalid
112    // as messages from the main thread will be processed in FIFO order.
113    m_input.invalidateCheckpointsBefore(inputCheckpoint);
114    pumpTokenizer();
115}
116
117void BackgroundHTMLParser::finish()
118{
119    markEndOfFile();
120    pumpTokenizer();
121}
122
123void BackgroundHTMLParser::stop()
124{
125    delete this;
126}
127
128void BackgroundHTMLParser::forcePlaintextForTextDocument()
129{
130    // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser)
131    // to force us into the PLAINTEXT state w/o using a <plaintext> tag.
132    // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons.
133    m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
134}
135
136void BackgroundHTMLParser::markEndOfFile()
137{
138    ASSERT(!m_input.current().isClosed());
139    m_input.append(String(&kEndOfFileMarker, 1));
140    m_input.close();
141}
142
143void BackgroundHTMLParser::pumpTokenizer()
144{
145    // No need to start speculating until the main thread has almost caught up.
146    if (m_input.outstandingCheckpointCount() > outstandingCheckpointLimit)
147        return;
148
149    while (true) {
150        m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token);
151        if (!m_tokenizer->nextToken(m_input.current(), *m_token.get())) {
152            // We've reached the end of our current input.
153            sendTokensToMainThread();
154            break;
155        }
156        m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token);
157
158        {
159            TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn());
160
161            if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) {
162                xssInfo->m_textPosition = position;
163                m_pendingXSSInfos.append(xssInfo.release());
164            }
165
166            CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()));
167
168            m_preloadScanner->scan(token, m_pendingPreloads);
169
170            m_pendingTokens->append(token);
171        }
172
173        m_token->clear();
174
175        if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) {
176            sendTokensToMainThread();
177            // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory.
178            if (m_input.outstandingCheckpointCount() > outstandingCheckpointLimit)
179                break;
180        }
181    }
182}
183
184void BackgroundHTMLParser::sendTokensToMainThread()
185{
186    if (m_pendingTokens->isEmpty())
187        return;
188
189#ifndef NDEBUG
190    checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get());
191    checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads);
192#endif
193
194    OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk);
195    chunk->tokens = m_pendingTokens.release();
196    chunk->preloads.swap(m_pendingPreloads);
197    chunk->xssInfos.swap(m_pendingXSSInfos);
198    chunk->tokenizerState = m_tokenizer->state();
199    chunk->treeBuilderState = m_treeBuilderSimulator.state();
200    chunk->inputCheckpoint = m_input.createCheckpoint();
201    chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint();
202    callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release()));
203
204    m_pendingTokens = adoptPtr(new CompactHTMLTokenStream);
205}
206
207}
208
209#endif // ENABLE(THREADED_HTML_PARSER)
210