1/*
2 * Copyright (C) 2011, 2013 Google Inc.  All rights reserved.
3 * Copyright (C) 2014 Apple Inc.  All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are
7 * met:
8 *
9 *     * Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *     * Redistributions in binary form must reproduce the above
12 * copyright notice, this list of conditions and the following disclaimer
13 * in the documentation and/or other materials provided with the
14 * distribution.
15 *     * Neither the name of Google Inc. nor the names of its
16 * contributors may be used to endorse or promote products derived from
17 * this software without specific prior written permission.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include "config.h"
33
34#if ENABLE(VIDEO_TRACK)
35
36#include "WebVTTTokenizer.h"
37
38#include "MarkupTokenizerInlines.h"
39#include <wtf/text/StringBuilder.h>
40#include <wtf/unicode/CharacterNames.h>
41
42namespace WebCore {
43
// Opens the handler for one tokenizer state. The state name is emitted twice:
// once as a case label for the enclosing switch and once as a goto label that
// WEBVTT_ADVANCE_TO jumps to.
#define WEBVTT_BEGIN_STATE(name) case name: name:

// Transitions to nextState: records it in 'state', advances the preprocessor
// over m_input, reloads 'cc' from the preprocessor and jumps to the label of
// the same name. Expects 'state', 'cc', 'm_input' and
// 'm_inputStreamPreprocessor' to be in scope at the point of use.
#define WEBVTT_ADVANCE_TO(nextState)                               \
    do {                                                           \
        ASSERT(!m_input.isEmpty());                                \
        state = nextState;                                         \
        m_inputStreamPreprocessor.advance(m_input);                \
        cc = m_inputStreamPreprocessor.nextInputCharacter();       \
        goto nextState;                                            \
    } while (false)
53
54
55template<unsigned charactersCount>
56ALWAYS_INLINE bool equalLiteral(const StringBuilder& s, const char (&characters)[charactersCount])
57{
58    return WTF::equal(s, reinterpret_cast<const LChar*>(characters), charactersCount - 1);
59}
60
61static void addNewClass(StringBuilder& classes, const StringBuilder& newClass)
62{
63    if (!classes.isEmpty())
64        classes.append(' ');
65    classes.append(newClass);
66}
67
68inline bool emitToken(WebVTTToken& resultToken, const WebVTTToken& token)
69{
70    resultToken = token;
71    return true;
72}
73
74inline bool advanceAndEmitToken(SegmentedString& source, WebVTTToken& resultToken, const WebVTTToken& token)
75{
76    source.advanceAndUpdateLineNumber();
77    return emitToken(resultToken, token);
78}
79
80WebVTTTokenizer::WebVTTTokenizer(const String& input)
81    : m_input(input)
82    , m_inputStreamPreprocessor(this)
83{
84    // Append an EOF marker and close the input "stream".
85    ASSERT(!m_input.isClosed());
86    m_input.append(SegmentedString(String(&kEndOfFileMarker, 1)));
87    m_input.close();
88}
89
90bool WebVTTTokenizer::nextToken(WebVTTToken& token)
91{
92    if (m_input.isEmpty() || !m_inputStreamPreprocessor.peek(m_input))
93        return false;
94
95    UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
96    if (cc == kEndOfFileMarker) {
97        m_inputStreamPreprocessor.advance(m_input);
98        return false;
99    }
100
101    StringBuilder buffer;
102    StringBuilder result;
103    StringBuilder classes;
104
105    enum {
106        DataState,
107        EscapeState,
108        TagState,
109        StartTagState,
110        StartTagClassState,
111        StartTagAnnotationState,
112        EndTagState,
113        TimestampTagState,
114    } state = DataState;
115
116    // 4.8.10.13.4 WebVTT cue text tokenizer
117    switch (state) {
118    WEBVTT_BEGIN_STATE(DataState) {
119        if (cc == '&') {
120            buffer.append(static_cast<LChar>(cc));
121            WEBVTT_ADVANCE_TO(EscapeState);
122        } else if (cc == '<') {
123            if (result.isEmpty())
124                WEBVTT_ADVANCE_TO(TagState);
125            else {
126                // We don't want to advance input or perform a state transition - just return a (new) token.
127                // (On the next call to nextToken we will see '<' again, but take the other branch in this if instead.)
128                return emitToken(token, WebVTTToken::StringToken(result.toString()));
129            }
130        } else if (cc == kEndOfFileMarker)
131            return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString()));
132        else {
133            result.append(cc);
134            WEBVTT_ADVANCE_TO(DataState);
135        }
136    }
137    END_STATE()
138
139    WEBVTT_BEGIN_STATE(EscapeState) {
140        if (cc == ';') {
141            if (equalLiteral(buffer, "&amp"))
142                result.append('&');
143            else if (equalLiteral(buffer, "&lt"))
144                result.append('<');
145            else if (equalLiteral(buffer, "&gt"))
146                result.append('>');
147            else if (equalLiteral(buffer, "&lrm"))
148                result.append(leftToRightMark);
149            else if (equalLiteral(buffer, "&rlm"))
150                result.append(rightToLeftMark);
151            else if (equalLiteral(buffer, "&nbsp"))
152                result.append(noBreakSpace);
153            else {
154                buffer.append(static_cast<LChar>(cc));
155                result.append(buffer);
156            }
157            buffer.clear();
158            WEBVTT_ADVANCE_TO(DataState);
159        } else if (isASCIIAlphanumeric(cc)) {
160            buffer.append(static_cast<LChar>(cc));
161            WEBVTT_ADVANCE_TO(EscapeState);
162        } else if (cc == '<') {
163            result.append(buffer);
164            return emitToken(token, WebVTTToken::StringToken(result.toString()));
165        } else if (cc == kEndOfFileMarker) {
166            result.append(buffer);
167            return advanceAndEmitToken(m_input, token, WebVTTToken::StringToken(result.toString()));
168        } else {
169            result.append(buffer);
170            buffer.clear();
171
172            if (cc == '&') {
173                buffer.append(static_cast<LChar>(cc));
174                WEBVTT_ADVANCE_TO(EscapeState);
175            }
176            result.append(cc);
177            WEBVTT_ADVANCE_TO(DataState);
178        }
179    }
180    END_STATE()
181
182    WEBVTT_BEGIN_STATE(TagState) {
183        if (isTokenizerWhitespace(cc)) {
184            ASSERT(result.isEmpty());
185            WEBVTT_ADVANCE_TO(StartTagAnnotationState);
186        } else if (cc == '.') {
187            ASSERT(result.isEmpty());
188            WEBVTT_ADVANCE_TO(StartTagClassState);
189        } else if (cc == '/') {
190            WEBVTT_ADVANCE_TO(EndTagState);
191        } else if (WTF::isASCIIDigit(cc)) {
192            result.append(cc);
193            WEBVTT_ADVANCE_TO(TimestampTagState);
194        } else if (cc == '>' || cc == kEndOfFileMarker) {
195            ASSERT(result.isEmpty());
196            return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString()));
197        } else {
198            result.append(cc);
199            WEBVTT_ADVANCE_TO(StartTagState);
200        }
201    }
202    END_STATE()
203
204    WEBVTT_BEGIN_STATE(StartTagState) {
205        if (isTokenizerWhitespace(cc))
206            WEBVTT_ADVANCE_TO(StartTagAnnotationState);
207        else if (cc == '.')
208            WEBVTT_ADVANCE_TO(StartTagClassState);
209        else if (cc == '>' || cc == kEndOfFileMarker)
210            return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString()));
211        else {
212            result.append(cc);
213            WEBVTT_ADVANCE_TO(StartTagState);
214        }
215    }
216    END_STATE()
217
218    WEBVTT_BEGIN_STATE(StartTagClassState) {
219        if (isTokenizerWhitespace(cc)) {
220            addNewClass(classes, buffer);
221            buffer.clear();
222            WEBVTT_ADVANCE_TO(StartTagAnnotationState);
223        } else if (cc == '.') {
224            addNewClass(classes, buffer);
225            buffer.clear();
226            WEBVTT_ADVANCE_TO(StartTagClassState);
227        } else if (cc == '>' || cc == kEndOfFileMarker) {
228            addNewClass(classes, buffer);
229            buffer.clear();
230            return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomicString()));
231        } else {
232            buffer.append(cc);
233            WEBVTT_ADVANCE_TO(StartTagClassState);
234        }
235
236    }
237    END_STATE()
238
239    WEBVTT_BEGIN_STATE(StartTagAnnotationState) {
240        if (cc == '>' || cc == kEndOfFileMarker) {
241            return advanceAndEmitToken(m_input, token, WebVTTToken::StartTag(result.toString(), classes.toAtomicString(), buffer.toAtomicString()));
242        }
243        buffer.append(cc);
244        WEBVTT_ADVANCE_TO(StartTagAnnotationState);
245    }
246    END_STATE()
247
248    WEBVTT_BEGIN_STATE(EndTagState) {
249        if (cc == '>' || cc == kEndOfFileMarker)
250            return advanceAndEmitToken(m_input, token, WebVTTToken::EndTag(result.toString()));
251        result.append(cc);
252        WEBVTT_ADVANCE_TO(EndTagState);
253    }
254    END_STATE()
255
256    WEBVTT_BEGIN_STATE(TimestampTagState) {
257        if (cc == '>' || cc == kEndOfFileMarker)
258            return advanceAndEmitToken(m_input, token, WebVTTToken::TimestampTag(result.toString()));
259        result.append(cc);
260        WEBVTT_ADVANCE_TO(TimestampTagState);
261    }
262    END_STATE()
263
264    }
265
266    ASSERT_NOT_REACHED();
267    return false;
268}
269
270}
271
272#endif
273