1/*
2 * Copyright (C) 2013 Google, Inc. All Rights Reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef HTMLToken_h
27#define HTMLToken_h
28
29#include "Attribute.h"
30#include "HTMLToken.h"
31#include <wtf/RefCounted.h>
32#include <wtf/RefPtr.h>
33#include <wtf/text/StringView.h>
34
35namespace WebCore {
36
37class DoctypeData {
38    WTF_MAKE_NONCOPYABLE(DoctypeData);
39public:
40    DoctypeData()
41        : m_hasPublicIdentifier(false)
42        , m_hasSystemIdentifier(false)
43        , m_forceQuirks(false)
44    {
45    }
46
47    // FIXME: This should use String instead of Vector<UChar>.
48    bool m_hasPublicIdentifier;
49    bool m_hasSystemIdentifier;
50    WTF::Vector<UChar> m_publicIdentifier;
51    WTF::Vector<UChar> m_systemIdentifier;
52    bool m_forceQuirks;
53};
54
55static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
56{
57    for (unsigned i = 0; i < attributes.size(); ++i) {
58        if (attributes.at(i).name().matches(name))
59            return &attributes.at(i);
60    }
61    return 0;
62}
63
64class HTMLToken {
65    WTF_MAKE_NONCOPYABLE(HTMLToken);
66    WTF_MAKE_FAST_ALLOCATED;
67public:
68    enum Type {
69        Uninitialized,
70        DOCTYPE,
71        StartTag,
72        EndTag,
73        Comment,
74        Character,
75        EndOfFile,
76    };
77
78    class Attribute {
79    public:
80        class Range {
81        public:
82            int start;
83            int end;
84        };
85
86        Range nameRange;
87        Range valueRange;
88        Vector<UChar, 32> name;
89        Vector<UChar, 32> value;
90    };
91
92    typedef Vector<Attribute, 10> AttributeList;
93    typedef Vector<UChar, 256> DataVector;
94
95    HTMLToken() { clear(); }
96
97    void clear()
98    {
99        m_type = Uninitialized;
100        m_range.start = 0;
101        m_range.end = 0;
102        m_baseOffset = 0;
103        m_data.clear();
104        m_orAllData = 0;
105    }
106
107    bool isUninitialized() { return m_type == Uninitialized; }
108    Type type() const { return m_type; }
109
110    void makeEndOfFile()
111    {
112        ASSERT(m_type == Uninitialized);
113        m_type = EndOfFile;
114    }
115
116    /* Range and offset methods exposed for HTMLSourceTracker */
117    int startIndex() const { return m_range.start; }
118    int endIndex() const { return m_range.end; }
119
120    void setBaseOffset(int offset)
121    {
122        m_baseOffset = offset;
123    }
124
125    void end(int endOffset)
126    {
127        m_range.end = endOffset - m_baseOffset;
128    }
129
130    const DataVector& data() const
131    {
132        ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
133        return m_data;
134    }
135
136    bool isAll8BitData() const
137    {
138        return (m_orAllData <= 0xff);
139    }
140
141    const DataVector& name() const
142    {
143        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
144        return m_data;
145    }
146
147    void appendToName(UChar character)
148    {
149        ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
150        ASSERT(character);
151        m_data.append(character);
152        m_orAllData |= character;
153    }
154
155    /* DOCTYPE Tokens */
156
157    bool forceQuirks() const
158    {
159        ASSERT(m_type == DOCTYPE);
160        return m_doctypeData->m_forceQuirks;
161    }
162
163    void setForceQuirks()
164    {
165        ASSERT(m_type == DOCTYPE);
166        m_doctypeData->m_forceQuirks = true;
167    }
168
169    void beginDOCTYPE()
170    {
171        ASSERT(m_type == Uninitialized);
172        m_type = DOCTYPE;
173        m_doctypeData = std::make_unique<DoctypeData>();
174    }
175
176    void beginDOCTYPE(UChar character)
177    {
178        ASSERT(character);
179        beginDOCTYPE();
180        m_data.append(character);
181        m_orAllData |= character;
182    }
183
184    // FIXME: Distinguish between a missing public identifer and an empty one.
185    const WTF::Vector<UChar>& publicIdentifier() const
186    {
187        ASSERT(m_type == DOCTYPE);
188        return m_doctypeData->m_publicIdentifier;
189    }
190
191    // FIXME: Distinguish between a missing system identifer and an empty one.
192    const WTF::Vector<UChar>& systemIdentifier() const
193    {
194        ASSERT(m_type == DOCTYPE);
195        return m_doctypeData->m_systemIdentifier;
196    }
197
198    void setPublicIdentifierToEmptyString()
199    {
200        ASSERT(m_type == DOCTYPE);
201        m_doctypeData->m_hasPublicIdentifier = true;
202        m_doctypeData->m_publicIdentifier.clear();
203    }
204
205    void setSystemIdentifierToEmptyString()
206    {
207        ASSERT(m_type == DOCTYPE);
208        m_doctypeData->m_hasSystemIdentifier = true;
209        m_doctypeData->m_systemIdentifier.clear();
210    }
211
212    void appendToPublicIdentifier(UChar character)
213    {
214        ASSERT(character);
215        ASSERT(m_type == DOCTYPE);
216        ASSERT(m_doctypeData->m_hasPublicIdentifier);
217        m_doctypeData->m_publicIdentifier.append(character);
218    }
219
220    void appendToSystemIdentifier(UChar character)
221    {
222        ASSERT(character);
223        ASSERT(m_type == DOCTYPE);
224        ASSERT(m_doctypeData->m_hasSystemIdentifier);
225        m_doctypeData->m_systemIdentifier.append(character);
226    }
227
228    std::unique_ptr<DoctypeData> releaseDoctypeData()
229    {
230        return WTF::move(m_doctypeData);
231    }
232
233    /* Start/End Tag Tokens */
234
235    bool selfClosing() const
236    {
237        ASSERT(m_type == StartTag || m_type == EndTag);
238        return m_selfClosing;
239    }
240
241    void setSelfClosing()
242    {
243        ASSERT(m_type == StartTag || m_type == EndTag);
244        m_selfClosing = true;
245    }
246
247    void beginStartTag(UChar character)
248    {
249        ASSERT(character);
250        ASSERT(m_type == Uninitialized);
251        m_type = StartTag;
252        m_selfClosing = false;
253        m_currentAttribute = 0;
254        m_attributes.clear();
255
256        m_data.append(character);
257        m_orAllData |= character;
258    }
259
260    void beginEndTag(LChar character)
261    {
262        ASSERT(m_type == Uninitialized);
263        m_type = EndTag;
264        m_selfClosing = false;
265        m_currentAttribute = 0;
266        m_attributes.clear();
267
268        m_data.append(character);
269    }
270
271    void beginEndTag(const Vector<LChar, 32>& characters)
272    {
273        ASSERT(m_type == Uninitialized);
274        m_type = EndTag;
275        m_selfClosing = false;
276        m_currentAttribute = 0;
277        m_attributes.clear();
278
279        m_data.appendVector(characters);
280    }
281
282    void addNewAttribute()
283    {
284        ASSERT(m_type == StartTag || m_type == EndTag);
285        m_attributes.grow(m_attributes.size() + 1);
286        m_currentAttribute = &m_attributes.last();
287#ifndef NDEBUG
288        m_currentAttribute->nameRange.start = 0;
289        m_currentAttribute->nameRange.end = 0;
290        m_currentAttribute->valueRange.start = 0;
291        m_currentAttribute->valueRange.end = 0;
292#endif
293    }
294
295    void beginAttributeName(int offset)
296    {
297        m_currentAttribute->nameRange.start = offset - m_baseOffset;
298    }
299
300    void endAttributeName(int offset)
301    {
302        int index = offset - m_baseOffset;
303        m_currentAttribute->nameRange.end = index;
304        m_currentAttribute->valueRange.start = index;
305        m_currentAttribute->valueRange.end = index;
306    }
307
308    void beginAttributeValue(int offset)
309    {
310        m_currentAttribute->valueRange.start = offset - m_baseOffset;
311#ifndef NDEBUG
312        m_currentAttribute->valueRange.end = 0;
313#endif
314    }
315
316    void endAttributeValue(int offset)
317    {
318        m_currentAttribute->valueRange.end = offset - m_baseOffset;
319    }
320
321    void appendToAttributeName(UChar character)
322    {
323        ASSERT(character);
324        ASSERT(m_type == StartTag || m_type == EndTag);
325        // FIXME: We should be able to add the following ASSERT once we fix
326        // https://bugs.webkit.org/show_bug.cgi?id=62971
327        //   ASSERT(m_currentAttribute->nameRange.start);
328        m_currentAttribute->name.append(character);
329    }
330
331    void appendToAttributeValue(UChar character)
332    {
333        ASSERT(character);
334        ASSERT(m_type == StartTag || m_type == EndTag);
335        ASSERT(m_currentAttribute->valueRange.start);
336        m_currentAttribute->value.append(character);
337    }
338
339    void appendToAttributeValue(size_t i, StringView value)
340    {
341        ASSERT(!value.isEmpty());
342        ASSERT(m_type == StartTag || m_type == EndTag);
343        append(m_attributes[i].value, value);
344    }
345
346    const AttributeList& attributes() const
347    {
348        ASSERT(m_type == StartTag || m_type == EndTag);
349        return m_attributes;
350    }
351
352    const Attribute* getAttributeItem(const QualifiedName& name) const
353    {
354        for (unsigned i = 0; i < m_attributes.size(); ++i) {
355            if (AtomicString(m_attributes.at(i).name) == name.localName())
356                return &m_attributes.at(i);
357        }
358        return 0;
359    }
360
361    // Used by the XSSAuditor to nuke XSS-laden attributes.
362    void eraseValueOfAttribute(size_t i)
363    {
364        ASSERT(m_type == StartTag || m_type == EndTag);
365        m_attributes[i].value.clear();
366    }
367
368    /* Character Tokens */
369
370    // Starting a character token works slightly differently than starting
371    // other types of tokens because we want to save a per-character branch.
372    void ensureIsCharacterToken()
373    {
374        ASSERT(m_type == Uninitialized || m_type == Character);
375        m_type = Character;
376    }
377
378    const DataVector& characters() const
379    {
380        ASSERT(m_type == Character);
381        return m_data;
382    }
383
384    void appendToCharacter(char character)
385    {
386        ASSERT(m_type == Character);
387        m_data.append(character);
388    }
389
390    void appendToCharacter(UChar character)
391    {
392        ASSERT(m_type == Character);
393        m_data.append(character);
394        m_orAllData |= character;
395    }
396
397    void appendToCharacter(const Vector<LChar, 32>& characters)
398    {
399        ASSERT(m_type == Character);
400        m_data.appendVector(characters);
401    }
402
403    /* Comment Tokens */
404
405    const DataVector& comment() const
406    {
407        ASSERT(m_type == Comment);
408        return m_data;
409    }
410
411    void beginComment()
412    {
413        ASSERT(m_type == Uninitialized);
414        m_type = Comment;
415    }
416
417    void appendToComment(UChar character)
418    {
419        ASSERT(character);
420        ASSERT(m_type == Comment);
421        m_data.append(character);
422        m_orAllData |= character;
423    }
424
425    void eraseCharacters()
426    {
427        ASSERT(m_type == Character);
428        m_data.clear();
429        m_orAllData = 0;
430    }
431
432private:
433    Type m_type;
434    Attribute::Range m_range; // Always starts at zero.
435    int m_baseOffset;
436    DataVector m_data;
437    UChar m_orAllData;
438
439    // For StartTag and EndTag
440    bool m_selfClosing;
441    AttributeList m_attributes;
442
443    // A pointer into m_attributes used during lexing.
444    Attribute* m_currentAttribute;
445
446    // For DOCTYPE
447    std::unique_ptr<DoctypeData> m_doctypeData;
448};
449
450}
451
452#endif
453