1/*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "HTMLEntityParser.h"
30
31#include "CharacterReferenceParserInlines.h"
32#include "HTMLEntitySearch.h"
33#include "HTMLEntityTable.h"
34#include <wtf/text/StringBuilder.h>
35
36using namespace WTF;
37
38namespace WebCore {
39
40static const UChar windowsLatin1ExtensionArray[32] = {
41    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
42    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
43    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
44    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
45};
46
47static inline bool isAlphaNumeric(UChar cc)
48{
49    return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
50}
51
52class HTMLEntityParser {
53public:
54    inline static UChar adjustEntity(UChar32 value)
55    {
56        if ((value & ~0x1F) != 0x0080)
57            return value;
58        return windowsLatin1ExtensionArray[value - 0x80];
59    }
60
61    inline static UChar32 legalEntityFor(UChar32 value)
62    {
63        // FIXME: A number of specific entity values generate parse errors.
64        if (!value || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
65            return 0xFFFD;
66        if (U_IS_BMP(value))
67            return adjustEntity(value);
68        return value;
69    }
70
71    inline static bool acceptMalformed() { return true; }
72
73    inline static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
74    {
75        StringBuilder consumedCharacters;
76        HTMLEntitySearch entitySearch;
77        while (!source.isEmpty()) {
78            cc = source.currentChar();
79            entitySearch.advance(cc);
80            if (!entitySearch.isEntityPrefix())
81                break;
82            consumedCharacters.append(cc);
83            source.advanceAndASSERT(cc);
84        }
85        notEnoughCharacters = source.isEmpty();
86        if (notEnoughCharacters) {
87            // We can't an entity because there might be a longer entity
88            // that we could match if we had more data.
89            unconsumeCharacters(source, consumedCharacters);
90            return false;
91        }
92        if (!entitySearch.mostRecentMatch()) {
93            unconsumeCharacters(source, consumedCharacters);
94            return false;
95        }
96        if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
97            // We've consumed too many characters. We need to walk the
98            // source back to the point at which we had consumed an
99            // actual entity.
100            unconsumeCharacters(source, consumedCharacters);
101            consumedCharacters.clear();
102            const int length = entitySearch.mostRecentMatch()->length;
103            const UChar* reference = entitySearch.mostRecentMatch()->entity;
104            for (int i = 0; i < length; ++i) {
105                cc = source.currentChar();
106                ASSERT_UNUSED(reference, cc == *reference++);
107                consumedCharacters.append(cc);
108                source.advanceAndASSERT(cc);
109                ASSERT(!source.isEmpty());
110            }
111            cc = source.currentChar();
112        }
113        if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
114            || !additionalAllowedCharacter
115            || !(isAlphaNumeric(cc) || cc == '=')) {
116            decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
117            if (entitySearch.mostRecentMatch()->secondValue)
118                decodedEntity.append(entitySearch.mostRecentMatch()->secondValue);
119            return true;
120        }
121        unconsumeCharacters(source, consumedCharacters);
122        return false;
123    }
124};
125
126bool consumeHTMLEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
127{
128    return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
129}
130
131static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
132{
133    if (U_IS_BMP(value)) {
134        UChar character = static_cast<UChar>(value);
135        ASSERT(character == value);
136        result[0] = character;
137        return 1;
138    }
139
140    result[0] = U16_LEAD(value);
141    result[1] = U16_TRAIL(value);
142    return 2;
143}
144
145size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
146{
147    HTMLEntitySearch search;
148    while (*name) {
149        search.advance(*name++);
150        if (!search.isEntityPrefix())
151            return 0;
152    }
153    search.advance(';');
154    if (!search.isEntityPrefix())
155        return 0;
156
157    size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
158    if (!search.mostRecentMatch()->secondValue)
159        return numberOfCodePoints;
160    return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
161}
162
163} // namespace WebCore
164