1/*
2 * Copyright (C) 2011 Daniel Bates (dbates@intudata.com). All Rights Reserved.
3 * Copyright (c) 2012 Google, inc.  All Rights Reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. Neither the name of Google Inc. nor the names of its
14 *    contributors may be used to endorse or promote products derived from
15 *    this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
20 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
21 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
25 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#ifndef DecodeEscapeSequences_h
31#define DecodeEscapeSequences_h
32
33#include "TextEncoding.h"
34#include <wtf/ASCIICType.h>
35#include <wtf/Assertions.h>
36#include <wtf/text/StringBuilder.h>
37
38namespace WebCore {
39
40// See <http://en.wikipedia.org/wiki/Percent-encoding#Non-standard_implementations>.
41struct Unicode16BitEscapeSequence {
42    enum { sequenceSize = 6 }; // e.g. %u26C4
43    static size_t findInString(const String& string, size_t startPosition) { return string.find("%u", startPosition); }
44    static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition)
45    {
46        size_t runEnd = startPosition;
47        while (endPosition - runEnd >= sequenceSize && string[runEnd] == '%' && string[runEnd + 1] == 'u'
48               && isASCIIHexDigit(string[runEnd + 2]) && isASCIIHexDigit(string[runEnd + 3])
49               && isASCIIHexDigit(string[runEnd + 4]) && isASCIIHexDigit(string[runEnd + 5])) {
50            runEnd += sequenceSize;
51        }
52        return runEnd;
53    }
54    static String decodeRun(StringView run, const TextEncoding&)
55    {
56        // Each %u-escape sequence represents a UTF-16 code unit.
57        // See <http://www.w3.org/International/iri-edit/draft-duerst-iri.html#anchor29>.
58        // For 16-bit escape sequences, we know that findEndOfRun() has given us a contiguous run of sequences
59        // without any intervening characters, so decode the run without additional checks.
60        auto numberOfSequences = run.length() / sequenceSize;
61        StringBuilder builder;
62        builder.reserveCapacity(numberOfSequences);
63        while (numberOfSequences--) {
64            UChar codeUnit = (toASCIIHexValue(run[2]) << 12) | (toASCIIHexValue(run[3]) << 8) | (toASCIIHexValue(run[4]) << 4) | toASCIIHexValue(run[5]);
65            builder.append(codeUnit);
66            run = run.substring(sequenceSize);
67        }
68        return builder.toString();
69    }
70};
71
72struct URLEscapeSequence {
73    enum { sequenceSize = 3 }; // e.g. %41
74    static size_t findInString(const String& string, size_t startPosition) { return string.find('%', startPosition); }
75    static size_t findEndOfRun(const String& string, size_t startPosition, size_t endPosition)
76    {
77        // Make the simplifying assumption that supported encodings may have up to two unescaped characters
78        // in the range 0x40 - 0x7F as the trailing bytes of their sequences which need to be passed into the
79        // decoder as part of the run. In other words, we end the run at the first value outside of the
80        // 0x40 - 0x7F range, after two values in this range, or at a %-sign that does not introduce a valid
81        // escape sequence.
82        size_t runEnd = startPosition;
83        int numberOfTrailingCharacters = 0;
84        while (runEnd < endPosition) {
85            if (string[runEnd] == '%') {
86                if (endPosition - runEnd >= sequenceSize && isASCIIHexDigit(string[runEnd + 1]) && isASCIIHexDigit(string[runEnd + 2])) {
87                    runEnd += sequenceSize;
88                    numberOfTrailingCharacters = 0;
89                } else
90                    break;
91            } else if (string[runEnd] >= 0x40 && string[runEnd] <= 0x7F && numberOfTrailingCharacters < 2) {
92                runEnd += 1;
93                numberOfTrailingCharacters += 1;
94            } else
95                break;
96        }
97        return runEnd;
98    }
99    static String decodeRun(StringView run, const TextEncoding& encoding)
100    {
101        // For URL escape sequences, we know that findEndOfRun() has given us a run where every %-sign introduces
102        // a valid escape sequence, but there may be characters between the sequences.
103        Vector<char, 512> buffer;
104        buffer.resize(run.length()); // Unescaping hex sequences only makes the length smaller.
105        char* p = buffer.data();
106        while (!run.isEmpty()) {
107            if (run[0] == '%') {
108                *p++ = (toASCIIHexValue(run[1]) << 4) | toASCIIHexValue(run[2]);
109                run = run.substring(sequenceSize);
110            } else {
111                *p++ = run[0];
112                run = run.substring(1);
113            }
114        }
115        ASSERT(buffer.size() >= static_cast<size_t>(p - buffer.data())); // Prove buffer not overrun.
116        return (encoding.isValid() ? encoding : UTF8Encoding()).decode(buffer.data(), p - buffer.data());
117    }
118};
119
120template<typename EscapeSequence>
121String decodeEscapeSequences(const String& string, const TextEncoding& encoding)
122{
123    StringBuilder result;
124    size_t length = string.length();
125    size_t decodedPosition = 0;
126    size_t searchPosition = 0;
127    size_t encodedRunPosition;
128    while ((encodedRunPosition = EscapeSequence::findInString(string, searchPosition)) != notFound) {
129        size_t encodedRunEnd = EscapeSequence::findEndOfRun(string, encodedRunPosition, length);
130        searchPosition = encodedRunEnd;
131        if (encodedRunEnd == encodedRunPosition) {
132            ++searchPosition;
133            continue;
134        }
135
136        String decoded = EscapeSequence::decodeRun(StringView(string).substring(encodedRunPosition, encodedRunEnd - encodedRunPosition), encoding);
137        if (decoded.isEmpty())
138            continue;
139
140        result.append(string, decodedPosition, encodedRunPosition - decodedPosition);
141        result.append(decoded);
142        decodedPosition = encodedRunEnd;
143    }
144    result.append(string, decodedPosition, length - decodedPosition);
145    return result.toString();
146}
147
148} // namespace WebCore
149
150#endif // DecodeEscapeSequences_h
151