1/*
2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "TextCodecUTF8.h"
28
29#include "TextCodecASCIIFastPath.h"
30#include <wtf/text/CString.h>
31#include <wtf/text/StringBuffer.h>
32#include <wtf/unicode/CharacterNames.h>
33
34using namespace WTF;
35using namespace WTF::Unicode;
36
37namespace WebCore {
38
// Sentinel used in place of a decoded code point when a byte sequence is not
// well-formed UTF-8. It is negative, so it can never collide with a real
// code point.
const int nonCharacter = -1;
40
41PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
42{
43    return adoptPtr(new TextCodecUTF8);
44}
45
46void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
47{
48    registrar("UTF-8", "UTF-8");
49
50    // Additional aliases that originally were present in the encoding
51    // table in WebKit on Macintosh, and subsequently added by
52    // TextCodecICU. Perhaps we can prove some are not used on the web
53    // and remove them.
54    registrar("unicode11utf8", "UTF-8");
55    registrar("unicode20utf8", "UTF-8");
56    registrar("utf8", "UTF-8");
57    registrar("x-unicode20utf8", "UTF-8");
58}
59
60void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
61{
62    registrar("UTF-8", create, 0);
63}
64
// Maps the first byte of a sequence to the number of bytes the sequence is
// expected to occupy:
//   0x00-0x7F -> 0 (ASCII, not a multi-byte sequence)
//   0x80-0xDF -> 2 (this range includes continuation bytes and the overlong
//                   leads 0xC0/0xC1; decodeNonASCIISequence() rejects every
//                   2-byte lead below 0xC2)
//   0xE0-0xEF -> 3
//   0xF0-0xF4 -> 4
//   0xF5-0xFF -> 0 (can never start a sequence)
static inline int nonASCIISequenceLength(uint8_t firstByte)
{
    // Except for the 0xF5-0xFF tail, the length depends only on the high nibble.
    static const uint8_t lengthsByHighNibble[16] = {
        0, 0, 0, 0, 0, 0, 0, 0, // 0x00-0x7F
        2, 2, 2, 2, 2, 2,       // 0x80-0xDF
        3,                      // 0xE0-0xEF
        4                       // 0xF0-0xF4 (0xF5-0xFF handled below)
    };
    if (firstByte > 0xF4)
        return 0;
    return lengthsByHighNibble[firstByte >> 4];
}
87
88static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
89{
90    ASSERT(!isASCII(sequence[0]));
91    if (length == 2) {
92        ASSERT(sequence[0] <= 0xDF);
93        if (sequence[0] < 0xC2)
94            return nonCharacter;
95        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
96            return nonCharacter;
97        return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
98    }
99    if (length == 3) {
100        ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
101        switch (sequence[0]) {
102        case 0xE0:
103            if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
104                return nonCharacter;
105            break;
106        case 0xED:
107            if (sequence[1] < 0x80 || sequence[1] > 0x9F)
108                return nonCharacter;
109            break;
110        default:
111            if (sequence[1] < 0x80 || sequence[1] > 0xBF)
112                return nonCharacter;
113        }
114        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
115            return nonCharacter;
116        return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
117    }
118    ASSERT(length == 4);
119    ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
120    switch (sequence[0]) {
121    case 0xF0:
122        if (sequence[1] < 0x90 || sequence[1] > 0xBF)
123            return nonCharacter;
124        break;
125    case 0xF4:
126        if (sequence[1] < 0x80 || sequence[1] > 0x8F)
127            return nonCharacter;
128        break;
129    default:
130        if (sequence[1] < 0x80 || sequence[1] > 0xBF)
131            return nonCharacter;
132    }
133    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
134        return nonCharacter;
135    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
136        return nonCharacter;
137    return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
138}
139
140static inline UChar* appendCharacter(UChar* destination, int character)
141{
142    ASSERT(character != nonCharacter);
143    ASSERT(!U_IS_SURROGATE(character));
144    if (U_IS_BMP(character))
145        *destination++ = character;
146    else {
147        *destination++ = U16_LEAD(character);
148        *destination++ = U16_TRAIL(character);
149    }
150    return destination;
151}
152
153void TextCodecUTF8::consumePartialSequenceByte()
154{
155    --m_partialSequenceSize;
156    memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
157}
158
159void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
160{
161    sawError = true;
162    if (stopOnError)
163        return;
164    // Each error generates a replacement character and consumes one byte.
165    *destination++ = replacementCharacter;
166    consumePartialSequenceByte();
167}
168
169template <>
170bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
171{
172    ASSERT(m_partialSequenceSize);
173    do {
174        if (isASCII(m_partialSequence[0])) {
175            *destination++ = m_partialSequence[0];
176            consumePartialSequenceByte();
177            continue;
178        }
179        int count = nonASCIISequenceLength(m_partialSequence[0]);
180        if (!count)
181            return true;
182
183        if (count > m_partialSequenceSize) {
184            if (count - m_partialSequenceSize > end - source) {
185                if (!flush) {
186                    // The new data is not enough to complete the sequence, so
187                    // add it to the existing partial sequence.
188                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
189                    m_partialSequenceSize += end - source;
190                    return false;
191                }
192                // An incomplete partial sequence at the end is an error, but it will create
193                // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
194                // the error.
195                return true;
196            }
197            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
198            source += count - m_partialSequenceSize;
199            m_partialSequenceSize = count;
200        }
201        int character = decodeNonASCIISequence(m_partialSequence, count);
202        if ((character == nonCharacter) || (character > 0xff))
203            return true;
204
205        m_partialSequenceSize -= count;
206        *destination++ = character;
207    } while (m_partialSequenceSize);
208
209    return false;
210}
211
212template <>
213bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
214{
215    ASSERT(m_partialSequenceSize);
216    do {
217        if (isASCII(m_partialSequence[0])) {
218            *destination++ = m_partialSequence[0];
219            consumePartialSequenceByte();
220            continue;
221        }
222        int count = nonASCIISequenceLength(m_partialSequence[0]);
223        if (!count) {
224            handleError(destination, stopOnError, sawError);
225            if (stopOnError)
226                return false;
227            continue;
228        }
229        if (count > m_partialSequenceSize) {
230            if (count - m_partialSequenceSize > end - source) {
231                if (!flush) {
232                    // The new data is not enough to complete the sequence, so
233                    // add it to the existing partial sequence.
234                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
235                    m_partialSequenceSize += end - source;
236                    return false;
237                }
238                // An incomplete partial sequence at the end is an error.
239                handleError(destination, stopOnError, sawError);
240                if (stopOnError)
241                    return false;
242                continue;
243            }
244            memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
245            source += count - m_partialSequenceSize;
246            m_partialSequenceSize = count;
247        }
248        int character = decodeNonASCIISequence(m_partialSequence, count);
249        if (character == nonCharacter) {
250            handleError(destination, stopOnError, sawError);
251            if (stopOnError)
252                return false;
253            continue;
254        }
255
256        m_partialSequenceSize -= count;
257        destination = appendCharacter(destination, character);
258    } while (m_partialSequenceSize);
259
260    return false;
261}
262
263String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
264{
265    // Each input byte might turn into a character.
266    // That includes all bytes in the partial-sequence buffer because
267    // each byte in an invalid sequence will turn into a replacement character.
268    StringBuffer<LChar> buffer(m_partialSequenceSize + length);
269
270    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
271    const uint8_t* end = source + length;
272    const uint8_t* alignedEnd = alignToMachineWord(end);
273    LChar* destination = buffer.characters();
274
275    do {
276        if (m_partialSequenceSize) {
277            // Explicitly copy destination and source pointers to avoid taking pointers to the
278            // local variables, which may harm code generation by disabling some optimizations
279            // in some compilers.
280            LChar* destinationForHandlePartialSequence = destination;
281            const uint8_t* sourceForHandlePartialSequence = source;
282            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
283                source = sourceForHandlePartialSequence;
284                goto upConvertTo16Bit;
285            }
286            destination = destinationForHandlePartialSequence;
287            source = sourceForHandlePartialSequence;
288            if (m_partialSequenceSize)
289                break;
290        }
291
292        while (source < end) {
293            if (isASCII(*source)) {
294                // Fast path for ASCII. Most UTF-8 text will be ASCII.
295                if (isAlignedToMachineWord(source)) {
296                    while (source < alignedEnd) {
297                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
298                        if (!isAllASCII<LChar>(chunk))
299                            break;
300                        copyASCIIMachineWord(destination, source);
301                        source += sizeof(MachineWord);
302                        destination += sizeof(MachineWord);
303                    }
304                    if (source == end)
305                        break;
306                    if (!isASCII(*source))
307                        continue;
308                }
309                *destination++ = *source++;
310                continue;
311            }
312            int count = nonASCIISequenceLength(*source);
313            int character;
314            if (!count)
315                character = nonCharacter;
316            else {
317                if (count > end - source) {
318                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
319                    ASSERT(!m_partialSequenceSize);
320                    m_partialSequenceSize = end - source;
321                    memcpy(m_partialSequence, source, m_partialSequenceSize);
322                    source = end;
323                    break;
324                }
325                character = decodeNonASCIISequence(source, count);
326            }
327            if (character == nonCharacter) {
328                sawError = true;
329                if (stopOnError)
330                    break;
331
332                goto upConvertTo16Bit;
333            }
334            if (character > 0xff)
335                goto upConvertTo16Bit;
336
337            source += count;
338            *destination++ = character;
339        }
340    } while (flush && m_partialSequenceSize);
341
342    buffer.shrink(destination - buffer.characters());
343
344    return String::adopt(buffer);
345
346upConvertTo16Bit:
347    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
348
349    UChar* destination16 = buffer16.characters();
350
351    // Copy the already converted characters
352    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
353        *destination16++ = *converted8++;
354
355    do {
356        if (m_partialSequenceSize) {
357            // Explicitly copy destination and source pointers to avoid taking pointers to the
358            // local variables, which may harm code generation by disabling some optimizations
359            // in some compilers.
360            UChar* destinationForHandlePartialSequence = destination16;
361            const uint8_t* sourceForHandlePartialSequence = source;
362            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
363            destination16 = destinationForHandlePartialSequence;
364            source = sourceForHandlePartialSequence;
365            if (m_partialSequenceSize)
366                break;
367        }
368
369        while (source < end) {
370            if (isASCII(*source)) {
371                // Fast path for ASCII. Most UTF-8 text will be ASCII.
372                if (isAlignedToMachineWord(source)) {
373                    while (source < alignedEnd) {
374                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
375                        if (!isAllASCII<LChar>(chunk))
376                            break;
377                        copyASCIIMachineWord(destination16, source);
378                        source += sizeof(MachineWord);
379                        destination16 += sizeof(MachineWord);
380                    }
381                    if (source == end)
382                        break;
383                    if (!isASCII(*source))
384                        continue;
385                }
386                *destination16++ = *source++;
387                continue;
388            }
389            int count = nonASCIISequenceLength(*source);
390            int character;
391            if (!count)
392                character = nonCharacter;
393            else {
394                if (count > end - source) {
395                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
396                    ASSERT(!m_partialSequenceSize);
397                    m_partialSequenceSize = end - source;
398                    memcpy(m_partialSequence, source, m_partialSequenceSize);
399                    source = end;
400                    break;
401                }
402                character = decodeNonASCIISequence(source, count);
403            }
404            if (character == nonCharacter) {
405                sawError = true;
406                if (stopOnError)
407                    break;
408                // Each error generates a replacement character and consumes one byte.
409                *destination16++ = replacementCharacter;
410                ++source;
411                continue;
412            }
413            source += count;
414            destination16 = appendCharacter(destination16, character);
415        }
416    } while (flush && m_partialSequenceSize);
417
418    buffer16.shrink(destination16 - buffer16.characters());
419
420    return String::adopt(buffer16);
421}
422
423CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
424{
425    // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
426    // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
427    // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
428    if (length > std::numeric_limits<size_t>::max() / 3)
429        CRASH();
430    Vector<uint8_t> bytes(length * 3);
431
432    size_t i = 0;
433    size_t bytesWritten = 0;
434    while (i < length) {
435        UChar32 character;
436        U16_NEXT(characters, i, length, character);
437        U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
438    }
439
440    return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
441}
442
443} // namespace WebCore
444