1/*
2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com>
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "config.h"
29#include "TextEncoding.h"
30
31#include "TextCodec.h"
32#include "TextEncodingRegistry.h"
33#include <unicode/unorm.h>
34#include <wtf/OwnPtr.h>
35#include <wtf/StdLibExtras.h>
36#include <wtf/text/CString.h>
37#include <wtf/text/StringView.h>
38
39namespace WebCore {
40
41static const TextEncoding& UTF7Encoding()
42{
43    static TextEncoding globalUTF7Encoding("UTF-7");
44    return globalUTF7Encoding;
45}
46
47TextEncoding::TextEncoding(const char* name)
48    : m_name(atomicCanonicalTextEncodingName(name))
49    , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
50{
51}
52
53TextEncoding::TextEncoding(const String& name)
54    : m_name(atomicCanonicalTextEncodingName(name))
55    , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
56{
57}
58
59String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
60{
61    if (!m_name)
62        return String();
63
64    return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
65}
66
67CString TextEncoding::encode(StringView text, UnencodableHandling handling) const
68{
69    if (!m_name)
70        return CString();
71
72    if (text.isEmpty())
73        return "";
74
75    // FIXME: What's the right place to do normalization?
76    // It's a little strange to do it inside the encode function.
77    // Perhaps normalization should be an explicit step done before calling encode.
78
79    auto upconvertedCharacters = text.upconvertedCharacters();
80
81    const UChar* source = upconvertedCharacters;
82    size_t sourceLength = text.length();
83
84    Vector<UChar> normalizedCharacters;
85
86    UErrorCode err = U_ZERO_ERROR;
87    if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
88        // First try using the length of the original string, since normalization to NFC rarely increases length.
89        normalizedCharacters.grow(sourceLength);
90        int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err);
91        if (err == U_BUFFER_OVERFLOW_ERROR) {
92            err = U_ZERO_ERROR;
93            normalizedCharacters.resize(normalizedLength);
94            normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
95        }
96        ASSERT(U_SUCCESS(err));
97
98        source = normalizedCharacters.data();
99        sourceLength = normalizedLength;
100    }
101
102    return newTextCodec(*this)->encode(source, sourceLength, handling);
103}
104
105const char* TextEncoding::domName() const
106{
107    if (noExtendedTextEncodingNameUsed())
108        return m_name;
109
110    // We treat EUC-KR as windows-949 (its superset), but need to expose
111    // the name 'EUC-KR' because the name 'windows-949' is not recognized by
112    // most Korean web servers even though they do use the encoding
113    // 'windows-949' with the name 'EUC-KR'.
114    // FIXME: This is not thread-safe. At the moment, this function is
115    // only accessed in a single thread, but eventually has to be made
116    // thread-safe along with usesVisualOrdering().
117    static const char* const a = atomicCanonicalTextEncodingName("windows-949");
118    if (m_name == a)
119        return "EUC-KR";
120    return m_name;
121}
122
123bool TextEncoding::usesVisualOrdering() const
124{
125    if (noExtendedTextEncodingNameUsed())
126        return false;
127
128    static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
129    return m_name == a;
130}
131
132bool TextEncoding::isJapanese() const
133{
134    return isJapaneseEncoding(m_name);
135}
136
137UChar TextEncoding::backslashAsCurrencySymbol() const
138{
139    return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
140}
141
142bool TextEncoding::isNonByteBasedEncoding() const
143{
144    if (noExtendedTextEncodingNameUsed()) {
145        return *this == UTF16LittleEndianEncoding()
146            || *this == UTF16BigEndianEncoding();
147    }
148
149    return *this == UTF16LittleEndianEncoding()
150        || *this == UTF16BigEndianEncoding()
151        || *this == UTF32BigEndianEncoding()
152        || *this == UTF32LittleEndianEncoding();
153}
154
155bool TextEncoding::isUTF7Encoding() const
156{
157    if (noExtendedTextEncodingNameUsed())
158        return false;
159
160    return *this == UTF7Encoding();
161}
162
163const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
164{
165    if (isNonByteBasedEncoding())
166        return UTF8Encoding();
167    return *this;
168}
169
// HTML5 specifies that UTF-8 be used in form submission when a form is
// part of a document in UTF-16, probably because UTF-16 is not a
// byte-based encoding and can contain 0x00. By extension, the same
// should be done for UTF-32. UTF-7 is a byte-based encoding, but it is
// fraught with problems and we'd rather steer clear of it.
175const TextEncoding& TextEncoding::encodingForFormSubmission() const
176{
177    if (isNonByteBasedEncoding() || isUTF7Encoding())
178        return UTF8Encoding();
179    return *this;
180}
181
182const TextEncoding& ASCIIEncoding()
183{
184    static TextEncoding globalASCIIEncoding("ASCII");
185    return globalASCIIEncoding;
186}
187
188const TextEncoding& Latin1Encoding()
189{
190    static TextEncoding globalLatin1Encoding("latin1");
191    return globalLatin1Encoding;
192}
193
194const TextEncoding& UTF16BigEndianEncoding()
195{
196    static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
197    return globalUTF16BigEndianEncoding;
198}
199
200const TextEncoding& UTF16LittleEndianEncoding()
201{
202    static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
203    return globalUTF16LittleEndianEncoding;
204}
205
206const TextEncoding& UTF32BigEndianEncoding()
207{
208    static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
209    return globalUTF32BigEndianEncoding;
210}
211
212const TextEncoding& UTF32LittleEndianEncoding()
213{
214    static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
215    return globalUTF32LittleEndianEncoding;
216}
217
218const TextEncoding& UTF8Encoding()
219{
220    static TextEncoding globalUTF8Encoding("UTF-8");
221    ASSERT(globalUTF8Encoding.isValid());
222    return globalUTF8Encoding;
223}
224
225const TextEncoding& WindowsLatin1Encoding()
226{
227    static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
228    return globalWindowsLatin1Encoding;
229}
230
231} // namespace WebCore
232