1/*
2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
3 * Copyright (C) 2007-2009 Torch Mobile, Inc.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "TextEncodingRegistry.h"
29
30#include "TextCodecICU.h"
31#include "TextCodecLatin1.h"
32#include "TextCodecUserDefined.h"
33#include "TextCodecUTF16.h"
34#include "TextCodecUTF8.h"
35#include "TextEncoding.h"
36#include <mutex>
37#include <wtf/ASCIICType.h>
38#include <wtf/HashMap.h>
39#include <wtf/HashSet.h>
40#include <wtf/MainThread.h>
41#include <wtf/NeverDestroyed.h>
42#include <wtf/StdLibExtras.h>
43#include <wtf/StringExtras.h>
44
45#if PLATFORM(COCOA)
46#include "WebCoreSystemInterface.h"
47#endif
48
49#if PLATFORM(MAC)
50#include "TextCodecMac.h"
51#endif
52
53#include <wtf/CurrentTime.h>
54#include <wtf/text/CString.h>
55
56using namespace WTF;
57
58namespace WebCore {
59
// Longest encoding name, in characters and excluding the terminating NUL, that the registry handles.
static const size_t maxEncodingNameLength = 63;
61
62// Hash for all-ASCII strings that does case folding.
63struct TextEncodingNameHash {
64    static bool equal(const char* s1, const char* s2)
65    {
66        char c1;
67        char c2;
68        do {
69            c1 = *s1++;
70            c2 = *s2++;
71            if (toASCIILower(c1) != toASCIILower(c2))
72                return false;
73        } while (c1 && c2);
74        return !c1 && !c2;
75    }
76
77    // This algorithm is the one-at-a-time hash from:
78    // http://burtleburtle.net/bob/hash/hashfaq.html
79    // http://burtleburtle.net/bob/hash/doobs.html
80    static unsigned hash(const char* s)
81    {
82        unsigned h = WTF::stringHashingStartValue;
83        for (;;) {
84            char c = *s++;
85            if (!c) {
86                h += (h << 3);
87                h ^= (h >> 11);
88                h += (h << 15);
89                return h;
90            }
91            h += toASCIILower(c);
92            h += (h << 10);
93            h ^= (h >> 6);
94        }
95    }
96
97    static const bool safeToCompareToEmptyOrDeleted = false;
98};
99
100struct TextCodecFactory {
101    NewTextCodecFunction function;
102    const void* additionalData;
103    TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
104};
105
106typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
107typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
108
109static std::mutex& encodingRegistryMutex()
110{
111    // We don't have to construct this mutex in a thread safe way because this function
112    // is called on the main thread for any page before it is used in worker threads.
113    static NeverDestroyed<std::mutex> mutex;
114
115    return mutex;
116}
117
118static TextEncodingNameMap* textEncodingNameMap;
119static TextCodecMap* textCodecMap;
120static bool didExtendTextCodecMaps;
121static HashSet<const char*>* japaneseEncodings;
122static HashSet<const char*>* nonBackslashEncodings;
123
124static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
125
126#if ERROR_DISABLED
127
128static inline void checkExistingName(const char*, const char*) { }
129
130#else
131
132static void checkExistingName(const char* alias, const char* atomicName)
133{
134    const char* oldAtomicName = textEncodingNameMap->get(alias);
135    if (!oldAtomicName)
136        return;
137    if (oldAtomicName == atomicName)
138        return;
139    // Keep the warning silent about one case where we know this will happen.
140    if (strcmp(alias, "ISO-8859-8-I") == 0
141            && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
142            && strcasecmp(atomicName, "iso-8859-8") == 0)
143        return;
144    LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
145}
146
147#endif
148
// Returns true for aliases that must not be registered even though a back-end reports them.
static bool isUndesiredAlias(const char* alias)
{
    // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    if (strchr(alias, ','))
        return true;

    // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    // problem, see bug 43554.
    return !strcmp(alias, "8859_1");
}
162
163static void addToTextEncodingNameMap(const char* alias, const char* name)
164{
165    ASSERT(strlen(alias) <= maxEncodingNameLength);
166    if (isUndesiredAlias(alias))
167        return;
168    const char* atomicName = textEncodingNameMap->get(name);
169    ASSERT(strcmp(alias, name) == 0 || atomicName);
170    if (!atomicName)
171        atomicName = name;
172    checkExistingName(alias, atomicName);
173    textEncodingNameMap->add(alias, atomicName);
174}
175
176static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
177{
178    const char* atomicName = textEncodingNameMap->get(name);
179    ASSERT(atomicName);
180    textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
181}
182
183static void pruneBlacklistedCodecs()
184{
185    for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
186        const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
187        if (!atomicName)
188            continue;
189
190        Vector<const char*> names;
191        TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
192        TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
193        for (; it != end; ++it) {
194            if (it->value == atomicName)
195                names.append(it->key);
196        }
197
198        size_t length = names.size();
199        for (size_t j = 0; j < length; ++j)
200            textEncodingNameMap->remove(names[j]);
201
202        textCodecMap->remove(atomicName);
203    }
204}
205
206static void buildBaseTextCodecMaps()
207{
208    ASSERT(isMainThread());
209    ASSERT(!textCodecMap);
210    ASSERT(!textEncodingNameMap);
211
212    textCodecMap = new TextCodecMap;
213    textEncodingNameMap = new TextEncodingNameMap;
214
215    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
216    TextCodecLatin1::registerCodecs(addToTextCodecMap);
217
218    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
219    TextCodecUTF8::registerCodecs(addToTextCodecMap);
220
221    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
222    TextCodecUTF16::registerCodecs(addToTextCodecMap);
223
224    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
225    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
226}
227
228static void addEncodingName(HashSet<const char*>* set, const char* name)
229{
230    // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
231    const char* atomicName = textEncodingNameMap->get(name);
232    if (atomicName)
233        set->add(atomicName);
234}
235
236static void buildQuirksSets()
237{
238    // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
239    // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
240
241    ASSERT(!japaneseEncodings);
242    ASSERT(!nonBackslashEncodings);
243
244    japaneseEncodings = new HashSet<const char*>;
245    addEncodingName(japaneseEncodings, "EUC-JP");
246    addEncodingName(japaneseEncodings, "ISO-2022-JP");
247    addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
248    addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
249    addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
250    addEncodingName(japaneseEncodings, "JIS_C6226-1978");
251    addEncodingName(japaneseEncodings, "JIS_X0201");
252    addEncodingName(japaneseEncodings, "JIS_X0208-1983");
253    addEncodingName(japaneseEncodings, "JIS_X0208-1990");
254    addEncodingName(japaneseEncodings, "JIS_X0212-1990");
255    addEncodingName(japaneseEncodings, "Shift_JIS");
256    addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
257    addEncodingName(japaneseEncodings, "cp932");
258    addEncodingName(japaneseEncodings, "x-mac-japanese");
259
260    nonBackslashEncodings = new HashSet<const char*>;
261    // The text encodings below treat backslash as a currency symbol for IE compatibility.
262    // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
263    addEncodingName(nonBackslashEncodings, "x-mac-japanese");
264    addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
265    addEncodingName(nonBackslashEncodings, "EUC-JP");
266    // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
267    addEncodingName(nonBackslashEncodings, "Shift_JIS");
268    addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
269}
270
271bool isJapaneseEncoding(const char* canonicalEncodingName)
272{
273    return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
274}
275
276bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
277{
278    return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
279}
280
281static void extendTextCodecMaps()
282{
283    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
284    TextCodecICU::registerCodecs(addToTextCodecMap);
285
286#if PLATFORM(MAC)
287    TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
288    TextCodecMac::registerCodecs(addToTextCodecMap);
289#endif
290
291    pruneBlacklistedCodecs();
292    buildQuirksSets();
293}
294
295PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
296{
297    std::lock_guard<std::mutex> lock(encodingRegistryMutex());
298
299    ASSERT(textCodecMap);
300    TextCodecFactory factory = textCodecMap->get(encoding.name());
301    ASSERT(factory.function);
302    return factory.function(encoding, factory.additionalData);
303}
304
305const char* atomicCanonicalTextEncodingName(const char* name)
306{
307    if (!name || !name[0])
308        return nullptr;
309
310    if (!textEncodingNameMap)
311        buildBaseTextCodecMaps();
312
313    std::lock_guard<std::mutex> lock(encodingRegistryMutex());
314
315    if (const char* atomicName = textEncodingNameMap->get(name))
316        return atomicName;
317    if (didExtendTextCodecMaps)
318        return nullptr;
319
320    extendTextCodecMaps();
321    didExtendTextCodecMaps = true;
322    return textEncodingNameMap->get(name);
323}
324
325template <typename CharacterType>
326const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
327{
328    char buffer[maxEncodingNameLength + 1];
329    size_t j = 0;
330    for (size_t i = 0; i < length; ++i) {
331        CharacterType c = characters[i];
332        if (j == maxEncodingNameLength)
333            return 0;
334        buffer[j++] = c;
335    }
336    buffer[j] = 0;
337    return atomicCanonicalTextEncodingName(buffer);
338}
339
340const char* atomicCanonicalTextEncodingName(const String& alias)
341{
342    if (!alias.length())
343        return nullptr;
344
345    if (alias.is8Bit())
346        return atomicCanonicalTextEncodingName(alias.characters8(), alias.length());
347
348    return atomicCanonicalTextEncodingName(alias.characters16(), alias.length());
349}
350
351bool noExtendedTextEncodingNameUsed()
352{
353    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
354    return !didExtendTextCodecMaps;
355}
356
357#if PLATFORM(COCOA)
358String defaultTextEncodingNameForSystemLanguage()
359{
360    String systemEncodingName = CFStringConvertEncodingToIANACharSetName(wkGetWebDefaultCFStringEncoding());
361
362    // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949.
363    // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949.
364    // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>.
365    // On some OS versions, the result is CP949 (uppercase).
366    if (equalIgnoringCase(systemEncodingName, "cp949"))
367        systemEncodingName = "ks_c_5601-1987";
368    return systemEncodingName;
369}
370#endif
371
372#ifndef NDEBUG
373void dumpTextEncodingNameMap()
374{
375    unsigned size = textEncodingNameMap->size();
376    fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
377
378    std::lock_guard<std::mutex> lock(encodingRegistryMutex());
379
380    TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
381    TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
382    for (; it != end; ++it)
383        fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
384}
385#endif
386
387} // namespace WebCore
388