1/* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27#include "config.h" 28#include "TextEncodingRegistry.h" 29 30#include "TextCodecICU.h" 31#include "TextCodecLatin1.h" 32#include "TextCodecUserDefined.h" 33#include "TextCodecUTF16.h" 34#include "TextCodecUTF8.h" 35#include "TextEncoding.h" 36#include <mutex> 37#include <wtf/ASCIICType.h> 38#include <wtf/HashMap.h> 39#include <wtf/HashSet.h> 40#include <wtf/MainThread.h> 41#include <wtf/NeverDestroyed.h> 42#include <wtf/StdLibExtras.h> 43#include <wtf/StringExtras.h> 44 45#if PLATFORM(COCOA) 46#include "WebCoreSystemInterface.h" 47#endif 48 49#if PLATFORM(MAC) 50#include "TextCodecMac.h" 51#endif 52 53#include <wtf/CurrentTime.h> 54#include <wtf/text/CString.h> 55 56using namespace WTF; 57 58namespace WebCore { 59 60const size_t maxEncodingNameLength = 63; 61 62// Hash for all-ASCII strings that does case folding. 63struct TextEncodingNameHash { 64 static bool equal(const char* s1, const char* s2) 65 { 66 char c1; 67 char c2; 68 do { 69 c1 = *s1++; 70 c2 = *s2++; 71 if (toASCIILower(c1) != toASCIILower(c2)) 72 return false; 73 } while (c1 && c2); 74 return !c1 && !c2; 75 } 76 77 // This algorithm is the one-at-a-time hash from: 78 // http://burtleburtle.net/bob/hash/hashfaq.html 79 // http://burtleburtle.net/bob/hash/doobs.html 80 static unsigned hash(const char* s) 81 { 82 unsigned h = WTF::stringHashingStartValue; 83 for (;;) { 84 char c = *s++; 85 if (!c) { 86 h += (h << 3); 87 h ^= (h >> 11); 88 h += (h << 15); 89 return h; 90 } 91 h += toASCIILower(c); 92 h += (h << 10); 93 h ^= (h >> 6); 94 } 95 } 96 97 static const bool safeToCompareToEmptyOrDeleted = false; 98}; 99 100struct TextCodecFactory { 101 NewTextCodecFunction function; 102 const void* additionalData; 103 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 104}; 105 106typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 107typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 108 109static std::mutex& encodingRegistryMutex() 110{ 111 // We don't have to construct this mutex in a thread safe way because this function 112 // is called on the main thread for any page before it is used in worker threads. 113 static NeverDestroyed<std::mutex> mutex; 114 115 return mutex; 116} 117 118static TextEncodingNameMap* textEncodingNameMap; 119static TextCodecMap* textCodecMap; 120static bool didExtendTextCodecMaps; 121static HashSet<const char*>* japaneseEncodings; 122static HashSet<const char*>* nonBackslashEncodings; 123 124static const char* const textEncodingNameBlacklist[] = { "UTF-7" }; 125 126#if ERROR_DISABLED 127 128static inline void checkExistingName(const char*, const char*) { } 129 130#else 131 132static void checkExistingName(const char* alias, const char* atomicName) 133{ 134 const char* oldAtomicName = textEncodingNameMap->get(alias); 135 if (!oldAtomicName) 136 return; 137 if (oldAtomicName == atomicName) 138 return; 139 // Keep the warning silent about one case where we know this will happen. 140 if (strcmp(alias, "ISO-8859-8-I") == 0 141 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 142 && strcasecmp(atomicName, "iso-8859-8") == 0) 143 return; 144 LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 145} 146 147#endif 148 149static bool isUndesiredAlias(const char* alias) 150{ 151 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 152 for (const char* p = alias; *p; ++p) { 153 if (*p == ',') 154 return true; 155 } 156 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 157 // problem, see bug 43554. 158 if (0 == strcmp(alias, "8859_1")) 159 return true; 160 return false; 161} 162 163static void addToTextEncodingNameMap(const char* alias, const char* name) 164{ 165 ASSERT(strlen(alias) <= maxEncodingNameLength); 166 if (isUndesiredAlias(alias)) 167 return; 168 const char* atomicName = textEncodingNameMap->get(name); 169 ASSERT(strcmp(alias, name) == 0 || atomicName); 170 if (!atomicName) 171 atomicName = name; 172 checkExistingName(alias, atomicName); 173 textEncodingNameMap->add(alias, atomicName); 174} 175 176static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 177{ 178 const char* atomicName = textEncodingNameMap->get(name); 179 ASSERT(atomicName); 180 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 181} 182 183static void pruneBlacklistedCodecs() 184{ 185 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 186 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 187 if (!atomicName) 188 continue; 189 190 Vector<const char*> names; 191 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 192 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 193 for (; it != end; ++it) { 194 if (it->value == atomicName) 195 names.append(it->key); 196 } 197 198 size_t length = names.size(); 199 for (size_t j = 0; j < length; ++j) 200 textEncodingNameMap->remove(names[j]); 201 202 textCodecMap->remove(atomicName); 203 } 204} 205 206static void buildBaseTextCodecMaps() 207{ 208 ASSERT(isMainThread()); 209 ASSERT(!textCodecMap); 210 ASSERT(!textEncodingNameMap); 211 212 textCodecMap = new TextCodecMap; 213 textEncodingNameMap = new TextEncodingNameMap; 214 215 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 216 TextCodecLatin1::registerCodecs(addToTextCodecMap); 217 218 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 219 TextCodecUTF8::registerCodecs(addToTextCodecMap); 220 221 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 222 TextCodecUTF16::registerCodecs(addToTextCodecMap); 223 224 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 225 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 226} 227 228static void addEncodingName(HashSet<const char*>* set, const char* name) 229{ 230 // We must not use atomicCanonicalTextEncodingName() because this function is called in it. 231 const char* atomicName = textEncodingNameMap->get(name); 232 if (atomicName) 233 set->add(atomicName); 234} 235 236static void buildQuirksSets() 237{ 238 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() 239 // and initializing the sets for them in TextEncodingRegistry.cpp look strange. 240 241 ASSERT(!japaneseEncodings); 242 ASSERT(!nonBackslashEncodings); 243 244 japaneseEncodings = new HashSet<const char*>; 245 addEncodingName(japaneseEncodings, "EUC-JP"); 246 addEncodingName(japaneseEncodings, "ISO-2022-JP"); 247 addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); 248 addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); 249 addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); 250 addEncodingName(japaneseEncodings, "JIS_C6226-1978"); 251 addEncodingName(japaneseEncodings, "JIS_X0201"); 252 addEncodingName(japaneseEncodings, "JIS_X0208-1983"); 253 addEncodingName(japaneseEncodings, "JIS_X0208-1990"); 254 addEncodingName(japaneseEncodings, "JIS_X0212-1990"); 255 addEncodingName(japaneseEncodings, "Shift_JIS"); 256 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); 257 addEncodingName(japaneseEncodings, "cp932"); 258 addEncodingName(japaneseEncodings, "x-mac-japanese"); 259 260 nonBackslashEncodings = new HashSet<const char*>; 261 // The text encodings below treat backslash as a currency symbol for IE compatibility. 262 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 263 addEncodingName(nonBackslashEncodings, "x-mac-japanese"); 264 addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); 265 addEncodingName(nonBackslashEncodings, "EUC-JP"); 266 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. 267 addEncodingName(nonBackslashEncodings, "Shift_JIS"); 268 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); 269} 270 271bool isJapaneseEncoding(const char* canonicalEncodingName) 272{ 273 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); 274} 275 276bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) 277{ 278 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); 279} 280 281static void extendTextCodecMaps() 282{ 283 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 284 TextCodecICU::registerCodecs(addToTextCodecMap); 285 286#if PLATFORM(MAC) 287 TextCodecMac::registerEncodingNames(addToTextEncodingNameMap); 288 TextCodecMac::registerCodecs(addToTextCodecMap); 289#endif 290 291 pruneBlacklistedCodecs(); 292 buildQuirksSets(); 293} 294 295PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 296{ 297 std::lock_guard<std::mutex> lock(encodingRegistryMutex()); 298 299 ASSERT(textCodecMap); 300 TextCodecFactory factory = textCodecMap->get(encoding.name()); 301 ASSERT(factory.function); 302 return factory.function(encoding, factory.additionalData); 303} 304 305const char* atomicCanonicalTextEncodingName(const char* name) 306{ 307 if (!name || !name[0]) 308 return nullptr; 309 310 if (!textEncodingNameMap) 311 buildBaseTextCodecMaps(); 312 313 std::lock_guard<std::mutex> lock(encodingRegistryMutex()); 314 315 if (const char* atomicName = textEncodingNameMap->get(name)) 316 return atomicName; 317 if (didExtendTextCodecMaps) 318 return nullptr; 319 320 extendTextCodecMaps(); 321 didExtendTextCodecMaps = true; 322 return textEncodingNameMap->get(name); 323} 324 325template <typename CharacterType> 326const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) 327{ 328 char buffer[maxEncodingNameLength + 1]; 329 size_t j = 0; 330 for (size_t i = 0; i < length; ++i) { 331 CharacterType c = characters[i]; 332 if (j == maxEncodingNameLength) 333 return 0; 334 buffer[j++] = c; 335 } 336 buffer[j] = 0; 337 return atomicCanonicalTextEncodingName(buffer); 338} 339 340const char* atomicCanonicalTextEncodingName(const String& alias) 341{ 342 if (!alias.length()) 343 return nullptr; 344 345 if (alias.is8Bit()) 346 return atomicCanonicalTextEncodingName(alias.characters8(), alias.length()); 347 348 return atomicCanonicalTextEncodingName(alias.characters16(), alias.length()); 349} 350 351bool noExtendedTextEncodingNameUsed() 352{ 353 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 354 return !didExtendTextCodecMaps; 355} 356 357#if PLATFORM(COCOA) 358String defaultTextEncodingNameForSystemLanguage() 359{ 360 String systemEncodingName = CFStringConvertEncodingToIANACharSetName(wkGetWebDefaultCFStringEncoding()); 361 362 // CFStringConvertEncodingToIANACharSetName() returns cp949 for kTextEncodingDOSKorean AKA "extended EUC-KR" AKA windows-949. 363 // ICU uses this name for a different encoding, so we need to change the name to a value that actually gives us windows-949. 364 // In addition, this value must match what is used in Safari, see <rdar://problem/5579292>. 365 // On some OS versions, the result is CP949 (uppercase). 366 if (equalIgnoringCase(systemEncodingName, "cp949")) 367 systemEncodingName = "ks_c_5601-1987"; 368 return systemEncodingName; 369} 370#endif 371 372#ifndef NDEBUG 373void dumpTextEncodingNameMap() 374{ 375 unsigned size = textEncodingNameMap->size(); 376 fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size); 377 378 std::lock_guard<std::mutex> lock(encodingRegistryMutex()); 379 380 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 381 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 382 for (; it != end; ++it) 383 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value); 384} 385#endif 386 387} // namespace WebCore 388