1/* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap@nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28#include "config.h" 29#include "TextEncoding.h" 30 31#include "TextCodec.h" 32#include "TextEncodingRegistry.h" 33#include <unicode/unorm.h> 34#include <wtf/OwnPtr.h> 35#include <wtf/StdLibExtras.h> 36#include <wtf/text/CString.h> 37#include <wtf/text/StringView.h> 38 39namespace WebCore { 40 41static const TextEncoding& UTF7Encoding() 42{ 43 static TextEncoding globalUTF7Encoding("UTF-7"); 44 return globalUTF7Encoding; 45} 46 47TextEncoding::TextEncoding(const char* name) 48 : m_name(atomicCanonicalTextEncodingName(name)) 49 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 50{ 51} 52 53TextEncoding::TextEncoding(const String& name) 54 : m_name(atomicCanonicalTextEncodingName(name)) 55 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 56{ 57} 58 59String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 60{ 61 if (!m_name) 62 return String(); 63 64 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); 65} 66 67CString TextEncoding::encode(StringView text, UnencodableHandling handling) const 68{ 69 if (!m_name) 70 return CString(); 71 72 if (text.isEmpty()) 73 return ""; 74 75 // FIXME: What's the right place to do normalization? 76 // It's a little strange to do it inside the encode function. 77 // Perhaps normalization should be an explicit step done before calling encode. 78 79 auto upconvertedCharacters = text.upconvertedCharacters(); 80 81 const UChar* source = upconvertedCharacters; 82 size_t sourceLength = text.length(); 83 84 Vector<UChar> normalizedCharacters; 85 86 UErrorCode err = U_ZERO_ERROR; 87 if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { 88 // First try using the length of the original string, since normalization to NFC rarely increases length. 89 normalizedCharacters.grow(sourceLength); 90 int32_t normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), sourceLength, &err); 91 if (err == U_BUFFER_OVERFLOW_ERROR) { 92 err = U_ZERO_ERROR; 93 normalizedCharacters.resize(normalizedLength); 94 normalizedLength = unorm_normalize(source, sourceLength, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 95 } 96 ASSERT(U_SUCCESS(err)); 97 98 source = normalizedCharacters.data(); 99 sourceLength = normalizedLength; 100 } 101 102 return newTextCodec(*this)->encode(source, sourceLength, handling); 103} 104 105const char* TextEncoding::domName() const 106{ 107 if (noExtendedTextEncodingNameUsed()) 108 return m_name; 109 110 // We treat EUC-KR as windows-949 (its superset), but need to expose 111 // the name 'EUC-KR' because the name 'windows-949' is not recognized by 112 // most Korean web servers even though they do use the encoding 113 // 'windows-949' with the name 'EUC-KR'. 114 // FIXME: This is not thread-safe. At the moment, this function is 115 // only accessed in a single thread, but eventually has to be made 116 // thread-safe along with usesVisualOrdering(). 117 static const char* const a = atomicCanonicalTextEncodingName("windows-949"); 118 if (m_name == a) 119 return "EUC-KR"; 120 return m_name; 121} 122 123bool TextEncoding::usesVisualOrdering() const 124{ 125 if (noExtendedTextEncodingNameUsed()) 126 return false; 127 128 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 129 return m_name == a; 130} 131 132bool TextEncoding::isJapanese() const 133{ 134 return isJapaneseEncoding(m_name); 135} 136 137UChar TextEncoding::backslashAsCurrencySymbol() const 138{ 139 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\'; 140} 141 142bool TextEncoding::isNonByteBasedEncoding() const 143{ 144 if (noExtendedTextEncodingNameUsed()) { 145 return *this == UTF16LittleEndianEncoding() 146 || *this == UTF16BigEndianEncoding(); 147 } 148 149 return *this == UTF16LittleEndianEncoding() 150 || *this == UTF16BigEndianEncoding() 151 || *this == UTF32BigEndianEncoding() 152 || *this == UTF32LittleEndianEncoding(); 153} 154 155bool TextEncoding::isUTF7Encoding() const 156{ 157 if (noExtendedTextEncodingNameUsed()) 158 return false; 159 160 return *this == UTF7Encoding(); 161} 162 163const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 164{ 165 if (isNonByteBasedEncoding()) 166 return UTF8Encoding(); 167 return *this; 168} 169 170// HTML5 specifies that UTF-8 be used in form submission when a form is 171// is a part of a document in UTF-16 probably because UTF-16 is not a 172// byte-based encoding and can contain 0x00. By extension, the same 173// should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 174// but it's fraught with problems and we'd rather steer clear of it. 175const TextEncoding& TextEncoding::encodingForFormSubmission() const 176{ 177 if (isNonByteBasedEncoding() || isUTF7Encoding()) 178 return UTF8Encoding(); 179 return *this; 180} 181 182const TextEncoding& ASCIIEncoding() 183{ 184 static TextEncoding globalASCIIEncoding("ASCII"); 185 return globalASCIIEncoding; 186} 187 188const TextEncoding& Latin1Encoding() 189{ 190 static TextEncoding globalLatin1Encoding("latin1"); 191 return globalLatin1Encoding; 192} 193 194const TextEncoding& UTF16BigEndianEncoding() 195{ 196 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 197 return globalUTF16BigEndianEncoding; 198} 199 200const TextEncoding& UTF16LittleEndianEncoding() 201{ 202 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 203 return globalUTF16LittleEndianEncoding; 204} 205 206const TextEncoding& UTF32BigEndianEncoding() 207{ 208 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 209 return globalUTF32BigEndianEncoding; 210} 211 212const TextEncoding& UTF32LittleEndianEncoding() 213{ 214 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 215 return globalUTF32LittleEndianEncoding; 216} 217 218const TextEncoding& UTF8Encoding() 219{ 220 static TextEncoding globalUTF8Encoding("UTF-8"); 221 ASSERT(globalUTF8Encoding.isValid()); 222 return globalUTF8Encoding; 223} 224 225const TextEncoding& WindowsLatin1Encoding() 226{ 227 static TextEncoding globalWindowsLatin1Encoding("WinLatin-1"); 228 return globalWindowsLatin1Encoding; 229} 230 231} // namespace WebCore 232