1/* 2 * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#include "config.h" 27#include "TextCodecUTF8.h" 28 29#include "TextCodecASCIIFastPath.h" 30#include <wtf/text/CString.h> 31#include <wtf/text/StringBuffer.h> 32#include <wtf/unicode/CharacterNames.h> 33 34using namespace WTF; 35using namespace WTF::Unicode; 36 37namespace WebCore { 38 39const int nonCharacter = -1; 40 41PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*) 42{ 43 return adoptPtr(new TextCodecUTF8); 44} 45 46void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar) 47{ 48 registrar("UTF-8", "UTF-8"); 49 50 // Additional aliases that originally were present in the encoding 51 // table in WebKit on Macintosh, and subsequently added by 52 // TextCodecICU. Perhaps we can prove some are not used on the web 53 // and remove them. 54 registrar("unicode11utf8", "UTF-8"); 55 registrar("unicode20utf8", "UTF-8"); 56 registrar("utf8", "UTF-8"); 57 registrar("x-unicode20utf8", "UTF-8"); 58} 59 60void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar) 61{ 62 registrar("UTF-8", create, 0); 63} 64 65static inline int nonASCIISequenceLength(uint8_t firstByte) 66{ 67 static const uint8_t lengths[256] = { 68 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 69 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 70 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 71 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 75 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 76 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 82 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 83 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 84 }; 85 return lengths[firstByte]; 86} 87 88static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length) 89{ 90 ASSERT(!isASCII(sequence[0])); 91 if (length == 2) { 92 ASSERT(sequence[0] <= 0xDF); 93 if (sequence[0] < 0xC2) 94 return nonCharacter; 95 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 96 return nonCharacter; 97 return ((sequence[0] << 6) + sequence[1]) - 0x00003080; 98 } 99 if (length == 3) { 100 ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF); 101 switch (sequence[0]) { 102 case 0xE0: 103 if (sequence[1] < 0xA0 || sequence[1] > 0xBF) 104 return nonCharacter; 105 break; 106 case 0xED: 107 if (sequence[1] < 0x80 || sequence[1] > 0x9F) 108 return nonCharacter; 109 break; 110 default: 111 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 112 return nonCharacter; 113 } 114 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 115 return nonCharacter; 116 return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080; 117 } 118 ASSERT(length == 4); 119 ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4); 120 switch (sequence[0]) { 121 case 0xF0: 122 if (sequence[1] < 0x90 || sequence[1] > 0xBF) 123 return nonCharacter; 124 break; 125 case 0xF4: 126 if (sequence[1] < 0x80 || sequence[1] > 0x8F) 127 return nonCharacter; 128 break; 129 default: 130 if (sequence[1] < 0x80 || sequence[1] > 0xBF) 131 return nonCharacter; 132 } 133 if (sequence[2] < 0x80 || sequence[2] > 0xBF) 134 return nonCharacter; 135 if (sequence[3] < 0x80 || sequence[3] > 0xBF) 136 return nonCharacter; 137 return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080; 138} 139 140static inline UChar* appendCharacter(UChar* destination, int character) 141{ 142 ASSERT(character != nonCharacter); 143 ASSERT(!U_IS_SURROGATE(character)); 144 if (U_IS_BMP(character)) 145 *destination++ = character; 146 else { 147 *destination++ = U16_LEAD(character); 148 *destination++ = U16_TRAIL(character); 149 } 150 return destination; 151} 152 153void TextCodecUTF8::consumePartialSequenceByte() 154{ 155 --m_partialSequenceSize; 156 memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize); 157} 158 159void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError) 160{ 161 sawError = true; 162 if (stopOnError) 163 return; 164 // Each error generates a replacement character and consumes one byte. 165 *destination++ = replacementCharacter; 166 consumePartialSequenceByte(); 167} 168 169template <> 170bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&) 171{ 172 ASSERT(m_partialSequenceSize); 173 do { 174 if (isASCII(m_partialSequence[0])) { 175 *destination++ = m_partialSequence[0]; 176 consumePartialSequenceByte(); 177 continue; 178 } 179 int count = nonASCIISequenceLength(m_partialSequence[0]); 180 if (!count) 181 return true; 182 183 if (count > m_partialSequenceSize) { 184 if (count - m_partialSequenceSize > end - source) { 185 if (!flush) { 186 // The new data is not enough to complete the sequence, so 187 // add it to the existing partial sequence. 188 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 189 m_partialSequenceSize += end - source; 190 return false; 191 } 192 // An incomplete partial sequence at the end is an error, but it will create 193 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle 194 // the error. 195 return true; 196 } 197 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 198 source += count - m_partialSequenceSize; 199 m_partialSequenceSize = count; 200 } 201 int character = decodeNonASCIISequence(m_partialSequence, count); 202 if ((character == nonCharacter) || (character > 0xff)) 203 return true; 204 205 m_partialSequenceSize -= count; 206 *destination++ = character; 207 } while (m_partialSequenceSize); 208 209 return false; 210} 211 212template <> 213bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError) 214{ 215 ASSERT(m_partialSequenceSize); 216 do { 217 if (isASCII(m_partialSequence[0])) { 218 *destination++ = m_partialSequence[0]; 219 consumePartialSequenceByte(); 220 continue; 221 } 222 int count = nonASCIISequenceLength(m_partialSequence[0]); 223 if (!count) { 224 handleError(destination, stopOnError, sawError); 225 if (stopOnError) 226 return false; 227 continue; 228 } 229 if (count > m_partialSequenceSize) { 230 if (count - m_partialSequenceSize > end - source) { 231 if (!flush) { 232 // The new data is not enough to complete the sequence, so 233 // add it to the existing partial sequence. 234 memcpy(m_partialSequence + m_partialSequenceSize, source, end - source); 235 m_partialSequenceSize += end - source; 236 return false; 237 } 238 // An incomplete partial sequence at the end is an error. 239 handleError(destination, stopOnError, sawError); 240 if (stopOnError) 241 return false; 242 continue; 243 } 244 memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize); 245 source += count - m_partialSequenceSize; 246 m_partialSequenceSize = count; 247 } 248 int character = decodeNonASCIISequence(m_partialSequence, count); 249 if (character == nonCharacter) { 250 handleError(destination, stopOnError, sawError); 251 if (stopOnError) 252 return false; 253 continue; 254 } 255 256 m_partialSequenceSize -= count; 257 destination = appendCharacter(destination, character); 258 } while (m_partialSequenceSize); 259 260 return false; 261} 262 263String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 264{ 265 // Each input byte might turn into a character. 266 // That includes all bytes in the partial-sequence buffer because 267 // each byte in an invalid sequence will turn into a replacement character. 268 StringBuffer<LChar> buffer(m_partialSequenceSize + length); 269 270 const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes); 271 const uint8_t* end = source + length; 272 const uint8_t* alignedEnd = alignToMachineWord(end); 273 LChar* destination = buffer.characters(); 274 275 do { 276 if (m_partialSequenceSize) { 277 // Explicitly copy destination and source pointers to avoid taking pointers to the 278 // local variables, which may harm code generation by disabling some optimizations 279 // in some compilers. 280 LChar* destinationForHandlePartialSequence = destination; 281 const uint8_t* sourceForHandlePartialSequence = source; 282 if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) { 283 source = sourceForHandlePartialSequence; 284 goto upConvertTo16Bit; 285 } 286 destination = destinationForHandlePartialSequence; 287 source = sourceForHandlePartialSequence; 288 if (m_partialSequenceSize) 289 break; 290 } 291 292 while (source < end) { 293 if (isASCII(*source)) { 294 // Fast path for ASCII. Most UTF-8 text will be ASCII. 295 if (isAlignedToMachineWord(source)) { 296 while (source < alignedEnd) { 297 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 298 if (!isAllASCII<LChar>(chunk)) 299 break; 300 copyASCIIMachineWord(destination, source); 301 source += sizeof(MachineWord); 302 destination += sizeof(MachineWord); 303 } 304 if (source == end) 305 break; 306 if (!isASCII(*source)) 307 continue; 308 } 309 *destination++ = *source++; 310 continue; 311 } 312 int count = nonASCIISequenceLength(*source); 313 int character; 314 if (!count) 315 character = nonCharacter; 316 else { 317 if (count > end - source) { 318 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 319 ASSERT(!m_partialSequenceSize); 320 m_partialSequenceSize = end - source; 321 memcpy(m_partialSequence, source, m_partialSequenceSize); 322 source = end; 323 break; 324 } 325 character = decodeNonASCIISequence(source, count); 326 } 327 if (character == nonCharacter) { 328 sawError = true; 329 if (stopOnError) 330 break; 331 332 goto upConvertTo16Bit; 333 } 334 if (character > 0xff) 335 goto upConvertTo16Bit; 336 337 source += count; 338 *destination++ = character; 339 } 340 } while (flush && m_partialSequenceSize); 341 342 buffer.shrink(destination - buffer.characters()); 343 344 return String::adopt(buffer); 345 346upConvertTo16Bit: 347 StringBuffer<UChar> buffer16(m_partialSequenceSize + length); 348 349 UChar* destination16 = buffer16.characters(); 350 351 // Copy the already converted characters 352 for (LChar* converted8 = buffer.characters(); converted8 < destination;) 353 *destination16++ = *converted8++; 354 355 do { 356 if (m_partialSequenceSize) { 357 // Explicitly copy destination and source pointers to avoid taking pointers to the 358 // local variables, which may harm code generation by disabling some optimizations 359 // in some compilers. 360 UChar* destinationForHandlePartialSequence = destination16; 361 const uint8_t* sourceForHandlePartialSequence = source; 362 handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError); 363 destination16 = destinationForHandlePartialSequence; 364 source = sourceForHandlePartialSequence; 365 if (m_partialSequenceSize) 366 break; 367 } 368 369 while (source < end) { 370 if (isASCII(*source)) { 371 // Fast path for ASCII. Most UTF-8 text will be ASCII. 372 if (isAlignedToMachineWord(source)) { 373 while (source < alignedEnd) { 374 MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source); 375 if (!isAllASCII<LChar>(chunk)) 376 break; 377 copyASCIIMachineWord(destination16, source); 378 source += sizeof(MachineWord); 379 destination16 += sizeof(MachineWord); 380 } 381 if (source == end) 382 break; 383 if (!isASCII(*source)) 384 continue; 385 } 386 *destination16++ = *source++; 387 continue; 388 } 389 int count = nonASCIISequenceLength(*source); 390 int character; 391 if (!count) 392 character = nonCharacter; 393 else { 394 if (count > end - source) { 395 ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence))); 396 ASSERT(!m_partialSequenceSize); 397 m_partialSequenceSize = end - source; 398 memcpy(m_partialSequence, source, m_partialSequenceSize); 399 source = end; 400 break; 401 } 402 character = decodeNonASCIISequence(source, count); 403 } 404 if (character == nonCharacter) { 405 sawError = true; 406 if (stopOnError) 407 break; 408 // Each error generates a replacement character and consumes one byte. 409 *destination16++ = replacementCharacter; 410 ++source; 411 continue; 412 } 413 source += count; 414 destination16 = appendCharacter(destination16, character); 415 } 416 } while (flush && m_partialSequenceSize); 417 418 buffer16.shrink(destination16 - buffer16.characters()); 419 420 return String::adopt(buffer16); 421} 422 423CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling) 424{ 425 // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3. 426 // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x). 427 // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x). 428 if (length > std::numeric_limits<size_t>::max() / 3) 429 CRASH(); 430 Vector<uint8_t> bytes(length * 3); 431 432 size_t i = 0; 433 size_t bytesWritten = 0; 434 while (i < length) { 435 UChar32 character; 436 U16_NEXT(characters, i, length, character); 437 U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character); 438 } 439 440 return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten); 441} 442 443} // namespace WebCore 444