1/* 2 * Copyright (C) 1999 Lars Knoll (knoll@kde.org) 3 * (C) 1999 Antti Koivisto (koivisto@kde.org) 4 * (C) 2001 Dirk Mueller ( mueller@kde.org ) 5 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2013-2014 Apple Inc. All rights reserved. 6 * Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net) 7 * 8 * This library is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Library General Public 10 * License as published by the Free Software Foundation; either 11 * version 2 of the License, or (at your option) any later version. 12 * 13 * This library is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Library General Public License for more details. 17 * 18 * You should have received a copy of the GNU Library General Public License 19 * along with this library; see the file COPYING.LIB. If not, write to 20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 * Boston, MA 02110-1301, USA. 22 * 23 */ 24 25#include "config.h" 26#include "StringImpl.h" 27 28#include "AtomicString.h" 29#include "StringBuffer.h" 30#include "StringHash.h" 31#include <wtf/ProcessID.h> 32#include <wtf/StdLibExtras.h> 33#include <wtf/WTFThreadData.h> 34#include <wtf/text/CString.h> 35#include <wtf/text/StringView.h> 36#include <wtf/unicode/CharacterNames.h> 37#include <wtf/unicode/UTF8.h> 38 39#ifdef STRING_STATS 40#include <unistd.h> 41#include <wtf/DataLog.h> 42#endif 43 44namespace WTF { 45 46using namespace Unicode; 47 48COMPILE_ASSERT(sizeof(StringImpl) == 2 * sizeof(int) + 2 * sizeof(void*), StringImpl_should_stay_small); 49 50#ifdef STRING_STATS 51StringStats StringImpl::m_stringStats; 52 53unsigned StringStats::s_stringRemovesTillPrintStats = StringStats::s_printStringStatsFrequency; 54 55void StringStats::removeString(StringImpl* string) 56{ 57 unsigned length = string->length(); 58 bool isSubString = string->isSubString(); 59 60 --m_totalNumberStrings; 61 62 if (string->has16BitShadow()) { 63 --m_numberUpconvertedStrings; 64 if (!isSubString) 65 m_totalUpconvertedData -= length; 66 } 67 68 if (string->is8Bit()) { 69 --m_number8BitStrings; 70 if (!isSubString) 71 m_total8BitData -= length; 72 } else { 73 --m_number16BitStrings; 74 if (!isSubString) 75 m_total16BitData -= length; 76 } 77 78 if (!--s_stringRemovesTillPrintStats) { 79 s_stringRemovesTillPrintStats = s_printStringStatsFrequency; 80 printStats(); 81 } 82} 83 84void StringStats::printStats() 85{ 86 dataLogF("String stats for process id %d:\n", getCurrentProcessID()); 87 88 unsigned long long totalNumberCharacters = m_total8BitData + m_total16BitData; 89 double percent8Bit = m_totalNumberStrings ? ((double)m_number8BitStrings * 100) / (double)m_totalNumberStrings : 0.0; 90 double average8bitLength = m_number8BitStrings ? (double)m_total8BitData / (double)m_number8BitStrings : 0.0; 91 dataLogF("%8u (%5.2f%%) 8 bit %12llu chars %12llu bytes avg length %6.1f\n", m_number8BitStrings, percent8Bit, m_total8BitData, m_total8BitData, average8bitLength); 92 93 double percent16Bit = m_totalNumberStrings ? ((double)m_number16BitStrings * 100) / (double)m_totalNumberStrings : 0.0; 94 double average16bitLength = m_number16BitStrings ? (double)m_total16BitData / (double)m_number16BitStrings : 0.0; 95 dataLogF("%8u (%5.2f%%) 16 bit %12llu chars %12llu bytes avg length %6.1f\n", m_number16BitStrings, percent16Bit, m_total16BitData, m_total16BitData * 2, average16bitLength); 96 97 double percentUpconverted = m_totalNumberStrings ? ((double)m_numberUpconvertedStrings * 100) / (double)m_number8BitStrings : 0.0; 98 double averageUpconvertedLength = m_numberUpconvertedStrings ? (double)m_totalUpconvertedData / (double)m_numberUpconvertedStrings : 0.0; 99 dataLogF("%8u (%5.2f%%) upconverted %12llu chars %12llu bytes avg length %6.1f\n", m_numberUpconvertedStrings, percentUpconverted, m_totalUpconvertedData, m_totalUpconvertedData * 2, averageUpconvertedLength); 100 101 double averageLength = m_totalNumberStrings ? (double)totalNumberCharacters / (double)m_totalNumberStrings : 0.0; 102 unsigned long long totalDataBytes = m_total8BitData + (m_total16BitData + m_totalUpconvertedData) * 2; 103 dataLogF("%8u Total %12llu chars %12llu bytes avg length %6.1f\n", m_totalNumberStrings, totalNumberCharacters, totalDataBytes, averageLength); 104 unsigned long long totalSavedBytes = m_total8BitData - m_totalUpconvertedData; 105 double percentSavings = totalSavedBytes ? ((double)totalSavedBytes * 100) / (double)(totalDataBytes + totalSavedBytes) : 0.0; 106 dataLogF(" Total savings %12llu bytes (%5.2f%%)\n", totalSavedBytes, percentSavings); 107} 108#endif 109 110 111StringImpl::~StringImpl() 112{ 113 ASSERT(!isStatic()); 114 115 STRING_STATS_REMOVE_STRING(this); 116 117 if (isAtomic() && m_length) 118 AtomicString::remove(this); 119 120 BufferOwnership ownership = bufferOwnership(); 121 122 if (ownership == BufferInternal) 123 return; 124 if (ownership == BufferOwned) { 125 // We use m_data8, but since it is a union with m_data16 this works either way. 126 ASSERT(m_data8); 127 fastFree(const_cast<LChar*>(m_data8)); 128 return; 129 } 130 131 ASSERT(ownership == BufferSubstring); 132 ASSERT(substringBuffer()); 133 substringBuffer()->deref(); 134} 135 136void StringImpl::destroy(StringImpl* stringImpl) 137{ 138 stringImpl->~StringImpl(); 139 fastFree(stringImpl); 140} 141 142PassRef<StringImpl> StringImpl::createFromLiteral(const char* characters, unsigned length) 143{ 144 ASSERT_WITH_MESSAGE(length, "Use StringImpl::empty() to create an empty string"); 145 ASSERT(charactersAreAllASCII<LChar>(reinterpret_cast<const LChar*>(characters), length)); 146 return adoptRef(*new StringImpl(reinterpret_cast<const LChar*>(characters), length, ConstructWithoutCopying)); 147} 148 149PassRef<StringImpl> StringImpl::createFromLiteral(const char* characters) 150{ 151 return createFromLiteral(characters, strlen(characters)); 152} 153 154PassRef<StringImpl> StringImpl::createWithoutCopying(const UChar* characters, unsigned length) 155{ 156 if (!length) 157 return *empty(); 158 159 return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying)); 160} 161 162PassRef<StringImpl> StringImpl::createWithoutCopying(const LChar* characters, unsigned length) 163{ 164 if (!length) 165 return *empty(); 166 167 return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying)); 168} 169 170template <typename CharType> 171inline PassRef<StringImpl> StringImpl::createUninitializedInternal(unsigned length, CharType*& data) 172{ 173 if (!length) { 174 data = 0; 175 return *empty(); 176 } 177 return createUninitializedInternalNonEmpty(length, data); 178} 179 180template <typename CharType> 181inline PassRef<StringImpl> StringImpl::createUninitializedInternalNonEmpty(unsigned length, CharType*& data) 182{ 183 ASSERT(length); 184 185 // Allocate a single buffer large enough to contain the StringImpl 186 // struct as well as the data which it contains. This removes one 187 // heap allocation from this call. 188 if (length > ((std::numeric_limits<unsigned>::max() - sizeof(StringImpl)) / sizeof(CharType))) 189 CRASH(); 190 StringImpl* string = static_cast<StringImpl*>(fastMalloc(allocationSize<CharType>(length))); 191 192 data = string->tailPointer<CharType>(); 193 return constructInternal<CharType>(string, length); 194} 195 196PassRef<StringImpl> StringImpl::createUninitialized(unsigned length, LChar*& data) 197{ 198 return createUninitializedInternal(length, data); 199} 200 201PassRef<StringImpl> StringImpl::createUninitialized(unsigned length, UChar*& data) 202{ 203 return createUninitializedInternal(length, data); 204} 205 206template <typename CharType> 207inline PassRef<StringImpl> StringImpl::reallocateInternal(PassRefPtr<StringImpl> originalString, unsigned length, CharType*& data) 208{ 209 ASSERT(originalString->hasOneRef()); 210 ASSERT(originalString->bufferOwnership() == BufferInternal); 211 212 if (!length) { 213 data = 0; 214 return *empty(); 215 } 216 217 // Same as createUninitialized() except here we use fastRealloc. 218 if (length > ((std::numeric_limits<unsigned>::max() - sizeof(StringImpl)) / sizeof(CharType))) 219 CRASH(); 220 221 originalString->~StringImpl(); 222 StringImpl* string = static_cast<StringImpl*>(fastRealloc(originalString.leakRef(), allocationSize<CharType>(length))); 223 224 data = string->tailPointer<CharType>(); 225 return constructInternal<CharType>(string, length); 226} 227 228PassRef<StringImpl> StringImpl::reallocate(PassRefPtr<StringImpl> originalString, unsigned length, LChar*& data) 229{ 230 ASSERT(originalString->is8Bit()); 231 return reallocateInternal(originalString, length, data); 232} 233 234PassRef<StringImpl> StringImpl::reallocate(PassRefPtr<StringImpl> originalString, unsigned length, UChar*& data) 235{ 236 ASSERT(!originalString->is8Bit()); 237 return reallocateInternal(originalString, length, data); 238} 239 240template <typename CharType> 241inline PassRef<StringImpl> StringImpl::createInternal(const CharType* characters, unsigned length) 242{ 243 if (!characters || !length) 244 return *empty(); 245 246 CharType* data; 247 auto string = createUninitializedInternalNonEmpty(length, data); 248 memcpy(data, characters, length * sizeof(CharType)); 249 return string; 250} 251 252PassRef<StringImpl> StringImpl::create(const UChar* characters, unsigned length) 253{ 254 return createInternal(characters, length); 255} 256 257PassRef<StringImpl> StringImpl::create(const LChar* characters, unsigned length) 258{ 259 return createInternal(characters, length); 260} 261 262PassRef<StringImpl> StringImpl::create8BitIfPossible(const UChar* characters, unsigned length) 263{ 264 if (!characters || !length) 265 return *empty(); 266 267 LChar* data; 268 RefPtr<StringImpl> string = createUninitializedInternalNonEmpty(length, data); 269 270 for (size_t i = 0; i < length; ++i) { 271 if (characters[i] & 0xff00) 272 return create(characters, length); 273 data[i] = static_cast<LChar>(characters[i]); 274 } 275 276 return string.releaseNonNull(); 277} 278 279PassRef<StringImpl> StringImpl::create8BitIfPossible(const UChar* string) 280{ 281 return StringImpl::create8BitIfPossible(string, lengthOfNullTerminatedString(string)); 282} 283 284PassRef<StringImpl> StringImpl::create(const LChar* string) 285{ 286 if (!string) 287 return *empty(); 288 size_t length = strlen(reinterpret_cast<const char*>(string)); 289 if (length > std::numeric_limits<unsigned>::max()) 290 CRASH(); 291 return create(string, length); 292} 293 294bool StringImpl::containsOnlyWhitespace() 295{ 296 // FIXME: The definition of whitespace here includes a number of characters 297 // that are not whitespace from the point of view of RenderText; I wonder if 298 // that's a problem in practice. 299 if (is8Bit()) { 300 for (unsigned i = 0; i < m_length; ++i) { 301 UChar c = m_data8[i]; 302 if (!isASCIISpace(c)) 303 return false; 304 } 305 306 return true; 307 } 308 309 for (unsigned i = 0; i < m_length; ++i) { 310 UChar c = m_data16[i]; 311 if (!isASCIISpace(c)) 312 return false; 313 } 314 return true; 315} 316 317PassRef<StringImpl> StringImpl::substring(unsigned start, unsigned length) 318{ 319 if (start >= m_length) 320 return *empty(); 321 unsigned maxLength = m_length - start; 322 if (length >= maxLength) { 323 if (!start) 324 return *this; 325 length = maxLength; 326 } 327 if (is8Bit()) 328 return create(m_data8 + start, length); 329 330 return create(m_data16 + start, length); 331} 332 333UChar32 StringImpl::characterStartingAt(unsigned i) 334{ 335 if (is8Bit()) 336 return m_data8[i]; 337 if (U16_IS_SINGLE(m_data16[i])) 338 return m_data16[i]; 339 if (i + 1 < m_length && U16_IS_LEAD(m_data16[i]) && U16_IS_TRAIL(m_data16[i + 1])) 340 return U16_GET_SUPPLEMENTARY(m_data16[i], m_data16[i + 1]); 341 return 0; 342} 343 344PassRef<StringImpl> StringImpl::lower() 345{ 346 // Note: This is a hot function in the Dromaeo benchmark, specifically the 347 // no-op code path up through the first 'return' statement. 348 349 // First scan the string for uppercase and non-ASCII characters: 350 if (is8Bit()) { 351 unsigned failingIndex; 352 for (unsigned i = 0; i < m_length; ++i) { 353 LChar character = m_data8[i]; 354 if (UNLIKELY((character & ~0x7F) || isASCIIUpper(character))) { 355 failingIndex = i; 356 goto SlowPath; 357 } 358 } 359 return *this; 360 361SlowPath: 362 LChar* data8; 363 auto newImpl = createUninitializedInternalNonEmpty(m_length, data8); 364 365 for (unsigned i = 0; i < failingIndex; ++i) 366 data8[i] = m_data8[i]; 367 368 for (unsigned i = failingIndex; i < m_length; ++i) { 369 LChar character = m_data8[i]; 370 if (!(character & ~0x7F)) 371 data8[i] = toASCIILower(character); 372 else { 373 ASSERT(u_tolower(character) <= 0xFF); 374 data8[i] = static_cast<LChar>(u_tolower(character)); 375 } 376 } 377 378 return newImpl; 379 } 380 bool noUpper = true; 381 unsigned ored = 0; 382 383 for (unsigned i = 0; i < m_length; ++i) { 384 UChar character = m_data16[i]; 385 if (UNLIKELY(isASCIIUpper(character))) 386 noUpper = false; 387 ored |= character; 388 } 389 // Nothing to do if the string is all ASCII with no uppercase. 390 if (noUpper && !(ored & ~0x7F)) 391 return *this; 392 393 if (!(ored & ~0x7F)) { 394 UChar* data16; 395 auto newImpl = createUninitializedInternalNonEmpty(m_length, data16); 396 397 for (unsigned i = 0; i < m_length; ++i) { 398 UChar c = m_data16[i]; 399 data16[i] = toASCIILower(c); 400 } 401 return newImpl; 402 } 403 404 if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) 405 CRASH(); 406 int32_t length = m_length; 407 408 // Do a slower implementation for cases that include non-ASCII characters. 409 UChar* data16; 410 RefPtr<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data16); 411 412 UErrorCode status = U_ZERO_ERROR; 413 int32_t realLength = u_strToLower(data16, length, m_data16, m_length, "", &status); 414 if (U_SUCCESS(status) && realLength == length) 415 return newImpl.releaseNonNull(); 416 417 newImpl = createUninitialized(realLength, data16); 418 status = U_ZERO_ERROR; 419 u_strToLower(data16, realLength, m_data16, m_length, "", &status); 420 if (U_FAILURE(status)) 421 return *this; 422 return newImpl.releaseNonNull(); 423} 424 425PassRef<StringImpl> StringImpl::upper() 426{ 427 // This function could be optimized for no-op cases the way lower() is, 428 // but in empirical testing, few actual calls to upper() are no-ops, so 429 // it wouldn't be worth the extra time for pre-scanning. 430 431 if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) 432 CRASH(); 433 int32_t length = m_length; 434 435 if (is8Bit()) { 436 LChar* data8; 437 RefPtr<StringImpl> newImpl = createUninitialized(m_length, data8); 438 439 // Do a faster loop for the case where all the characters are ASCII. 440 unsigned ored = 0; 441 for (int i = 0; i < length; ++i) { 442 LChar c = m_data8[i]; 443 ored |= c; 444#if CPU(X86) && defined(_MSC_VER) && _MSC_VER >=1700 445 // Workaround for an MSVC 2012 x86 optimizer bug. Remove once the bug is fixed. 446 // See https://connect.microsoft.com/VisualStudio/feedback/details/780362/optimization-bug-of-range-comparison 447 // for more details. 448 data8[i] = c >= 'a' && c <= 'z' ? c & ~0x20 : c; 449#else 450 data8[i] = toASCIIUpper(c); 451#endif 452 } 453 if (!(ored & ~0x7F)) 454 return newImpl.releaseNonNull(); 455 456 // Do a slower implementation for cases that include non-ASCII Latin-1 characters. 457 int numberSharpSCharacters = 0; 458 459 // There are two special cases. 460 // 1. latin-1 characters when converted to upper case are 16 bit characters. 461 // 2. Lower case sharp-S converts to "SS" (two characters) 462 for (int32_t i = 0; i < length; ++i) { 463 LChar c = m_data8[i]; 464 if (UNLIKELY(c == smallLetterSharpS)) 465 ++numberSharpSCharacters; 466 ASSERT(u_toupper(c) <= 0xFFFF); 467 UChar upper = u_toupper(c); 468 if (UNLIKELY(upper > 0xff)) { 469 // Since this upper-cased character does not fit in an 8-bit string, we need to take the 16-bit path. 470 goto upconvert; 471 } 472 data8[i] = static_cast<LChar>(upper); 473 } 474 475 if (!numberSharpSCharacters) 476 return newImpl.releaseNonNull(); 477 478 // We have numberSSCharacters sharp-s characters, but none of the other special characters. 479 newImpl = createUninitialized(m_length + numberSharpSCharacters, data8); 480 481 LChar* dest = data8; 482 483 for (int32_t i = 0; i < length; ++i) { 484 LChar c = m_data8[i]; 485 if (c == smallLetterSharpS) { 486 *dest++ = 'S'; 487 *dest++ = 'S'; 488 } else { 489 ASSERT(u_toupper(c) <= 0xFF); 490 *dest++ = static_cast<LChar>(u_toupper(c)); 491 } 492 } 493 494 return newImpl.releaseNonNull(); 495 } 496 497upconvert: 498 auto upconvertedCharacters = StringView(*this).upconvertedCharacters(); 499 const UChar* source16 = upconvertedCharacters; 500 501 UChar* data16; 502 RefPtr<StringImpl> newImpl = createUninitialized(m_length, data16); 503 504 // Do a faster loop for the case where all the characters are ASCII. 505 unsigned ored = 0; 506 for (int i = 0; i < length; ++i) { 507 UChar c = source16[i]; 508 ored |= c; 509 data16[i] = toASCIIUpper(c); 510 } 511 if (!(ored & ~0x7F)) 512 return newImpl.releaseNonNull(); 513 514 // Do a slower implementation for cases that include non-ASCII characters. 515 UErrorCode status = U_ZERO_ERROR; 516 int32_t realLength = u_strToUpper(data16, length, source16, m_length, "", &status); 517 if (U_SUCCESS(status) && realLength == length) 518 return newImpl.releaseNonNull(); 519 newImpl = createUninitialized(realLength, data16); 520 status = U_ZERO_ERROR; 521 u_strToUpper(data16, realLength, source16, m_length, "", &status); 522 if (U_FAILURE(status)) 523 return *this; 524 return newImpl.releaseNonNull(); 525} 526 527static inline bool needsTurkishCasingRules(const AtomicString& localeIdentifier) 528{ 529 // Either "tr" or "az" locale, with case sensitive comparison and allowing for an ignored subtag. 530 UChar first = localeIdentifier[0]; 531 UChar second = localeIdentifier[1]; 532 return ((isASCIIAlphaCaselessEqual(first, 't') && isASCIIAlphaCaselessEqual(second, 'r')) 533 || (isASCIIAlphaCaselessEqual(first, 'a') && isASCIIAlphaCaselessEqual(second, 'z'))) 534 && (localeIdentifier.length() == 2 || localeIdentifier[2] == '-'); 535} 536 537PassRef<StringImpl> StringImpl::lower(const AtomicString& localeIdentifier) 538{ 539 // Use the more-optimized code path most of the time. 540 // Assuming here that the only locale-specific lowercasing is the Turkish casing rules. 541 // FIXME: Could possibly optimize further by looking for the specific sequences 542 // that have locale-specific lowercasing. There are only three of them. 543 if (!needsTurkishCasingRules(localeIdentifier)) 544 return lower(); 545 546 // FIXME: Could share more code with the main StringImpl::lower by factoring out 547 // this last part into a shared function that takes a locale string, since this is 548 // just like the end of that function. 549 550 if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) 551 CRASH(); 552 int length = m_length; 553 554 // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than 555 // allocating memory just to turn localeIdentifier into a C string, and we assume 556 // there is no difference between the uppercasing for "tr" and "az" locales. 557 auto upconvertedCharacters = StringView(*this).upconvertedCharacters(); 558 const UChar* source16 = upconvertedCharacters; 559 UChar* data16; 560 RefPtr<StringImpl> newString = createUninitialized(length, data16); 561 UErrorCode status = U_ZERO_ERROR; 562 int realLength = u_strToLower(data16, length, source16, length, "tr", &status); 563 if (U_SUCCESS(status) && realLength == length) 564 return newString.releaseNonNull(); 565 newString = createUninitialized(realLength, data16); 566 status = U_ZERO_ERROR; 567 u_strToLower(data16, realLength, source16, length, "tr", &status); 568 if (U_FAILURE(status)) 569 return *this; 570 return newString.releaseNonNull(); 571} 572 573PassRef<StringImpl> StringImpl::upper(const AtomicString& localeIdentifier) 574{ 575 // Use the more-optimized code path most of the time. 576 // Assuming here that the only locale-specific lowercasing is the Turkish casing rules, 577 // and that the only affected character is lowercase "i". 578 if (!needsTurkishCasingRules(localeIdentifier) || find('i') == notFound) 579 return upper(); 580 581 if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) 582 CRASH(); 583 int length = m_length; 584 585 // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than 586 // allocating memory just to turn localeIdentifier into a C string, and we assume 587 // there is no difference between the uppercasing for "tr" and "az" locales. 588 auto upconvertedCharacters = StringView(*this).upconvertedCharacters(); 589 const UChar* source16 = upconvertedCharacters; 590 UChar* data16; 591 RefPtr<StringImpl> newString = createUninitialized(length, data16); 592 UErrorCode status = U_ZERO_ERROR; 593 int realLength = u_strToUpper(data16, length, source16, length, "tr", &status); 594 if (U_SUCCESS(status) && realLength == length) 595 return newString.releaseNonNull(); 596 newString = createUninitialized(realLength, data16); 597 status = U_ZERO_ERROR; 598 u_strToUpper(data16, realLength, source16, length, "tr", &status); 599 if (U_FAILURE(status)) 600 return *this; 601 return newString.releaseNonNull(); 602} 603 604PassRef<StringImpl> StringImpl::fill(UChar character) 605{ 606 if (!(character & ~0x7F)) { 607 LChar* data; 608 auto newImpl = createUninitialized(m_length, data); 609 for (unsigned i = 0; i < m_length; ++i) 610 data[i] = character; 611 return newImpl; 612 } 613 UChar* data; 614 auto newImpl = createUninitialized(m_length, data); 615 for (unsigned i = 0; i < m_length; ++i) 616 data[i] = character; 617 return newImpl; 618} 619 620PassRef<StringImpl> StringImpl::foldCase() 621{ 622 // FIXME: Why doesn't this function have a preflight like the one in StringImpl::lower? 623 624 if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max())) 625 CRASH(); 626 int32_t length = m_length; 627 628 if (is8Bit()) { 629 // Do a faster loop for the case where all the characters are ASCII. 630 LChar* data; 631 auto newImpl = createUninitialized(m_length, data); 632 LChar ored = 0; 633 634 for (int32_t i = 0; i < length; ++i) { 635 LChar c = m_data8[i]; 636 data[i] = toASCIILower(c); 637 ored |= c; 638 } 639 640 if (!(ored & ~0x7F)) 641 return newImpl; 642 643 // Do a slower implementation for cases that include non-ASCII Latin-1 characters. 644 // FIXME: Shouldn't this use u_foldCase instead of u_tolower? 645 for (int32_t i = 0; i < length; ++i) { 646 ASSERT(u_tolower(m_data8[i]) <= 0xFF); 647 data[i] = static_cast<LChar>(u_tolower(m_data8[i])); 648 } 649 650 return newImpl; 651 } 652 653 // Do a faster loop for the case where all the characters are ASCII. 654 UChar* data; 655 RefPtr<StringImpl> newImpl = createUninitialized(m_length, data); 656 UChar ored = 0; 657 for (int32_t i = 0; i < length; ++i) { 658 UChar c = m_data16[i]; 659 ored |= c; 660 data[i] = toASCIILower(c); 661 } 662 if (!(ored & ~0x7F)) 663 return newImpl.releaseNonNull(); 664 665 // Do a slower implementation for cases that include non-ASCII characters. 666 UErrorCode status = U_ZERO_ERROR; 667 int32_t realLength = u_strFoldCase(data, length, m_data16, m_length, U_FOLD_CASE_DEFAULT, &status); 668 if (U_SUCCESS(status) && realLength == length) 669 return newImpl.releaseNonNull(); 670 newImpl = createUninitialized(realLength, data); 671 status = U_ZERO_ERROR; 672 u_strFoldCase(data, realLength, m_data16, m_length, U_FOLD_CASE_DEFAULT, &status); 673 if (U_FAILURE(status)) 674 return *this; 675 return newImpl.releaseNonNull(); 676} 677 678PassRef<StringImpl> StringImpl::convertToASCIILowercase() 679{ 680 if (is8Bit()) { 681 unsigned failingIndex; 682 for (unsigned i = 0; i < m_length; ++i) { 683 LChar character = m_data8[i]; 684 if (UNLIKELY(isASCIIUpper(character))) { 685 failingIndex = i; 686 goto SlowPath; 687 } 688 } 689 return *this; 690 691SlowPath: 692 LChar* data8; 693 PassRef<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data8); 694 for (unsigned i = 0; i < failingIndex; ++i) 695 data8[i] = m_data8[i]; 696 for (unsigned i = failingIndex; i < m_length; ++i) 697 data8[i] = toASCIILower(m_data8[i]); 698 return newImpl; 699 } 700 701 bool noUpper = true; 702 for (unsigned i = 0; i < m_length; ++i) { 703 if (UNLIKELY(isASCIIUpper(m_data16[i]))) 704 noUpper = false; 705 } 706 if (noUpper) 707 return *this; 708 709 UChar* data16; 710 PassRef<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data16); 711 for (unsigned i = 0; i < m_length; ++i) 712 data16[i] = toASCIILower(m_data16[i]); 713 return newImpl; 714} 715 716template <class UCharPredicate> 717inline PassRef<StringImpl> StringImpl::stripMatchedCharacters(UCharPredicate predicate) 718{ 719 if (!m_length) 720 return *this; 721 722 unsigned start = 0; 723 unsigned end = m_length - 1; 724 725 // skip white space from start 726 while (start <= end && predicate(is8Bit() ? m_data8[start] : m_data16[start])) 727 ++start; 728 729 // only white space 730 if (start > end) 731 return *empty(); 732 733 // skip white space from end 734 while (end && predicate(is8Bit() ? m_data8[end] : m_data16[end])) 735 --end; 736 737 if (!start && end == m_length - 1) 738 return *this; 739 if (is8Bit()) 740 return create(m_data8 + start, end + 1 - start); 741 return create(m_data16 + start, end + 1 - start); 742} 743 744class UCharPredicate { 745public: 746 inline UCharPredicate(CharacterMatchFunctionPtr function): m_function(function) { } 747 748 inline bool operator()(UChar ch) const 749 { 750 return m_function(ch); 751 } 752 753private: 754 const CharacterMatchFunctionPtr m_function; 755}; 756 757class SpaceOrNewlinePredicate { 758public: 759 inline bool operator()(UChar ch) const 760 { 761 return isSpaceOrNewline(ch); 762 } 763}; 764 765PassRef<StringImpl> StringImpl::stripWhiteSpace() 766{ 767 return stripMatchedCharacters(SpaceOrNewlinePredicate()); 768} 769 770PassRef<StringImpl> StringImpl::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) 771{ 772 return stripMatchedCharacters(UCharPredicate(isWhiteSpace)); 773} 774 775template <typename CharType> 776ALWAYS_INLINE PassRef<StringImpl> StringImpl::removeCharacters(const CharType* characters, CharacterMatchFunctionPtr findMatch) 777{ 778 const CharType* from = characters; 779 const CharType* fromend = from + m_length; 780 781 // Assume the common case will not remove any characters 782 while (from != fromend && !findMatch(*from)) 783 ++from; 784 if (from == fromend) 785 return *this; 786 787 StringBuffer<CharType> data(m_length); 788 CharType* to = data.characters(); 789 unsigned outc = from - characters; 790 791 if (outc) 792 memcpy(to, characters, outc * sizeof(CharType)); 793 794 while (true) { 795 while (from != fromend && findMatch(*from)) 796 ++from; 797 while (from != fromend && !findMatch(*from)) 798 to[outc++] = *from++; 799 if (from == fromend) 800 break; 801 } 802 803 data.shrink(outc); 804 805 return adopt(data); 806} 807 808PassRef<StringImpl> StringImpl::removeCharacters(CharacterMatchFunctionPtr findMatch) 809{ 810 if (is8Bit()) 811 return removeCharacters(characters8(), findMatch); 812 return removeCharacters(characters16(), findMatch); 813} 814 815template <typename CharType, class UCharPredicate> 816inline PassRef<StringImpl> StringImpl::simplifyMatchedCharactersToSpace(UCharPredicate predicate) 817{ 818 StringBuffer<CharType> data(m_length); 819 820 const CharType* from = characters<CharType>(); 821 const CharType* fromend = from + m_length; 822 int outc = 0; 823 bool changedToSpace = false; 824 825 CharType* to = data.characters(); 826 827 while (true) { 828 while (from != fromend && predicate(*from)) { 829 if (*from != ' ') 830 changedToSpace = true; 831 ++from; 832 } 833 while (from != fromend && !predicate(*from)) 834 to[outc++] = *from++; 835 if (from != fromend) 836 to[outc++] = ' '; 837 else 838 break; 839 } 840 841 if (outc > 0 && to[outc - 1] == ' ') 842 --outc; 843 844 if (static_cast<unsigned>(outc) == m_length && !changedToSpace) 845 return *this; 846 847 data.shrink(outc); 848 849 return adopt(data); 850} 851 852PassRef<StringImpl> StringImpl::simplifyWhiteSpace() 853{ 854 if (is8Bit()) 855 return StringImpl::simplifyMatchedCharactersToSpace<LChar>(SpaceOrNewlinePredicate()); 856 return StringImpl::simplifyMatchedCharactersToSpace<UChar>(SpaceOrNewlinePredicate()); 857} 858 859PassRef<StringImpl> StringImpl::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) 860{ 861 if (is8Bit()) 862 return StringImpl::simplifyMatchedCharactersToSpace<LChar>(UCharPredicate(isWhiteSpace)); 863 return StringImpl::simplifyMatchedCharactersToSpace<UChar>(UCharPredicate(isWhiteSpace)); 864} 865 866int StringImpl::toIntStrict(bool* ok, int base) 867{ 868 if (is8Bit()) 869 return charactersToIntStrict(characters8(), m_length, ok, base); 870 return charactersToIntStrict(characters16(), m_length, ok, base); 871} 872 873unsigned StringImpl::toUIntStrict(bool* ok, int base) 874{ 875 if (is8Bit()) 876 return charactersToUIntStrict(characters8(), m_length, ok, base); 877 return charactersToUIntStrict(characters16(), m_length, ok, base); 878} 879 880int64_t StringImpl::toInt64Strict(bool* ok, int base) 881{ 882 if (is8Bit()) 883 return charactersToInt64Strict(characters8(), m_length, ok, base); 884 return charactersToInt64Strict(characters16(), m_length, ok, base); 885} 886 887uint64_t StringImpl::toUInt64Strict(bool* ok, int base) 888{ 889 if (is8Bit()) 890 return charactersToUInt64Strict(characters8(), m_length, ok, base); 891 return charactersToUInt64Strict(characters16(), m_length, ok, base); 892} 893 894intptr_t StringImpl::toIntPtrStrict(bool* ok, int base) 895{ 896 if (is8Bit()) 897 return charactersToIntPtrStrict(characters8(), m_length, ok, base); 898 return charactersToIntPtrStrict(characters16(), m_length, ok, base); 899} 900 901int StringImpl::toInt(bool* ok) 902{ 903 if (is8Bit()) 904 return charactersToInt(characters8(), m_length, ok); 905 return charactersToInt(characters16(), m_length, ok); 906} 907 908unsigned StringImpl::toUInt(bool* ok) 909{ 910 if (is8Bit()) 911 return charactersToUInt(characters8(), m_length, ok); 912 return charactersToUInt(characters16(), m_length, ok); 913} 914 915int64_t StringImpl::toInt64(bool* ok) 916{ 917 if (is8Bit()) 918 return charactersToInt64(characters8(), m_length, ok); 919 return charactersToInt64(characters16(), m_length, ok); 920} 921 922uint64_t StringImpl::toUInt64(bool* ok) 923{ 924 if (is8Bit()) 925 return charactersToUInt64(characters8(), m_length, ok); 926 return charactersToUInt64(characters16(), m_length, ok); 927} 928 929intptr_t StringImpl::toIntPtr(bool* ok) 930{ 931 if (is8Bit()) 932 return charactersToIntPtr(characters8(), m_length, ok); 933 return charactersToIntPtr(characters16(), m_length, ok); 934} 935 936double StringImpl::toDouble(bool* ok) 937{ 938 if (is8Bit()) 939 return charactersToDouble(characters8(), m_length, ok); 940 return charactersToDouble(characters16(), m_length, ok); 941} 942 943float StringImpl::toFloat(bool* ok) 944{ 945 if (is8Bit()) 946 return charactersToFloat(characters8(), m_length, ok); 947 return charactersToFloat(characters16(), m_length, ok); 948} 949 950bool equalIgnoringCase(const LChar* a, const LChar* b, unsigned length) 951{ 952 while (length--) { 953 if (StringImpl::latin1CaseFoldTable[*a++] != StringImpl::latin1CaseFoldTable[*b++]) 954 return false; 955 } 956 return true; 957} 958 959bool equalIgnoringCase(const UChar* a, const LChar* b, unsigned length) 960{ 961 while (length--) { 962 if (u_foldCase(*a++, U_FOLD_CASE_DEFAULT) != StringImpl::latin1CaseFoldTable[*b++]) 963 return false; 964 } 965 return true; 966} 967 968size_t StringImpl::find(CharacterMatchFunctionPtr matchFunction, unsigned start) 969{ 970 if (is8Bit()) 971 return WTF::find(characters8(), m_length, matchFunction, start); 972 return WTF::find(characters16(), m_length, matchFunction, start); 973} 974 975size_t StringImpl::find(const LChar* matchString, unsigned index) 976{ 977 // Check for null or empty string to match against 978 if (!matchString) 979 return notFound; 980 size_t matchStringLength = strlen(reinterpret_cast<const char*>(matchString)); 981 if (matchStringLength > std::numeric_limits<unsigned>::max()) 982 CRASH(); 983 unsigned matchLength = matchStringLength; 984 if (!matchLength) 985 return std::min(index, length()); 986 987 // Optimization 1: fast case for strings of length 1. 988 if (matchLength == 1) { 989 if (is8Bit()) 990 return WTF::find(characters8(), length(), matchString[0], index); 991 return WTF::find(characters16(), length(), *matchString, index); 992 } 993 994 // Check index & matchLength are in range. 995 if (index > length()) 996 return notFound; 997 unsigned searchLength = length() - index; 998 if (matchLength > searchLength) 999 return notFound; 1000 // delta is the number of additional times to test; delta == 0 means test only once. 1001 unsigned delta = searchLength - matchLength; 1002 1003 // Optimization 2: keep a running hash of the strings, 1004 // only call equal if the hashes match. 1005 1006 if (is8Bit()) { 1007 const LChar* searchCharacters = characters8() + index; 1008 1009 unsigned searchHash = 0; 1010 unsigned matchHash = 0; 1011 for (unsigned i = 0; i < matchLength; ++i) { 1012 searchHash += searchCharacters[i]; 1013 matchHash += matchString[i]; 1014 } 1015 1016 unsigned i = 0; 1017 while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) { 1018 if (i == delta) 1019 return notFound; 1020 searchHash += searchCharacters[i + matchLength]; 1021 searchHash -= searchCharacters[i]; 1022 ++i; 1023 } 1024 return index + i; 1025 } 1026 1027 const UChar* searchCharacters = characters16() + index; 1028 1029 unsigned searchHash = 0; 1030 unsigned matchHash = 0; 1031 for (unsigned i = 0; i < matchLength; ++i) { 1032 searchHash += searchCharacters[i]; 1033 matchHash += matchString[i]; 1034 } 1035 1036 unsigned i = 0; 1037 while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) { 1038 if (i == delta) 1039 return notFound; 1040 searchHash += searchCharacters[i + matchLength]; 1041 searchHash -= searchCharacters[i]; 1042 ++i; 1043 } 1044 return index + i; 1045} 1046 1047size_t StringImpl::findIgnoringCase(const LChar* matchString, unsigned index) 1048{ 1049 // Check for null or empty string to match against 1050 if (!matchString) 1051 return notFound; 1052 size_t matchStringLength = strlen(reinterpret_cast<const char*>(matchString)); 1053 if (matchStringLength > std::numeric_limits<unsigned>::max()) 1054 CRASH(); 1055 unsigned matchLength = matchStringLength; 1056 if (!matchLength) 1057 return std::min(index, length()); 1058 1059 // Check index & matchLength are in range. 1060 if (index > length()) 1061 return notFound; 1062 unsigned searchLength = length() - index; 1063 if (matchLength > searchLength) 1064 return notFound; 1065 // delta is the number of additional times to test; delta == 0 means test only once. 1066 unsigned delta = searchLength - matchLength; 1067 1068 if (is8Bit()) { 1069 const LChar* searchCharacters = characters8() + index; 1070 1071 unsigned i = 0; 1072 while (!equalIgnoringCase(searchCharacters + i, matchString, matchLength)) { 1073 if (i == delta) 1074 return notFound; 1075 ++i; 1076 } 1077 return index + i; 1078 } 1079 1080 const UChar* searchCharacters = characters16() + index; 1081 1082 unsigned i = 0; 1083 while (!equalIgnoringCase(searchCharacters + i, matchString, matchLength)) { 1084 if (i == delta) 1085 return notFound; 1086 ++i; 1087 } 1088 return index + i; 1089} 1090 1091template <typename SearchCharacterType, typename MatchCharacterType> 1092ALWAYS_INLINE static size_t findInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned searchLength, unsigned matchLength) 1093{ 1094 // Optimization: keep a running hash of the strings, 1095 // only call equal() if the hashes match. 1096 1097 // delta is the number of additional times to test; delta == 0 means test only once. 1098 unsigned delta = searchLength - matchLength; 1099 1100 unsigned searchHash = 0; 1101 unsigned matchHash = 0; 1102 1103 for (unsigned i = 0; i < matchLength; ++i) { 1104 searchHash += searchCharacters[i]; 1105 matchHash += matchCharacters[i]; 1106 } 1107 1108 unsigned i = 0; 1109 // keep looping until we match 1110 while (searchHash != matchHash || !equal(searchCharacters + i, matchCharacters, matchLength)) { 1111 if (i == delta) 1112 return notFound; 1113 searchHash += searchCharacters[i + matchLength]; 1114 searchHash -= searchCharacters[i]; 1115 ++i; 1116 } 1117 return index + i; 1118} 1119 1120size_t StringImpl::find(StringImpl* matchString) 1121{ 1122 // Check for null string to match against 1123 if (UNLIKELY(!matchString)) 1124 return notFound; 1125 unsigned matchLength = matchString->length(); 1126 1127 // Optimization 1: fast case for strings of length 1. 1128 if (matchLength == 1) { 1129 if (is8Bit()) { 1130 if (matchString->is8Bit()) 1131 return WTF::find(characters8(), length(), matchString->characters8()[0]); 1132 return WTF::find(characters8(), length(), matchString->characters16()[0]); 1133 } 1134 if (matchString->is8Bit()) 1135 return WTF::find(characters16(), length(), matchString->characters8()[0]); 1136 return WTF::find(characters16(), length(), matchString->characters16()[0]); 1137 } 1138 1139 // Check matchLength is in range. 1140 if (matchLength > length()) 1141 return notFound; 1142 1143 // Check for empty string to match against 1144 if (UNLIKELY(!matchLength)) 1145 return 0; 1146 1147 if (is8Bit()) { 1148 if (matchString->is8Bit()) 1149 return findInner(characters8(), matchString->characters8(), 0, length(), matchLength); 1150 return findInner(characters8(), matchString->characters16(), 0, length(), matchLength); 1151 } 1152 1153 if (matchString->is8Bit()) 1154 return findInner(characters16(), matchString->characters8(), 0, length(), matchLength); 1155 1156 return findInner(characters16(), matchString->characters16(), 0, length(), matchLength); 1157} 1158 1159size_t StringImpl::find(StringImpl* matchString, unsigned index) 1160{ 1161 // Check for null or empty string to match against 1162 if (UNLIKELY(!matchString)) 1163 return notFound; 1164 1165 unsigned matchLength = matchString->length(); 1166 1167 // Optimization 1: fast case for strings of length 1. 1168 if (matchLength == 1) { 1169 if (is8Bit()) 1170 return WTF::find(characters8(), length(), (*matchString)[0], index); 1171 return WTF::find(characters16(), length(), (*matchString)[0], index); 1172 } 1173 1174 if (UNLIKELY(!matchLength)) 1175 return std::min(index, length()); 1176 1177 // Check index & matchLength are in range. 1178 if (index > length()) 1179 return notFound; 1180 unsigned searchLength = length() - index; 1181 if (matchLength > searchLength) 1182 return notFound; 1183 1184 if (is8Bit()) { 1185 if (matchString->is8Bit()) 1186 return findInner(characters8() + index, matchString->characters8(), index, searchLength, matchLength); 1187 return findInner(characters8() + index, matchString->characters16(), index, searchLength, matchLength); 1188 } 1189 1190 if (matchString->is8Bit()) 1191 return findInner(characters16() + index, matchString->characters8(), index, searchLength, matchLength); 1192 1193 return findInner(characters16() + index, matchString->characters16(), index, searchLength, matchLength); 1194} 1195 1196template <typename SearchCharacterType, typename MatchCharacterType> 1197ALWAYS_INLINE static size_t findIgnoringCaseInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned searchLength, unsigned matchLength) 1198{ 1199 // delta is the number of additional times to test; delta == 0 means test only once. 1200 unsigned delta = searchLength - matchLength; 1201 1202 unsigned i = 0; 1203 // keep looping until we match 1204 while (!equalIgnoringCase(searchCharacters + i, matchCharacters, matchLength)) { 1205 if (i == delta) 1206 return notFound; 1207 ++i; 1208 } 1209 return index + i; 1210} 1211 1212size_t StringImpl::findIgnoringCase(StringImpl* matchString, unsigned index) 1213{ 1214 // Check for null or empty string to match against 1215 if (!matchString) 1216 return notFound; 1217 unsigned matchLength = matchString->length(); 1218 if (!matchLength) 1219 return std::min(index, length()); 1220 1221 // Check index & matchLength are in range. 1222 if (index > length()) 1223 return notFound; 1224 unsigned searchLength = length() - index; 1225 if (matchLength > searchLength) 1226 return notFound; 1227 1228 if (is8Bit()) { 1229 if (matchString->is8Bit()) 1230 return findIgnoringCaseInner(characters8() + index, matchString->characters8(), index, searchLength, matchLength); 1231 return findIgnoringCaseInner(characters8() + index, matchString->characters16(), index, searchLength, matchLength); 1232 } 1233 1234 if (matchString->is8Bit()) 1235 return findIgnoringCaseInner(characters16() + index, matchString->characters8(), index, searchLength, matchLength); 1236 1237 return findIgnoringCaseInner(characters16() + index, matchString->characters16(), index, searchLength, matchLength); 1238} 1239 1240size_t StringImpl::findNextLineStart(unsigned index) 1241{ 1242 if (is8Bit()) 1243 return WTF::findNextLineStart(characters8(), m_length, index); 1244 return WTF::findNextLineStart(characters16(), m_length, index); 1245} 1246 1247size_t StringImpl::reverseFind(UChar c, unsigned index) 1248{ 1249 if (is8Bit()) 1250 return WTF::reverseFind(characters8(), m_length, c, index); 1251 return WTF::reverseFind(characters16(), m_length, c, index); 1252} 1253 1254template <typename SearchCharacterType, typename MatchCharacterType> 1255ALWAYS_INLINE static size_t reverseFindInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned length, unsigned matchLength) 1256{ 1257 // Optimization: keep a running hash of the strings, 1258 // only call equal if the hashes match. 1259 1260 // delta is the number of additional times to test; delta == 0 means test only once. 1261 unsigned delta = std::min(index, length - matchLength); 1262 1263 unsigned searchHash = 0; 1264 unsigned matchHash = 0; 1265 for (unsigned i = 0; i < matchLength; ++i) { 1266 searchHash += searchCharacters[delta + i]; 1267 matchHash += matchCharacters[i]; 1268 } 1269 1270 // keep looping until we match 1271 while (searchHash != matchHash || !equal(searchCharacters + delta, matchCharacters, matchLength)) { 1272 if (!delta) 1273 return notFound; 1274 --delta; 1275 searchHash -= searchCharacters[delta + matchLength]; 1276 searchHash += searchCharacters[delta]; 1277 } 1278 return delta; 1279} 1280 1281size_t StringImpl::reverseFind(StringImpl* matchString, unsigned index) 1282{ 1283 // Check for null or empty string to match against 1284 if (!matchString) 1285 return notFound; 1286 unsigned matchLength = matchString->length(); 1287 unsigned ourLength = length(); 1288 if (!matchLength) 1289 return std::min(index, ourLength); 1290 1291 // Optimization 1: fast case for strings of length 1. 1292 if (matchLength == 1) { 1293 if (is8Bit()) 1294 return WTF::reverseFind(characters8(), ourLength, (*matchString)[0], index); 1295 return WTF::reverseFind(characters16(), ourLength, (*matchString)[0], index); 1296 } 1297 1298 // Check index & matchLength are in range. 1299 if (matchLength > ourLength) 1300 return notFound; 1301 1302 if (is8Bit()) { 1303 if (matchString->is8Bit()) 1304 return reverseFindInner(characters8(), matchString->characters8(), index, ourLength, matchLength); 1305 return reverseFindInner(characters8(), matchString->characters16(), index, ourLength, matchLength); 1306 } 1307 1308 if (matchString->is8Bit()) 1309 return reverseFindInner(characters16(), matchString->characters8(), index, ourLength, matchLength); 1310 1311 return reverseFindInner(characters16(), matchString->characters16(), index, ourLength, matchLength); 1312} 1313 1314template <typename SearchCharacterType, typename MatchCharacterType> 1315ALWAYS_INLINE static size_t reverseFindIgnoringCaseInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned length, unsigned matchLength) 1316{ 1317 // delta is the number of additional times to test; delta == 0 means test only once. 1318 unsigned delta = std::min(index, length - matchLength); 1319 1320 // keep looping until we match 1321 while (!equalIgnoringCase(searchCharacters + delta, matchCharacters, matchLength)) { 1322 if (!delta) 1323 return notFound; 1324 --delta; 1325 } 1326 return delta; 1327} 1328 1329size_t StringImpl::reverseFindIgnoringCase(StringImpl* matchString, unsigned index) 1330{ 1331 // Check for null or empty string to match against 1332 if (!matchString) 1333 return notFound; 1334 unsigned matchLength = matchString->length(); 1335 unsigned ourLength = length(); 1336 if (!matchLength) 1337 return std::min(index, ourLength); 1338 1339 // Check index & matchLength are in range. 1340 if (matchLength > ourLength) 1341 return notFound; 1342 1343 if (is8Bit()) { 1344 if (matchString->is8Bit()) 1345 return reverseFindIgnoringCaseInner(characters8(), matchString->characters8(), index, ourLength, matchLength); 1346 return reverseFindIgnoringCaseInner(characters8(), matchString->characters16(), index, ourLength, matchLength); 1347 } 1348 1349 if (matchString->is8Bit()) 1350 return reverseFindIgnoringCaseInner(characters16(), matchString->characters8(), index, ourLength, matchLength); 1351 1352 return reverseFindIgnoringCaseInner(characters16(), matchString->characters16(), index, ourLength, matchLength); 1353} 1354 1355ALWAYS_INLINE static bool equalInner(const StringImpl* stringImpl, unsigned startOffset, const char* matchString, unsigned matchLength, bool caseSensitive) 1356{ 1357 ASSERT(stringImpl); 1358 ASSERT(matchLength <= stringImpl->length()); 1359 ASSERT(startOffset + matchLength <= stringImpl->length()); 1360 1361 if (caseSensitive) { 1362 if (stringImpl->is8Bit()) 1363 return equal(stringImpl->characters8() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength); 1364 return equal(stringImpl->characters16() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength); 1365 } 1366 if (stringImpl->is8Bit()) 1367 return equalIgnoringCase(stringImpl->characters8() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength); 1368 return equalIgnoringCase(stringImpl->characters16() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength); 1369} 1370 1371bool StringImpl::startsWith(const StringImpl* str) const 1372{ 1373 if (!str) 1374 return false; 1375 1376 if (str->length() > length()) 1377 return false; 1378 1379 if (is8Bit()) { 1380 if (str->is8Bit()) 1381 return equal(characters8(), str->characters8(), str->length()); 1382 return equal(characters8(), str->characters16(), str->length()); 1383 } 1384 if (str->is8Bit()) 1385 return equal(characters16(), str->characters8(), str->length()); 1386 return equal(characters16(), str->characters16(), str->length()); 1387} 1388 1389bool StringImpl::startsWith(UChar character) const 1390{ 1391 return m_length && (*this)[0] == character; 1392} 1393 1394bool StringImpl::startsWith(const char* matchString, unsigned matchLength, bool caseSensitive) const 1395{ 1396 ASSERT(matchLength); 1397 if (matchLength > length()) 1398 return false; 1399 return equalInner(this, 0, matchString, matchLength, caseSensitive); 1400} 1401 1402bool StringImpl::endsWith(StringImpl* matchString, bool caseSensitive) 1403{ 1404 ASSERT(matchString); 1405 if (m_length >= matchString->m_length) { 1406 unsigned start = m_length - matchString->m_length; 1407 return (caseSensitive ? find(matchString, start) : findIgnoringCase(matchString, start)) == start; 1408 } 1409 return false; 1410} 1411 1412bool StringImpl::endsWith(UChar character) const 1413{ 1414 return m_length && (*this)[m_length - 1] == character; 1415} 1416 1417bool StringImpl::endsWith(const char* matchString, unsigned matchLength, bool caseSensitive) const 1418{ 1419 ASSERT(matchLength); 1420 if (matchLength > length()) 1421 return false; 1422 unsigned startOffset = length() - matchLength; 1423 return equalInner(this, startOffset, matchString, matchLength, caseSensitive); 1424} 1425 1426PassRef<StringImpl> StringImpl::replace(UChar oldC, UChar newC) 1427{ 1428 if (oldC == newC) 1429 return *this; 1430 unsigned i; 1431 for (i = 0; i != m_length; ++i) { 1432 UChar c = is8Bit() ? m_data8[i] : m_data16[i]; 1433 if (c == oldC) 1434 break; 1435 } 1436 if (i == m_length) 1437 return *this; 1438 1439 if (is8Bit()) { 1440 if (oldC > 0xff) 1441 // Looking for a 16 bit char in an 8 bit string, we're done. 1442 return *this; 1443 1444 if (newC <= 0xff) { 1445 LChar* data; 1446 LChar oldChar = static_cast<LChar>(oldC); 1447 LChar newChar = static_cast<LChar>(newC); 1448 1449 auto newImpl = createUninitializedInternalNonEmpty(m_length, data); 1450 1451 for (i = 0; i != m_length; ++i) { 1452 LChar ch = m_data8[i]; 1453 if (ch == oldChar) 1454 ch = newChar; 1455 data[i] = ch; 1456 } 1457 return newImpl; 1458 } 1459 1460 // There is the possibility we need to up convert from 8 to 16 bit, 1461 // create a 16 bit string for the result. 1462 UChar* data; 1463 auto newImpl = createUninitializedInternalNonEmpty(m_length, data); 1464 1465 for (i = 0; i != m_length; ++i) { 1466 UChar ch = m_data8[i]; 1467 if (ch == oldC) 1468 ch = newC; 1469 data[i] = ch; 1470 } 1471 1472 return newImpl; 1473 } 1474 1475 UChar* data; 1476 auto newImpl = createUninitializedInternalNonEmpty(m_length, data); 1477 1478 for (i = 0; i != m_length; ++i) { 1479 UChar ch = m_data16[i]; 1480 if (ch == oldC) 1481 ch = newC; 1482 data[i] = ch; 1483 } 1484 return newImpl; 1485} 1486 1487PassRef<StringImpl> StringImpl::replace(unsigned position, unsigned lengthToReplace, StringImpl* str) 1488{ 1489 position = std::min(position, length()); 1490 lengthToReplace = std::min(lengthToReplace, length() - position); 1491 unsigned lengthToInsert = str ? str->length() : 0; 1492 if (!lengthToReplace && !lengthToInsert) 1493 return *this; 1494 1495 if ((length() - lengthToReplace) >= (std::numeric_limits<unsigned>::max() - lengthToInsert)) 1496 CRASH(); 1497 1498 if (is8Bit() && (!str || str->is8Bit())) { 1499 LChar* data; 1500 auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data); 1501 memcpy(data, m_data8, position * sizeof(LChar)); 1502 if (str) 1503 memcpy(data + position, str->m_data8, lengthToInsert * sizeof(LChar)); 1504 memcpy(data + position + lengthToInsert, m_data8 + position + lengthToReplace, 1505 (length() - position - lengthToReplace) * sizeof(LChar)); 1506 return newImpl; 1507 } 1508 UChar* data; 1509 auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data); 1510 if (is8Bit()) 1511 for (unsigned i = 0; i < position; ++i) 1512 data[i] = m_data8[i]; 1513 else 1514 memcpy(data, m_data16, position * sizeof(UChar)); 1515 if (str) { 1516 if (str->is8Bit()) 1517 for (unsigned i = 0; i < lengthToInsert; ++i) 1518 data[i + position] = str->m_data8[i]; 1519 else 1520 memcpy(data + position, str->m_data16, lengthToInsert * sizeof(UChar)); 1521 } 1522 if (is8Bit()) { 1523 for (unsigned i = 0; i < length() - position - lengthToReplace; ++i) 1524 data[i + position + lengthToInsert] = m_data8[i + position + lengthToReplace]; 1525 } else { 1526 memcpy(data + position + lengthToInsert, characters16() + position + lengthToReplace, 1527 (length() - position - lengthToReplace) * sizeof(UChar)); 1528 } 1529 return newImpl; 1530} 1531 1532PassRef<StringImpl> StringImpl::replace(UChar pattern, StringImpl* replacement) 1533{ 1534 if (!replacement) 1535 return *this; 1536 1537 if (replacement->is8Bit()) 1538 return replace(pattern, replacement->m_data8, replacement->length()); 1539 1540 return replace(pattern, replacement->m_data16, replacement->length()); 1541} 1542 1543PassRef<StringImpl> StringImpl::replace(UChar pattern, const LChar* replacement, unsigned repStrLength) 1544{ 1545 ASSERT(replacement); 1546 1547 size_t srcSegmentStart = 0; 1548 unsigned matchCount = 0; 1549 1550 // Count the matches. 1551 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) { 1552 ++matchCount; 1553 ++srcSegmentStart; 1554 } 1555 1556 // If we have 0 matches then we don't have to do any more work. 1557 if (!matchCount) 1558 return *this; 1559 1560 if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength) 1561 CRASH(); 1562 1563 unsigned replaceSize = matchCount * repStrLength; 1564 unsigned newSize = m_length - matchCount; 1565 if (newSize >= (std::numeric_limits<unsigned>::max() - replaceSize)) 1566 CRASH(); 1567 1568 newSize += replaceSize; 1569 1570 // Construct the new data. 1571 size_t srcSegmentEnd; 1572 unsigned srcSegmentLength; 1573 srcSegmentStart = 0; 1574 unsigned dstOffset = 0; 1575 1576 if (is8Bit()) { 1577 LChar* data; 1578 auto newImpl = createUninitialized(newSize, data); 1579 1580 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1581 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1582 memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar)); 1583 dstOffset += srcSegmentLength; 1584 memcpy(data + dstOffset, replacement, repStrLength * sizeof(LChar)); 1585 dstOffset += repStrLength; 1586 srcSegmentStart = srcSegmentEnd + 1; 1587 } 1588 1589 srcSegmentLength = m_length - srcSegmentStart; 1590 memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar)); 1591 1592 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1593 1594 return newImpl; 1595 } 1596 1597 UChar* data; 1598 auto newImpl = createUninitialized(newSize, data); 1599 1600 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1601 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1602 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1603 1604 dstOffset += srcSegmentLength; 1605 for (unsigned i = 0; i < repStrLength; ++i) 1606 data[i + dstOffset] = replacement[i]; 1607 1608 dstOffset += repStrLength; 1609 srcSegmentStart = srcSegmentEnd + 1; 1610 } 1611 1612 srcSegmentLength = m_length - srcSegmentStart; 1613 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1614 1615 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1616 1617 return newImpl; 1618} 1619 1620PassRef<StringImpl> StringImpl::replace(UChar pattern, const UChar* replacement, unsigned repStrLength) 1621{ 1622 ASSERT(replacement); 1623 1624 size_t srcSegmentStart = 0; 1625 unsigned matchCount = 0; 1626 1627 // Count the matches. 1628 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) { 1629 ++matchCount; 1630 ++srcSegmentStart; 1631 } 1632 1633 // If we have 0 matches then we don't have to do any more work. 1634 if (!matchCount) 1635 return *this; 1636 1637 if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength) 1638 CRASH(); 1639 1640 unsigned replaceSize = matchCount * repStrLength; 1641 unsigned newSize = m_length - matchCount; 1642 if (newSize >= (std::numeric_limits<unsigned>::max() - replaceSize)) 1643 CRASH(); 1644 1645 newSize += replaceSize; 1646 1647 // Construct the new data. 1648 size_t srcSegmentEnd; 1649 unsigned srcSegmentLength; 1650 srcSegmentStart = 0; 1651 unsigned dstOffset = 0; 1652 1653 if (is8Bit()) { 1654 UChar* data; 1655 auto newImpl = createUninitialized(newSize, data); 1656 1657 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1658 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1659 for (unsigned i = 0; i < srcSegmentLength; ++i) 1660 data[i + dstOffset] = m_data8[i + srcSegmentStart]; 1661 1662 dstOffset += srcSegmentLength; 1663 memcpy(data + dstOffset, replacement, repStrLength * sizeof(UChar)); 1664 1665 dstOffset += repStrLength; 1666 srcSegmentStart = srcSegmentEnd + 1; 1667 } 1668 1669 srcSegmentLength = m_length - srcSegmentStart; 1670 for (unsigned i = 0; i < srcSegmentLength; ++i) 1671 data[i + dstOffset] = m_data8[i + srcSegmentStart]; 1672 1673 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1674 1675 return newImpl; 1676 } 1677 1678 UChar* data; 1679 auto newImpl = createUninitialized(newSize, data); 1680 1681 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1682 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1683 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1684 1685 dstOffset += srcSegmentLength; 1686 memcpy(data + dstOffset, replacement, repStrLength * sizeof(UChar)); 1687 1688 dstOffset += repStrLength; 1689 srcSegmentStart = srcSegmentEnd + 1; 1690 } 1691 1692 srcSegmentLength = m_length - srcSegmentStart; 1693 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1694 1695 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1696 1697 return newImpl; 1698} 1699 1700PassRef<StringImpl> StringImpl::replace(StringImpl* pattern, StringImpl* replacement) 1701{ 1702 if (!pattern || !replacement) 1703 return *this; 1704 1705 unsigned patternLength = pattern->length(); 1706 if (!patternLength) 1707 return *this; 1708 1709 unsigned repStrLength = replacement->length(); 1710 size_t srcSegmentStart = 0; 1711 unsigned matchCount = 0; 1712 1713 // Count the matches. 1714 while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) { 1715 ++matchCount; 1716 srcSegmentStart += patternLength; 1717 } 1718 1719 // If we have 0 matches, we don't have to do any more work 1720 if (!matchCount) 1721 return *this; 1722 1723 unsigned newSize = m_length - matchCount * patternLength; 1724 if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength) 1725 CRASH(); 1726 1727 if (newSize > (std::numeric_limits<unsigned>::max() - matchCount * repStrLength)) 1728 CRASH(); 1729 1730 newSize += matchCount * repStrLength; 1731 1732 1733 // Construct the new data 1734 size_t srcSegmentEnd; 1735 unsigned srcSegmentLength; 1736 srcSegmentStart = 0; 1737 unsigned dstOffset = 0; 1738 bool srcIs8Bit = is8Bit(); 1739 bool replacementIs8Bit = replacement->is8Bit(); 1740 1741 // There are 4 cases: 1742 // 1. This and replacement are both 8 bit. 1743 // 2. This and replacement are both 16 bit. 1744 // 3. This is 8 bit and replacement is 16 bit. 1745 // 4. This is 16 bit and replacement is 8 bit. 1746 if (srcIs8Bit && replacementIs8Bit) { 1747 // Case 1 1748 LChar* data; 1749 auto newImpl = createUninitialized(newSize, data); 1750 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1751 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1752 memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar)); 1753 dstOffset += srcSegmentLength; 1754 memcpy(data + dstOffset, replacement->m_data8, repStrLength * sizeof(LChar)); 1755 dstOffset += repStrLength; 1756 srcSegmentStart = srcSegmentEnd + patternLength; 1757 } 1758 1759 srcSegmentLength = m_length - srcSegmentStart; 1760 memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar)); 1761 1762 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1763 1764 return newImpl; 1765 } 1766 1767 UChar* data; 1768 auto newImpl = createUninitialized(newSize, data); 1769 while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) { 1770 srcSegmentLength = srcSegmentEnd - srcSegmentStart; 1771 if (srcIs8Bit) { 1772 // Case 3. 1773 for (unsigned i = 0; i < srcSegmentLength; ++i) 1774 data[i + dstOffset] = m_data8[i + srcSegmentStart]; 1775 } else { 1776 // Case 2 & 4. 1777 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1778 } 1779 dstOffset += srcSegmentLength; 1780 if (replacementIs8Bit) { 1781 // Cases 2 & 3. 1782 for (unsigned i = 0; i < repStrLength; ++i) 1783 data[i + dstOffset] = replacement->m_data8[i]; 1784 } else { 1785 // Case 4 1786 memcpy(data + dstOffset, replacement->m_data16, repStrLength * sizeof(UChar)); 1787 } 1788 dstOffset += repStrLength; 1789 srcSegmentStart = srcSegmentEnd + patternLength; 1790 } 1791 1792 srcSegmentLength = m_length - srcSegmentStart; 1793 if (srcIs8Bit) { 1794 // Case 3. 1795 for (unsigned i = 0; i < srcSegmentLength; ++i) 1796 data[i + dstOffset] = m_data8[i + srcSegmentStart]; 1797 } else { 1798 // Cases 2 & 4. 1799 memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar)); 1800 } 1801 1802 ASSERT(dstOffset + srcSegmentLength == newImpl.get().length()); 1803 1804 return newImpl; 1805} 1806 1807static ALWAYS_INLINE bool stringImplContentEqual(const StringImpl& a, const StringImpl& b) 1808{ 1809 unsigned aLength = a.length(); 1810 unsigned bLength = b.length(); 1811 if (aLength != bLength) 1812 return false; 1813 1814 if (a.is8Bit()) { 1815 if (b.is8Bit()) 1816 return equal(a.characters8(), b.characters8(), aLength); 1817 1818 return equal(a.characters8(), b.characters16(), aLength); 1819 } 1820 1821 if (b.is8Bit()) 1822 return equal(a.characters16(), b.characters8(), aLength); 1823 1824 return equal(a.characters16(), b.characters16(), aLength); 1825} 1826 1827bool equal(const StringImpl* a, const StringImpl* b) 1828{ 1829 if (a == b) 1830 return true; 1831 if (!a || !b) 1832 return false; 1833 1834 return stringImplContentEqual(*a, *b); 1835} 1836 1837template <typename CharType> 1838inline bool equalInternal(const StringImpl* a, const CharType* b, unsigned length) 1839{ 1840 if (!a) 1841 return !b; 1842 if (!b) 1843 return false; 1844 1845 if (a->length() != length) 1846 return false; 1847 if (a->is8Bit()) 1848 return equal(a->characters8(), b, length); 1849 return equal(a->characters16(), b, length); 1850} 1851 1852bool equal(const StringImpl* a, const LChar* b, unsigned length) 1853{ 1854 return equalInternal(a, b, length); 1855} 1856 1857bool equal(const StringImpl* a, const UChar* b, unsigned length) 1858{ 1859 return equalInternal(a, b, length); 1860} 1861 1862bool equal(const StringImpl* a, const LChar* b) 1863{ 1864 if (!a) 1865 return !b; 1866 if (!b) 1867 return !a; 1868 1869 unsigned length = a->length(); 1870 1871 if (a->is8Bit()) { 1872 const LChar* aPtr = a->characters8(); 1873 for (unsigned i = 0; i != length; ++i) { 1874 LChar bc = b[i]; 1875 LChar ac = aPtr[i]; 1876 if (!bc) 1877 return false; 1878 if (ac != bc) 1879 return false; 1880 } 1881 1882 return !b[length]; 1883 } 1884 1885 const UChar* aPtr = a->characters16(); 1886 for (unsigned i = 0; i != length; ++i) { 1887 LChar bc = b[i]; 1888 if (!bc) 1889 return false; 1890 if (aPtr[i] != bc) 1891 return false; 1892 } 1893 1894 return !b[length]; 1895} 1896 1897bool equal(const StringImpl& a, const StringImpl& b) 1898{ 1899 if (&a == &b) 1900 return true; 1901 1902 return stringImplContentEqual(a, b); 1903} 1904 1905bool equalIgnoringCase(const StringImpl* a, const StringImpl* b) 1906{ 1907 if (a == b) 1908 return true; 1909 if (!a || !b) 1910 return false; 1911 1912 return CaseFoldingHash::equal(a, b); 1913} 1914 1915bool equalIgnoringCase(const StringImpl* a, const LChar* b) 1916{ 1917 if (!a) 1918 return !b; 1919 if (!b) 1920 return !a; 1921 1922 unsigned length = a->length(); 1923 1924 // Do a faster loop for the case where all the characters are ASCII. 1925 UChar ored = 0; 1926 bool equal = true; 1927 if (a->is8Bit()) { 1928 const LChar* as = a->characters8(); 1929 for (unsigned i = 0; i != length; ++i) { 1930 LChar bc = b[i]; 1931 if (!bc) 1932 return false; 1933 UChar ac = as[i]; 1934 ored |= ac; 1935 equal = equal && (toASCIILower(ac) == toASCIILower(bc)); 1936 } 1937 1938 // Do a slower implementation for cases that include non-ASCII characters. 1939 if (ored & ~0x7F) { 1940 equal = true; 1941 for (unsigned i = 0; i != length; ++i) 1942 equal = equal && u_foldCase(as[i], U_FOLD_CASE_DEFAULT) == u_foldCase(b[i], U_FOLD_CASE_DEFAULT); 1943 } 1944 1945 return equal && !b[length]; 1946 } 1947 1948 const UChar* as = a->characters16(); 1949 for (unsigned i = 0; i != length; ++i) { 1950 LChar bc = b[i]; 1951 if (!bc) 1952 return false; 1953 UChar ac = as[i]; 1954 ored |= ac; 1955 equal = equal && (toASCIILower(ac) == toASCIILower(bc)); 1956 } 1957 1958 // Do a slower implementation for cases that include non-ASCII characters. 1959 if (ored & ~0x7F) { 1960 equal = true; 1961 for (unsigned i = 0; i != length; ++i) { 1962 equal = equal && u_foldCase(as[i], U_FOLD_CASE_DEFAULT) == u_foldCase(b[i], U_FOLD_CASE_DEFAULT); 1963 } 1964 } 1965 1966 return equal && !b[length]; 1967} 1968 1969bool equalIgnoringCaseNonNull(const StringImpl* a, const StringImpl* b) 1970{ 1971 ASSERT(a && b); 1972 if (a == b) 1973 return true; 1974 1975 unsigned length = a->length(); 1976 if (length != b->length()) 1977 return false; 1978 1979 if (a->is8Bit()) { 1980 if (b->is8Bit()) 1981 return equalIgnoringCase(a->characters8(), b->characters8(), length); 1982 1983 return equalIgnoringCase(b->characters16(), a->characters8(), length); 1984 } 1985 1986 if (b->is8Bit()) 1987 return equalIgnoringCase(a->characters16(), b->characters8(), length); 1988 1989 return equalIgnoringCase(a->characters16(), b->characters16(), length); 1990} 1991 1992bool equalIgnoringNullity(StringImpl* a, StringImpl* b) 1993{ 1994 if (!a && b && !b->length()) 1995 return true; 1996 if (!b && a && !a->length()) 1997 return true; 1998 return equal(a, b); 1999} 2000 2001UCharDirection StringImpl::defaultWritingDirection(bool* hasStrongDirectionality) 2002{ 2003 for (unsigned i = 0; i < m_length; ++i) { 2004 UCharDirection charDirection = u_charDirection(is8Bit() ? m_data8[i] : m_data16[i]); 2005 if (charDirection == U_LEFT_TO_RIGHT) { 2006 if (hasStrongDirectionality) 2007 *hasStrongDirectionality = true; 2008 return U_LEFT_TO_RIGHT; 2009 } 2010 if (charDirection == U_RIGHT_TO_LEFT || charDirection == U_RIGHT_TO_LEFT_ARABIC) { 2011 if (hasStrongDirectionality) 2012 *hasStrongDirectionality = true; 2013 return U_RIGHT_TO_LEFT; 2014 } 2015 } 2016 if (hasStrongDirectionality) 2017 *hasStrongDirectionality = false; 2018 return U_LEFT_TO_RIGHT; 2019} 2020 2021PassRef<StringImpl> StringImpl::adopt(StringBuffer<LChar>& buffer) 2022{ 2023 unsigned length = buffer.length(); 2024 if (!length) 2025 return *empty(); 2026 return adoptRef(*new StringImpl(buffer.release(), length)); 2027} 2028 2029PassRef<StringImpl> StringImpl::adopt(StringBuffer<UChar>& buffer) 2030{ 2031 unsigned length = buffer.length(); 2032 if (!length) 2033 return *empty(); 2034 return adoptRef(*new StringImpl(buffer.release(), length)); 2035} 2036 2037size_t StringImpl::sizeInBytes() const 2038{ 2039 // FIXME: support substrings 2040 size_t size = length(); 2041 if (!is8Bit()) 2042 size *= 2; 2043 return size + sizeof(*this); 2044} 2045 2046// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 2047static inline void putUTF8Triple(char*& buffer, UChar ch) 2048{ 2049 ASSERT(ch >= 0x0800); 2050 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 2051 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 2052 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 2053} 2054 2055bool StringImpl::utf8Impl(const UChar* characters, unsigned length, char*& buffer, size_t bufferSize, ConversionMode mode) 2056{ 2057 if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) { 2058 const UChar* charactersEnd = characters + length; 2059 char* bufferEnd = buffer + bufferSize; 2060 while (characters < charactersEnd) { 2061 // Use strict conversion to detect unpaired surrogates. 2062 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true); 2063 ASSERT(result != targetExhausted); 2064 // Conversion fails when there is an unpaired surrogate. 2065 // Put replacement character (U+FFFD) instead of the unpaired surrogate. 2066 if (result != conversionOK) { 2067 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); 2068 // There should be room left, since one UChar hasn't been converted. 2069 ASSERT((buffer + 3) <= bufferEnd); 2070 putUTF8Triple(buffer, replacementCharacter); 2071 ++characters; 2072 } 2073 } 2074 } else { 2075 bool strict = mode == StrictConversion; 2076 const UChar* originalCharacters = characters; 2077 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferSize, strict); 2078 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 2079 2080 // Only produced from strict conversion. 2081 if (result == sourceIllegal) { 2082 ASSERT(strict); 2083 return false; 2084 } 2085 2086 // Check for an unconverted high surrogate. 2087 if (result == sourceExhausted) { 2088 if (strict) 2089 return false; 2090 // This should be one unpaired high surrogate. Treat it the same 2091 // was as an unpaired high surrogate would have been handled in 2092 // the middle of a string with non-strict conversion - which is 2093 // to say, simply encode it to UTF-8. 2094 ASSERT_UNUSED( 2095 originalCharacters, (characters + 1) == (originalCharacters + length)); 2096 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 2097 // There should be room left, since one UChar hasn't been converted. 2098 ASSERT((buffer + 3) <= (buffer + bufferSize)); 2099 putUTF8Triple(buffer, *characters); 2100 } 2101 } 2102 2103 return true; 2104} 2105 2106CString StringImpl::utf8ForCharacters(const UChar* characters, unsigned length, ConversionMode mode) 2107{ 2108 if (!length) 2109 return CString("", 0); 2110 if (length > std::numeric_limits<unsigned>::max() / 3) 2111 return CString(); 2112 Vector<char, 1024> bufferVector(length * 3); 2113 char* buffer = bufferVector.data(); 2114 if (!utf8Impl(characters, length, buffer, bufferVector.size(), mode)) 2115 return CString(); 2116 return CString(bufferVector.data(), buffer - bufferVector.data()); 2117} 2118 2119CString StringImpl::utf8ForRange(unsigned offset, unsigned length, ConversionMode mode) const 2120{ 2121 ASSERT(offset <= this->length()); 2122 ASSERT(offset + length <= this->length()); 2123 2124 if (!length) 2125 return CString("", 0); 2126 2127 // Allocate a buffer big enough to hold all the characters 2128 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 2129 // Optimization ideas, if we find this function is hot: 2130 // * We could speculatively create a CStringBuffer to contain 'length' 2131 // characters, and resize if necessary (i.e. if the buffer contains 2132 // non-ascii characters). (Alternatively, scan the buffer first for 2133 // ascii characters, so we know this will be sufficient). 2134 // * We could allocate a CStringBuffer with an appropriate size to 2135 // have a good chance of being able to write the string into the 2136 // buffer without reallocing (say, 1.5 x length). 2137 if (length > std::numeric_limits<unsigned>::max() / 3) 2138 return CString(); 2139 Vector<char, 1024> bufferVector(length * 3); 2140 2141 char* buffer = bufferVector.data(); 2142 2143 if (is8Bit()) { 2144 const LChar* characters = this->characters8() + offset; 2145 2146 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size()); 2147 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion 2148 } else { 2149 if (!utf8Impl(this->characters16() + offset, length, buffer, bufferVector.size(), mode)) 2150 return CString(); 2151 } 2152 2153 return CString(bufferVector.data(), buffer - bufferVector.data()); 2154} 2155 2156CString StringImpl::utf8(ConversionMode mode) const 2157{ 2158 return utf8ForRange(0, length(), mode); 2159} 2160 2161// Table is based on ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt 2162const UChar StringImpl::latin1CaseFoldTable[256] = { 2163 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, 2164 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 2165 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, 2166 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 2167 0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 2168 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 2169 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, 2170 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, 2171 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 2172 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, 2173 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 2174 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x03bc, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 2175 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 2176 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00d7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00df, 2177 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, 2178 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, 2179}; 2180 2181bool equalIgnoringNullity(const UChar* a, size_t aLength, StringImpl* b) 2182{ 2183 if (!b) 2184 return !aLength; 2185 if (aLength != b->length()) 2186 return false; 2187 if (b->is8Bit()) { 2188 const LChar* bCharacters = b->characters8(); 2189 for (unsigned i = 0; i < aLength; ++i) { 2190 if (a[i] != bCharacters[i]) 2191 return false; 2192 } 2193 return true; 2194 } 2195 return !memcmp(a, b->characters16(), b->length() * sizeof(UChar)); 2196} 2197 2198} // namespace WTF 2199