1/*
2 * Copyright (C) 1999 Lars Knoll (knoll@kde.org)
3 *           (C) 1999 Antti Koivisto (koivisto@kde.org)
4 *           (C) 2001 Dirk Mueller ( mueller@kde.org )
5 * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2013-2014 Apple Inc. All rights reserved.
6 * Copyright (C) 2006 Andrew Wellington (proton@wiretapped.net)
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB.  If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25#include "config.h"
26#include "StringImpl.h"
27
28#include "AtomicString.h"
29#include "StringBuffer.h"
30#include "StringHash.h"
31#include <wtf/ProcessID.h>
32#include <wtf/StdLibExtras.h>
33#include <wtf/WTFThreadData.h>
34#include <wtf/text/CString.h>
35#include <wtf/text/StringView.h>
36#include <wtf/unicode/CharacterNames.h>
37#include <wtf/unicode/UTF8.h>
38
39#ifdef STRING_STATS
40#include <unistd.h>
41#include <wtf/DataLog.h>
42#endif
43
44namespace WTF {
45
46using namespace Unicode;
47
48COMPILE_ASSERT(sizeof(StringImpl) == 2 * sizeof(int) + 2 * sizeof(void*), StringImpl_should_stay_small);
49
50#ifdef STRING_STATS
51StringStats StringImpl::m_stringStats;
52
53unsigned StringStats::s_stringRemovesTillPrintStats = StringStats::s_printStringStatsFrequency;
54
55void StringStats::removeString(StringImpl* string)
56{
57    unsigned length = string->length();
58    bool isSubString = string->isSubString();
59
60    --m_totalNumberStrings;
61
62    if (string->has16BitShadow()) {
63        --m_numberUpconvertedStrings;
64        if (!isSubString)
65            m_totalUpconvertedData -= length;
66    }
67
68    if (string->is8Bit()) {
69        --m_number8BitStrings;
70        if (!isSubString)
71            m_total8BitData -= length;
72    } else {
73        --m_number16BitStrings;
74        if (!isSubString)
75            m_total16BitData -= length;
76    }
77
78    if (!--s_stringRemovesTillPrintStats) {
79        s_stringRemovesTillPrintStats = s_printStringStatsFrequency;
80        printStats();
81    }
82}
83
84void StringStats::printStats()
85{
86    dataLogF("String stats for process id %d:\n", getCurrentProcessID());
87
88    unsigned long long totalNumberCharacters = m_total8BitData + m_total16BitData;
89    double percent8Bit = m_totalNumberStrings ? ((double)m_number8BitStrings * 100) / (double)m_totalNumberStrings : 0.0;
90    double average8bitLength = m_number8BitStrings ? (double)m_total8BitData / (double)m_number8BitStrings : 0.0;
91    dataLogF("%8u (%5.2f%%) 8 bit        %12llu chars  %12llu bytes  avg length %6.1f\n", m_number8BitStrings, percent8Bit, m_total8BitData, m_total8BitData, average8bitLength);
92
93    double percent16Bit = m_totalNumberStrings ? ((double)m_number16BitStrings * 100) / (double)m_totalNumberStrings : 0.0;
94    double average16bitLength = m_number16BitStrings ? (double)m_total16BitData / (double)m_number16BitStrings : 0.0;
95    dataLogF("%8u (%5.2f%%) 16 bit       %12llu chars  %12llu bytes  avg length %6.1f\n", m_number16BitStrings, percent16Bit, m_total16BitData, m_total16BitData * 2, average16bitLength);
96
97    double percentUpconverted = m_totalNumberStrings ? ((double)m_numberUpconvertedStrings * 100) / (double)m_number8BitStrings : 0.0;
98    double averageUpconvertedLength = m_numberUpconvertedStrings ? (double)m_totalUpconvertedData / (double)m_numberUpconvertedStrings : 0.0;
99    dataLogF("%8u (%5.2f%%) upconverted  %12llu chars  %12llu bytes  avg length %6.1f\n", m_numberUpconvertedStrings, percentUpconverted, m_totalUpconvertedData, m_totalUpconvertedData * 2, averageUpconvertedLength);
100
101    double averageLength = m_totalNumberStrings ? (double)totalNumberCharacters / (double)m_totalNumberStrings : 0.0;
102    unsigned long long totalDataBytes = m_total8BitData + (m_total16BitData + m_totalUpconvertedData) * 2;
103    dataLogF("%8u Total                 %12llu chars  %12llu bytes  avg length %6.1f\n", m_totalNumberStrings, totalNumberCharacters, totalDataBytes, averageLength);
104    unsigned long long totalSavedBytes = m_total8BitData - m_totalUpconvertedData;
105    double percentSavings = totalSavedBytes ? ((double)totalSavedBytes * 100) / (double)(totalDataBytes + totalSavedBytes) : 0.0;
106    dataLogF("         Total savings %12llu bytes (%5.2f%%)\n", totalSavedBytes, percentSavings);
107}
108#endif
109
110
111StringImpl::~StringImpl()
112{
113    ASSERT(!isStatic());
114
115    STRING_STATS_REMOVE_STRING(this);
116
117    if (isAtomic() && m_length)
118        AtomicString::remove(this);
119
120    BufferOwnership ownership = bufferOwnership();
121
122    if (ownership == BufferInternal)
123        return;
124    if (ownership == BufferOwned) {
125        // We use m_data8, but since it is a union with m_data16 this works either way.
126        ASSERT(m_data8);
127        fastFree(const_cast<LChar*>(m_data8));
128        return;
129    }
130
131    ASSERT(ownership == BufferSubstring);
132    ASSERT(substringBuffer());
133    substringBuffer()->deref();
134}
135
136void StringImpl::destroy(StringImpl* stringImpl)
137{
138    stringImpl->~StringImpl();
139    fastFree(stringImpl);
140}
141
142PassRef<StringImpl> StringImpl::createFromLiteral(const char* characters, unsigned length)
143{
144    ASSERT_WITH_MESSAGE(length, "Use StringImpl::empty() to create an empty string");
145    ASSERT(charactersAreAllASCII<LChar>(reinterpret_cast<const LChar*>(characters), length));
146    return adoptRef(*new StringImpl(reinterpret_cast<const LChar*>(characters), length, ConstructWithoutCopying));
147}
148
149PassRef<StringImpl> StringImpl::createFromLiteral(const char* characters)
150{
151    return createFromLiteral(characters, strlen(characters));
152}
153
154PassRef<StringImpl> StringImpl::createWithoutCopying(const UChar* characters, unsigned length)
155{
156    if (!length)
157        return *empty();
158
159    return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying));
160}
161
162PassRef<StringImpl> StringImpl::createWithoutCopying(const LChar* characters, unsigned length)
163{
164    if (!length)
165        return *empty();
166
167    return adoptRef(*new StringImpl(characters, length, ConstructWithoutCopying));
168}
169
170template <typename CharType>
171inline PassRef<StringImpl> StringImpl::createUninitializedInternal(unsigned length, CharType*& data)
172{
173    if (!length) {
174        data = 0;
175        return *empty();
176    }
177    return createUninitializedInternalNonEmpty(length, data);
178}
179
180template <typename CharType>
181inline PassRef<StringImpl> StringImpl::createUninitializedInternalNonEmpty(unsigned length, CharType*& data)
182{
183    ASSERT(length);
184
185    // Allocate a single buffer large enough to contain the StringImpl
186    // struct as well as the data which it contains. This removes one
187    // heap allocation from this call.
188    if (length > ((std::numeric_limits<unsigned>::max() - sizeof(StringImpl)) / sizeof(CharType)))
189        CRASH();
190    StringImpl* string = static_cast<StringImpl*>(fastMalloc(allocationSize<CharType>(length)));
191
192    data = string->tailPointer<CharType>();
193    return constructInternal<CharType>(string, length);
194}
195
196PassRef<StringImpl> StringImpl::createUninitialized(unsigned length, LChar*& data)
197{
198    return createUninitializedInternal(length, data);
199}
200
201PassRef<StringImpl> StringImpl::createUninitialized(unsigned length, UChar*& data)
202{
203    return createUninitializedInternal(length, data);
204}
205
206template <typename CharType>
207inline PassRef<StringImpl> StringImpl::reallocateInternal(PassRefPtr<StringImpl> originalString, unsigned length, CharType*& data)
208{
209    ASSERT(originalString->hasOneRef());
210    ASSERT(originalString->bufferOwnership() == BufferInternal);
211
212    if (!length) {
213        data = 0;
214        return *empty();
215    }
216
217    // Same as createUninitialized() except here we use fastRealloc.
218    if (length > ((std::numeric_limits<unsigned>::max() - sizeof(StringImpl)) / sizeof(CharType)))
219        CRASH();
220
221    originalString->~StringImpl();
222    StringImpl* string = static_cast<StringImpl*>(fastRealloc(originalString.leakRef(), allocationSize<CharType>(length)));
223
224    data = string->tailPointer<CharType>();
225    return constructInternal<CharType>(string, length);
226}
227
228PassRef<StringImpl> StringImpl::reallocate(PassRefPtr<StringImpl> originalString, unsigned length, LChar*& data)
229{
230    ASSERT(originalString->is8Bit());
231    return reallocateInternal(originalString, length, data);
232}
233
234PassRef<StringImpl> StringImpl::reallocate(PassRefPtr<StringImpl> originalString, unsigned length, UChar*& data)
235{
236    ASSERT(!originalString->is8Bit());
237    return reallocateInternal(originalString, length, data);
238}
239
240template <typename CharType>
241inline PassRef<StringImpl> StringImpl::createInternal(const CharType* characters, unsigned length)
242{
243    if (!characters || !length)
244        return *empty();
245
246    CharType* data;
247    auto string = createUninitializedInternalNonEmpty(length, data);
248    memcpy(data, characters, length * sizeof(CharType));
249    return string;
250}
251
252PassRef<StringImpl> StringImpl::create(const UChar* characters, unsigned length)
253{
254    return createInternal(characters, length);
255}
256
257PassRef<StringImpl> StringImpl::create(const LChar* characters, unsigned length)
258{
259    return createInternal(characters, length);
260}
261
262PassRef<StringImpl> StringImpl::create8BitIfPossible(const UChar* characters, unsigned length)
263{
264    if (!characters || !length)
265        return *empty();
266
267    LChar* data;
268    RefPtr<StringImpl> string = createUninitializedInternalNonEmpty(length, data);
269
270    for (size_t i = 0; i < length; ++i) {
271        if (characters[i] & 0xff00)
272            return create(characters, length);
273        data[i] = static_cast<LChar>(characters[i]);
274    }
275
276    return string.releaseNonNull();
277}
278
279PassRef<StringImpl> StringImpl::create8BitIfPossible(const UChar* string)
280{
281    return StringImpl::create8BitIfPossible(string, lengthOfNullTerminatedString(string));
282}
283
284PassRef<StringImpl> StringImpl::create(const LChar* string)
285{
286    if (!string)
287        return *empty();
288    size_t length = strlen(reinterpret_cast<const char*>(string));
289    if (length > std::numeric_limits<unsigned>::max())
290        CRASH();
291    return create(string, length);
292}
293
294bool StringImpl::containsOnlyWhitespace()
295{
296    // FIXME: The definition of whitespace here includes a number of characters
297    // that are not whitespace from the point of view of RenderText; I wonder if
298    // that's a problem in practice.
299    if (is8Bit()) {
300        for (unsigned i = 0; i < m_length; ++i) {
301            UChar c = m_data8[i];
302            if (!isASCIISpace(c))
303                return false;
304        }
305
306        return true;
307    }
308
309    for (unsigned i = 0; i < m_length; ++i) {
310        UChar c = m_data16[i];
311        if (!isASCIISpace(c))
312            return false;
313    }
314    return true;
315}
316
317PassRef<StringImpl> StringImpl::substring(unsigned start, unsigned length)
318{
319    if (start >= m_length)
320        return *empty();
321    unsigned maxLength = m_length - start;
322    if (length >= maxLength) {
323        if (!start)
324            return *this;
325        length = maxLength;
326    }
327    if (is8Bit())
328        return create(m_data8 + start, length);
329
330    return create(m_data16 + start, length);
331}
332
333UChar32 StringImpl::characterStartingAt(unsigned i)
334{
335    if (is8Bit())
336        return m_data8[i];
337    if (U16_IS_SINGLE(m_data16[i]))
338        return m_data16[i];
339    if (i + 1 < m_length && U16_IS_LEAD(m_data16[i]) && U16_IS_TRAIL(m_data16[i + 1]))
340        return U16_GET_SUPPLEMENTARY(m_data16[i], m_data16[i + 1]);
341    return 0;
342}
343
344PassRef<StringImpl> StringImpl::lower()
345{
346    // Note: This is a hot function in the Dromaeo benchmark, specifically the
347    // no-op code path up through the first 'return' statement.
348
349    // First scan the string for uppercase and non-ASCII characters:
350    if (is8Bit()) {
351        unsigned failingIndex;
352        for (unsigned i = 0; i < m_length; ++i) {
353            LChar character = m_data8[i];
354            if (UNLIKELY((character & ~0x7F) || isASCIIUpper(character))) {
355                failingIndex = i;
356                goto SlowPath;
357            }
358        }
359        return *this;
360
361SlowPath:
362        LChar* data8;
363        auto newImpl = createUninitializedInternalNonEmpty(m_length, data8);
364
365        for (unsigned i = 0; i < failingIndex; ++i)
366            data8[i] = m_data8[i];
367
368        for (unsigned i = failingIndex; i < m_length; ++i) {
369            LChar character = m_data8[i];
370            if (!(character & ~0x7F))
371                data8[i] = toASCIILower(character);
372            else {
373                ASSERT(u_tolower(character) <= 0xFF);
374                data8[i] = static_cast<LChar>(u_tolower(character));
375            }
376        }
377
378        return newImpl;
379    }
380    bool noUpper = true;
381    unsigned ored = 0;
382
383    for (unsigned i = 0; i < m_length; ++i) {
384        UChar character = m_data16[i];
385        if (UNLIKELY(isASCIIUpper(character)))
386            noUpper = false;
387        ored |= character;
388    }
389    // Nothing to do if the string is all ASCII with no uppercase.
390    if (noUpper && !(ored & ~0x7F))
391        return *this;
392
393    if (!(ored & ~0x7F)) {
394        UChar* data16;
395        auto newImpl = createUninitializedInternalNonEmpty(m_length, data16);
396
397        for (unsigned i = 0; i < m_length; ++i) {
398            UChar c = m_data16[i];
399            data16[i] = toASCIILower(c);
400        }
401        return newImpl;
402    }
403
404    if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max()))
405        CRASH();
406    int32_t length = m_length;
407
408    // Do a slower implementation for cases that include non-ASCII characters.
409    UChar* data16;
410    RefPtr<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data16);
411
412    UErrorCode status = U_ZERO_ERROR;
413    int32_t realLength = u_strToLower(data16, length, m_data16, m_length, "", &status);
414    if (U_SUCCESS(status) && realLength == length)
415        return newImpl.releaseNonNull();
416
417    newImpl = createUninitialized(realLength, data16);
418    status = U_ZERO_ERROR;
419    u_strToLower(data16, realLength, m_data16, m_length, "", &status);
420    if (U_FAILURE(status))
421        return *this;
422    return newImpl.releaseNonNull();
423}
424
425PassRef<StringImpl> StringImpl::upper()
426{
427    // This function could be optimized for no-op cases the way lower() is,
428    // but in empirical testing, few actual calls to upper() are no-ops, so
429    // it wouldn't be worth the extra time for pre-scanning.
430
431    if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max()))
432        CRASH();
433    int32_t length = m_length;
434
435    if (is8Bit()) {
436        LChar* data8;
437        RefPtr<StringImpl> newImpl = createUninitialized(m_length, data8);
438
439        // Do a faster loop for the case where all the characters are ASCII.
440        unsigned ored = 0;
441        for (int i = 0; i < length; ++i) {
442            LChar c = m_data8[i];
443            ored |= c;
444#if CPU(X86) && defined(_MSC_VER) && _MSC_VER >=1700
445            // Workaround for an MSVC 2012 x86 optimizer bug. Remove once the bug is fixed.
446            // See https://connect.microsoft.com/VisualStudio/feedback/details/780362/optimization-bug-of-range-comparison
447            // for more details.
448            data8[i] = c >= 'a' && c <= 'z' ? c & ~0x20 : c;
449#else
450            data8[i] = toASCIIUpper(c);
451#endif
452        }
453        if (!(ored & ~0x7F))
454            return newImpl.releaseNonNull();
455
456        // Do a slower implementation for cases that include non-ASCII Latin-1 characters.
457        int numberSharpSCharacters = 0;
458
459        // There are two special cases.
460        //  1. latin-1 characters when converted to upper case are 16 bit characters.
461        //  2. Lower case sharp-S converts to "SS" (two characters)
462        for (int32_t i = 0; i < length; ++i) {
463            LChar c = m_data8[i];
464            if (UNLIKELY(c == smallLetterSharpS))
465                ++numberSharpSCharacters;
466            ASSERT(u_toupper(c) <= 0xFFFF);
467            UChar upper = u_toupper(c);
468            if (UNLIKELY(upper > 0xff)) {
469                // Since this upper-cased character does not fit in an 8-bit string, we need to take the 16-bit path.
470                goto upconvert;
471            }
472            data8[i] = static_cast<LChar>(upper);
473        }
474
475        if (!numberSharpSCharacters)
476            return newImpl.releaseNonNull();
477
478        // We have numberSSCharacters sharp-s characters, but none of the other special characters.
479        newImpl = createUninitialized(m_length + numberSharpSCharacters, data8);
480
481        LChar* dest = data8;
482
483        for (int32_t i = 0; i < length; ++i) {
484            LChar c = m_data8[i];
485            if (c == smallLetterSharpS) {
486                *dest++ = 'S';
487                *dest++ = 'S';
488            } else {
489                ASSERT(u_toupper(c) <= 0xFF);
490                *dest++ = static_cast<LChar>(u_toupper(c));
491            }
492        }
493
494        return newImpl.releaseNonNull();
495    }
496
497upconvert:
498    auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
499    const UChar* source16 = upconvertedCharacters;
500
501    UChar* data16;
502    RefPtr<StringImpl> newImpl = createUninitialized(m_length, data16);
503
504    // Do a faster loop for the case where all the characters are ASCII.
505    unsigned ored = 0;
506    for (int i = 0; i < length; ++i) {
507        UChar c = source16[i];
508        ored |= c;
509        data16[i] = toASCIIUpper(c);
510    }
511    if (!(ored & ~0x7F))
512        return newImpl.releaseNonNull();
513
514    // Do a slower implementation for cases that include non-ASCII characters.
515    UErrorCode status = U_ZERO_ERROR;
516    int32_t realLength = u_strToUpper(data16, length, source16, m_length, "", &status);
517    if (U_SUCCESS(status) && realLength == length)
518        return newImpl.releaseNonNull();
519    newImpl = createUninitialized(realLength, data16);
520    status = U_ZERO_ERROR;
521    u_strToUpper(data16, realLength, source16, m_length, "", &status);
522    if (U_FAILURE(status))
523        return *this;
524    return newImpl.releaseNonNull();
525}
526
527static inline bool needsTurkishCasingRules(const AtomicString& localeIdentifier)
528{
529    // Either "tr" or "az" locale, with case sensitive comparison and allowing for an ignored subtag.
530    UChar first = localeIdentifier[0];
531    UChar second = localeIdentifier[1];
532    return ((isASCIIAlphaCaselessEqual(first, 't') && isASCIIAlphaCaselessEqual(second, 'r'))
533        || (isASCIIAlphaCaselessEqual(first, 'a') && isASCIIAlphaCaselessEqual(second, 'z')))
534        && (localeIdentifier.length() == 2 || localeIdentifier[2] == '-');
535}
536
537PassRef<StringImpl> StringImpl::lower(const AtomicString& localeIdentifier)
538{
539    // Use the more-optimized code path most of the time.
540    // Assuming here that the only locale-specific lowercasing is the Turkish casing rules.
541    // FIXME: Could possibly optimize further by looking for the specific sequences
542    // that have locale-specific lowercasing. There are only three of them.
543    if (!needsTurkishCasingRules(localeIdentifier))
544        return lower();
545
546    // FIXME: Could share more code with the main StringImpl::lower by factoring out
547    // this last part into a shared function that takes a locale string, since this is
548    // just like the end of that function.
549
550    if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max()))
551        CRASH();
552    int length = m_length;
553
554    // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than
555    // allocating memory just to turn localeIdentifier into a C string, and we assume
556    // there is no difference between the uppercasing for "tr" and "az" locales.
557    auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
558    const UChar* source16 = upconvertedCharacters;
559    UChar* data16;
560    RefPtr<StringImpl> newString = createUninitialized(length, data16);
561    UErrorCode status = U_ZERO_ERROR;
562    int realLength = u_strToLower(data16, length, source16, length, "tr", &status);
563    if (U_SUCCESS(status) && realLength == length)
564        return newString.releaseNonNull();
565    newString = createUninitialized(realLength, data16);
566    status = U_ZERO_ERROR;
567    u_strToLower(data16, realLength, source16, length, "tr", &status);
568    if (U_FAILURE(status))
569        return *this;
570    return newString.releaseNonNull();
571}
572
573PassRef<StringImpl> StringImpl::upper(const AtomicString& localeIdentifier)
574{
575    // Use the more-optimized code path most of the time.
576    // Assuming here that the only locale-specific lowercasing is the Turkish casing rules,
577    // and that the only affected character is lowercase "i".
578    if (!needsTurkishCasingRules(localeIdentifier) || find('i') == notFound)
579        return upper();
580
581    if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max()))
582        CRASH();
583    int length = m_length;
584
585    // Below, we pass in the hardcoded locale "tr". Passing that is more efficient than
586    // allocating memory just to turn localeIdentifier into a C string, and we assume
587    // there is no difference between the uppercasing for "tr" and "az" locales.
588    auto upconvertedCharacters = StringView(*this).upconvertedCharacters();
589    const UChar* source16 = upconvertedCharacters;
590    UChar* data16;
591    RefPtr<StringImpl> newString = createUninitialized(length, data16);
592    UErrorCode status = U_ZERO_ERROR;
593    int realLength = u_strToUpper(data16, length, source16, length, "tr", &status);
594    if (U_SUCCESS(status) && realLength == length)
595        return newString.releaseNonNull();
596    newString = createUninitialized(realLength, data16);
597    status = U_ZERO_ERROR;
598    u_strToUpper(data16, realLength, source16, length, "tr", &status);
599    if (U_FAILURE(status))
600        return *this;
601    return newString.releaseNonNull();
602}
603
604PassRef<StringImpl> StringImpl::fill(UChar character)
605{
606    if (!(character & ~0x7F)) {
607        LChar* data;
608        auto newImpl = createUninitialized(m_length, data);
609        for (unsigned i = 0; i < m_length; ++i)
610            data[i] = character;
611        return newImpl;
612    }
613    UChar* data;
614    auto newImpl = createUninitialized(m_length, data);
615    for (unsigned i = 0; i < m_length; ++i)
616        data[i] = character;
617    return newImpl;
618}
619
620PassRef<StringImpl> StringImpl::foldCase()
621{
622    // FIXME: Why doesn't this function have a preflight like the one in StringImpl::lower?
623
624    if (m_length > static_cast<unsigned>(std::numeric_limits<int32_t>::max()))
625        CRASH();
626    int32_t length = m_length;
627
628    if (is8Bit()) {
629        // Do a faster loop for the case where all the characters are ASCII.
630        LChar* data;
631        auto newImpl = createUninitialized(m_length, data);
632        LChar ored = 0;
633
634        for (int32_t i = 0; i < length; ++i) {
635            LChar c = m_data8[i];
636            data[i] = toASCIILower(c);
637            ored |= c;
638        }
639
640        if (!(ored & ~0x7F))
641            return newImpl;
642
643        // Do a slower implementation for cases that include non-ASCII Latin-1 characters.
644        // FIXME: Shouldn't this use u_foldCase instead of u_tolower?
645        for (int32_t i = 0; i < length; ++i) {
646            ASSERT(u_tolower(m_data8[i]) <= 0xFF);
647            data[i] = static_cast<LChar>(u_tolower(m_data8[i]));
648        }
649
650        return newImpl;
651    }
652
653    // Do a faster loop for the case where all the characters are ASCII.
654    UChar* data;
655    RefPtr<StringImpl> newImpl = createUninitialized(m_length, data);
656    UChar ored = 0;
657    for (int32_t i = 0; i < length; ++i) {
658        UChar c = m_data16[i];
659        ored |= c;
660        data[i] = toASCIILower(c);
661    }
662    if (!(ored & ~0x7F))
663        return newImpl.releaseNonNull();
664
665    // Do a slower implementation for cases that include non-ASCII characters.
666    UErrorCode status = U_ZERO_ERROR;
667    int32_t realLength = u_strFoldCase(data, length, m_data16, m_length, U_FOLD_CASE_DEFAULT, &status);
668    if (U_SUCCESS(status) && realLength == length)
669        return newImpl.releaseNonNull();
670    newImpl = createUninitialized(realLength, data);
671    status = U_ZERO_ERROR;
672    u_strFoldCase(data, realLength, m_data16, m_length, U_FOLD_CASE_DEFAULT, &status);
673    if (U_FAILURE(status))
674        return *this;
675    return newImpl.releaseNonNull();
676}
677
678PassRef<StringImpl> StringImpl::convertToASCIILowercase()
679{
680    if (is8Bit()) {
681        unsigned failingIndex;
682        for (unsigned i = 0; i < m_length; ++i) {
683            LChar character = m_data8[i];
684            if (UNLIKELY(isASCIIUpper(character))) {
685                failingIndex = i;
686                goto SlowPath;
687            }
688        }
689        return *this;
690
691SlowPath:
692        LChar* data8;
693        PassRef<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data8);
694        for (unsigned i = 0; i < failingIndex; ++i)
695            data8[i] = m_data8[i];
696        for (unsigned i = failingIndex; i < m_length; ++i)
697            data8[i] = toASCIILower(m_data8[i]);
698        return newImpl;
699    }
700
701    bool noUpper = true;
702    for (unsigned i = 0; i < m_length; ++i) {
703        if (UNLIKELY(isASCIIUpper(m_data16[i])))
704            noUpper = false;
705    }
706    if (noUpper)
707        return *this;
708
709    UChar* data16;
710    PassRef<StringImpl> newImpl = createUninitializedInternalNonEmpty(m_length, data16);
711    for (unsigned i = 0; i < m_length; ++i)
712        data16[i] = toASCIILower(m_data16[i]);
713    return newImpl;
714}
715
716template <class UCharPredicate>
717inline PassRef<StringImpl> StringImpl::stripMatchedCharacters(UCharPredicate predicate)
718{
719    if (!m_length)
720        return *this;
721
722    unsigned start = 0;
723    unsigned end = m_length - 1;
724
725    // skip white space from start
726    while (start <= end && predicate(is8Bit() ? m_data8[start] : m_data16[start]))
727        ++start;
728
729    // only white space
730    if (start > end)
731        return *empty();
732
733    // skip white space from end
734    while (end && predicate(is8Bit() ? m_data8[end] : m_data16[end]))
735        --end;
736
737    if (!start && end == m_length - 1)
738        return *this;
739    if (is8Bit())
740        return create(m_data8 + start, end + 1 - start);
741    return create(m_data16 + start, end + 1 - start);
742}
743
744class UCharPredicate {
745public:
746    inline UCharPredicate(CharacterMatchFunctionPtr function): m_function(function) { }
747
748    inline bool operator()(UChar ch) const
749    {
750        return m_function(ch);
751    }
752
753private:
754    const CharacterMatchFunctionPtr m_function;
755};
756
757class SpaceOrNewlinePredicate {
758public:
759    inline bool operator()(UChar ch) const
760    {
761        return isSpaceOrNewline(ch);
762    }
763};
764
765PassRef<StringImpl> StringImpl::stripWhiteSpace()
766{
767    return stripMatchedCharacters(SpaceOrNewlinePredicate());
768}
769
770PassRef<StringImpl> StringImpl::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace)
771{
772    return stripMatchedCharacters(UCharPredicate(isWhiteSpace));
773}
774
775template <typename CharType>
776ALWAYS_INLINE PassRef<StringImpl> StringImpl::removeCharacters(const CharType* characters, CharacterMatchFunctionPtr findMatch)
777{
778    const CharType* from = characters;
779    const CharType* fromend = from + m_length;
780
781    // Assume the common case will not remove any characters
782    while (from != fromend && !findMatch(*from))
783        ++from;
784    if (from == fromend)
785        return *this;
786
787    StringBuffer<CharType> data(m_length);
788    CharType* to = data.characters();
789    unsigned outc = from - characters;
790
791    if (outc)
792        memcpy(to, characters, outc * sizeof(CharType));
793
794    while (true) {
795        while (from != fromend && findMatch(*from))
796            ++from;
797        while (from != fromend && !findMatch(*from))
798            to[outc++] = *from++;
799        if (from == fromend)
800            break;
801    }
802
803    data.shrink(outc);
804
805    return adopt(data);
806}
807
808PassRef<StringImpl> StringImpl::removeCharacters(CharacterMatchFunctionPtr findMatch)
809{
810    if (is8Bit())
811        return removeCharacters(characters8(), findMatch);
812    return removeCharacters(characters16(), findMatch);
813}
814
815template <typename CharType, class UCharPredicate>
816inline PassRef<StringImpl> StringImpl::simplifyMatchedCharactersToSpace(UCharPredicate predicate)
817{
818    StringBuffer<CharType> data(m_length);
819
820    const CharType* from = characters<CharType>();
821    const CharType* fromend = from + m_length;
822    int outc = 0;
823    bool changedToSpace = false;
824
825    CharType* to = data.characters();
826
827    while (true) {
828        while (from != fromend && predicate(*from)) {
829            if (*from != ' ')
830                changedToSpace = true;
831            ++from;
832        }
833        while (from != fromend && !predicate(*from))
834            to[outc++] = *from++;
835        if (from != fromend)
836            to[outc++] = ' ';
837        else
838            break;
839    }
840
841    if (outc > 0 && to[outc - 1] == ' ')
842        --outc;
843
844    if (static_cast<unsigned>(outc) == m_length && !changedToSpace)
845        return *this;
846
847    data.shrink(outc);
848
849    return adopt(data);
850}
851
852PassRef<StringImpl> StringImpl::simplifyWhiteSpace()
853{
854    if (is8Bit())
855        return StringImpl::simplifyMatchedCharactersToSpace<LChar>(SpaceOrNewlinePredicate());
856    return StringImpl::simplifyMatchedCharactersToSpace<UChar>(SpaceOrNewlinePredicate());
857}
858
859PassRef<StringImpl> StringImpl::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace)
860{
861    if (is8Bit())
862        return StringImpl::simplifyMatchedCharactersToSpace<LChar>(UCharPredicate(isWhiteSpace));
863    return StringImpl::simplifyMatchedCharactersToSpace<UChar>(UCharPredicate(isWhiteSpace));
864}
865
866int StringImpl::toIntStrict(bool* ok, int base)
867{
868    if (is8Bit())
869        return charactersToIntStrict(characters8(), m_length, ok, base);
870    return charactersToIntStrict(characters16(), m_length, ok, base);
871}
872
873unsigned StringImpl::toUIntStrict(bool* ok, int base)
874{
875    if (is8Bit())
876        return charactersToUIntStrict(characters8(), m_length, ok, base);
877    return charactersToUIntStrict(characters16(), m_length, ok, base);
878}
879
880int64_t StringImpl::toInt64Strict(bool* ok, int base)
881{
882    if (is8Bit())
883        return charactersToInt64Strict(characters8(), m_length, ok, base);
884    return charactersToInt64Strict(characters16(), m_length, ok, base);
885}
886
887uint64_t StringImpl::toUInt64Strict(bool* ok, int base)
888{
889    if (is8Bit())
890        return charactersToUInt64Strict(characters8(), m_length, ok, base);
891    return charactersToUInt64Strict(characters16(), m_length, ok, base);
892}
893
894intptr_t StringImpl::toIntPtrStrict(bool* ok, int base)
895{
896    if (is8Bit())
897        return charactersToIntPtrStrict(characters8(), m_length, ok, base);
898    return charactersToIntPtrStrict(characters16(), m_length, ok, base);
899}
900
901int StringImpl::toInt(bool* ok)
902{
903    if (is8Bit())
904        return charactersToInt(characters8(), m_length, ok);
905    return charactersToInt(characters16(), m_length, ok);
906}
907
908unsigned StringImpl::toUInt(bool* ok)
909{
910    if (is8Bit())
911        return charactersToUInt(characters8(), m_length, ok);
912    return charactersToUInt(characters16(), m_length, ok);
913}
914
915int64_t StringImpl::toInt64(bool* ok)
916{
917    if (is8Bit())
918        return charactersToInt64(characters8(), m_length, ok);
919    return charactersToInt64(characters16(), m_length, ok);
920}
921
922uint64_t StringImpl::toUInt64(bool* ok)
923{
924    if (is8Bit())
925        return charactersToUInt64(characters8(), m_length, ok);
926    return charactersToUInt64(characters16(), m_length, ok);
927}
928
929intptr_t StringImpl::toIntPtr(bool* ok)
930{
931    if (is8Bit())
932        return charactersToIntPtr(characters8(), m_length, ok);
933    return charactersToIntPtr(characters16(), m_length, ok);
934}
935
936double StringImpl::toDouble(bool* ok)
937{
938    if (is8Bit())
939        return charactersToDouble(characters8(), m_length, ok);
940    return charactersToDouble(characters16(), m_length, ok);
941}
942
943float StringImpl::toFloat(bool* ok)
944{
945    if (is8Bit())
946        return charactersToFloat(characters8(), m_length, ok);
947    return charactersToFloat(characters16(), m_length, ok);
948}
949
950bool equalIgnoringCase(const LChar* a, const LChar* b, unsigned length)
951{
952    while (length--) {
953        if (StringImpl::latin1CaseFoldTable[*a++] != StringImpl::latin1CaseFoldTable[*b++])
954            return false;
955    }
956    return true;
957}
958
959bool equalIgnoringCase(const UChar* a, const LChar* b, unsigned length)
960{
961    while (length--) {
962        if (u_foldCase(*a++, U_FOLD_CASE_DEFAULT) != StringImpl::latin1CaseFoldTable[*b++])
963            return false;
964    }
965    return true;
966}
967
968size_t StringImpl::find(CharacterMatchFunctionPtr matchFunction, unsigned start)
969{
970    if (is8Bit())
971        return WTF::find(characters8(), m_length, matchFunction, start);
972    return WTF::find(characters16(), m_length, matchFunction, start);
973}
974
975size_t StringImpl::find(const LChar* matchString, unsigned index)
976{
977    // Check for null or empty string to match against
978    if (!matchString)
979        return notFound;
980    size_t matchStringLength = strlen(reinterpret_cast<const char*>(matchString));
981    if (matchStringLength > std::numeric_limits<unsigned>::max())
982        CRASH();
983    unsigned matchLength = matchStringLength;
984    if (!matchLength)
985        return std::min(index, length());
986
987    // Optimization 1: fast case for strings of length 1.
988    if (matchLength == 1) {
989        if (is8Bit())
990            return WTF::find(characters8(), length(), matchString[0], index);
991        return WTF::find(characters16(), length(), *matchString, index);
992    }
993
994    // Check index & matchLength are in range.
995    if (index > length())
996        return notFound;
997    unsigned searchLength = length() - index;
998    if (matchLength > searchLength)
999        return notFound;
1000    // delta is the number of additional times to test; delta == 0 means test only once.
1001    unsigned delta = searchLength - matchLength;
1002
1003    // Optimization 2: keep a running hash of the strings,
1004    // only call equal if the hashes match.
1005
1006    if (is8Bit()) {
1007        const LChar* searchCharacters = characters8() + index;
1008
1009        unsigned searchHash = 0;
1010        unsigned matchHash = 0;
1011        for (unsigned i = 0; i < matchLength; ++i) {
1012            searchHash += searchCharacters[i];
1013            matchHash += matchString[i];
1014        }
1015
1016        unsigned i = 0;
1017        while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) {
1018            if (i == delta)
1019                return notFound;
1020            searchHash += searchCharacters[i + matchLength];
1021            searchHash -= searchCharacters[i];
1022            ++i;
1023        }
1024        return index + i;
1025    }
1026
1027    const UChar* searchCharacters = characters16() + index;
1028
1029    unsigned searchHash = 0;
1030    unsigned matchHash = 0;
1031    for (unsigned i = 0; i < matchLength; ++i) {
1032        searchHash += searchCharacters[i];
1033        matchHash += matchString[i];
1034    }
1035
1036    unsigned i = 0;
1037    while (searchHash != matchHash || !equal(searchCharacters + i, matchString, matchLength)) {
1038        if (i == delta)
1039            return notFound;
1040        searchHash += searchCharacters[i + matchLength];
1041        searchHash -= searchCharacters[i];
1042        ++i;
1043    }
1044    return index + i;
1045}
1046
1047size_t StringImpl::findIgnoringCase(const LChar* matchString, unsigned index)
1048{
1049    // Check for null or empty string to match against
1050    if (!matchString)
1051        return notFound;
1052    size_t matchStringLength = strlen(reinterpret_cast<const char*>(matchString));
1053    if (matchStringLength > std::numeric_limits<unsigned>::max())
1054        CRASH();
1055    unsigned matchLength = matchStringLength;
1056    if (!matchLength)
1057        return std::min(index, length());
1058
1059    // Check index & matchLength are in range.
1060    if (index > length())
1061        return notFound;
1062    unsigned searchLength = length() - index;
1063    if (matchLength > searchLength)
1064        return notFound;
1065    // delta is the number of additional times to test; delta == 0 means test only once.
1066    unsigned delta = searchLength - matchLength;
1067
1068    if (is8Bit()) {
1069        const LChar* searchCharacters = characters8() + index;
1070
1071        unsigned i = 0;
1072        while (!equalIgnoringCase(searchCharacters + i, matchString, matchLength)) {
1073            if (i == delta)
1074                return notFound;
1075            ++i;
1076        }
1077        return index + i;
1078    }
1079
1080    const UChar* searchCharacters = characters16() + index;
1081
1082    unsigned i = 0;
1083    while (!equalIgnoringCase(searchCharacters + i, matchString, matchLength)) {
1084        if (i == delta)
1085            return notFound;
1086        ++i;
1087    }
1088    return index + i;
1089}
1090
1091template <typename SearchCharacterType, typename MatchCharacterType>
1092ALWAYS_INLINE static size_t findInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned searchLength, unsigned matchLength)
1093{
1094    // Optimization: keep a running hash of the strings,
1095    // only call equal() if the hashes match.
1096
1097    // delta is the number of additional times to test; delta == 0 means test only once.
1098    unsigned delta = searchLength - matchLength;
1099
1100    unsigned searchHash = 0;
1101    unsigned matchHash = 0;
1102
1103    for (unsigned i = 0; i < matchLength; ++i) {
1104        searchHash += searchCharacters[i];
1105        matchHash += matchCharacters[i];
1106    }
1107
1108    unsigned i = 0;
1109    // keep looping until we match
1110    while (searchHash != matchHash || !equal(searchCharacters + i, matchCharacters, matchLength)) {
1111        if (i == delta)
1112            return notFound;
1113        searchHash += searchCharacters[i + matchLength];
1114        searchHash -= searchCharacters[i];
1115        ++i;
1116    }
1117    return index + i;
1118}
1119
1120size_t StringImpl::find(StringImpl* matchString)
1121{
1122    // Check for null string to match against
1123    if (UNLIKELY(!matchString))
1124        return notFound;
1125    unsigned matchLength = matchString->length();
1126
1127    // Optimization 1: fast case for strings of length 1.
1128    if (matchLength == 1) {
1129        if (is8Bit()) {
1130            if (matchString->is8Bit())
1131                return WTF::find(characters8(), length(), matchString->characters8()[0]);
1132            return WTF::find(characters8(), length(), matchString->characters16()[0]);
1133        }
1134        if (matchString->is8Bit())
1135            return WTF::find(characters16(), length(), matchString->characters8()[0]);
1136        return WTF::find(characters16(), length(), matchString->characters16()[0]);
1137    }
1138
1139    // Check matchLength is in range.
1140    if (matchLength > length())
1141        return notFound;
1142
1143    // Check for empty string to match against
1144    if (UNLIKELY(!matchLength))
1145        return 0;
1146
1147    if (is8Bit()) {
1148        if (matchString->is8Bit())
1149            return findInner(characters8(), matchString->characters8(), 0, length(), matchLength);
1150        return findInner(characters8(), matchString->characters16(), 0, length(), matchLength);
1151    }
1152
1153    if (matchString->is8Bit())
1154        return findInner(characters16(), matchString->characters8(), 0, length(), matchLength);
1155
1156    return findInner(characters16(), matchString->characters16(), 0, length(), matchLength);
1157}
1158
1159size_t StringImpl::find(StringImpl* matchString, unsigned index)
1160{
1161    // Check for null or empty string to match against
1162    if (UNLIKELY(!matchString))
1163        return notFound;
1164
1165    unsigned matchLength = matchString->length();
1166
1167    // Optimization 1: fast case for strings of length 1.
1168    if (matchLength == 1) {
1169        if (is8Bit())
1170            return WTF::find(characters8(), length(), (*matchString)[0], index);
1171        return WTF::find(characters16(), length(), (*matchString)[0], index);
1172    }
1173
1174    if (UNLIKELY(!matchLength))
1175        return std::min(index, length());
1176
1177    // Check index & matchLength are in range.
1178    if (index > length())
1179        return notFound;
1180    unsigned searchLength = length() - index;
1181    if (matchLength > searchLength)
1182        return notFound;
1183
1184    if (is8Bit()) {
1185        if (matchString->is8Bit())
1186            return findInner(characters8() + index, matchString->characters8(), index, searchLength, matchLength);
1187        return findInner(characters8() + index, matchString->characters16(), index, searchLength, matchLength);
1188    }
1189
1190    if (matchString->is8Bit())
1191        return findInner(characters16() + index, matchString->characters8(), index, searchLength, matchLength);
1192
1193    return findInner(characters16() + index, matchString->characters16(), index, searchLength, matchLength);
1194}
1195
1196template <typename SearchCharacterType, typename MatchCharacterType>
1197ALWAYS_INLINE static size_t findIgnoringCaseInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned searchLength, unsigned matchLength)
1198{
1199    // delta is the number of additional times to test; delta == 0 means test only once.
1200    unsigned delta = searchLength - matchLength;
1201
1202    unsigned i = 0;
1203    // keep looping until we match
1204    while (!equalIgnoringCase(searchCharacters + i, matchCharacters, matchLength)) {
1205        if (i == delta)
1206            return notFound;
1207        ++i;
1208    }
1209    return index + i;
1210}
1211
1212size_t StringImpl::findIgnoringCase(StringImpl* matchString, unsigned index)
1213{
1214    // Check for null or empty string to match against
1215    if (!matchString)
1216        return notFound;
1217    unsigned matchLength = matchString->length();
1218    if (!matchLength)
1219        return std::min(index, length());
1220
1221    // Check index & matchLength are in range.
1222    if (index > length())
1223        return notFound;
1224    unsigned searchLength = length() - index;
1225    if (matchLength > searchLength)
1226        return notFound;
1227
1228    if (is8Bit()) {
1229        if (matchString->is8Bit())
1230            return findIgnoringCaseInner(characters8() + index, matchString->characters8(), index, searchLength, matchLength);
1231        return findIgnoringCaseInner(characters8() + index, matchString->characters16(), index, searchLength, matchLength);
1232    }
1233
1234    if (matchString->is8Bit())
1235        return findIgnoringCaseInner(characters16() + index, matchString->characters8(), index, searchLength, matchLength);
1236
1237    return findIgnoringCaseInner(characters16() + index, matchString->characters16(), index, searchLength, matchLength);
1238}
1239
1240size_t StringImpl::findNextLineStart(unsigned index)
1241{
1242    if (is8Bit())
1243        return WTF::findNextLineStart(characters8(), m_length, index);
1244    return WTF::findNextLineStart(characters16(), m_length, index);
1245}
1246
1247size_t StringImpl::reverseFind(UChar c, unsigned index)
1248{
1249    if (is8Bit())
1250        return WTF::reverseFind(characters8(), m_length, c, index);
1251    return WTF::reverseFind(characters16(), m_length, c, index);
1252}
1253
1254template <typename SearchCharacterType, typename MatchCharacterType>
1255ALWAYS_INLINE static size_t reverseFindInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned length, unsigned matchLength)
1256{
1257    // Optimization: keep a running hash of the strings,
1258    // only call equal if the hashes match.
1259
1260    // delta is the number of additional times to test; delta == 0 means test only once.
1261    unsigned delta = std::min(index, length - matchLength);
1262
1263    unsigned searchHash = 0;
1264    unsigned matchHash = 0;
1265    for (unsigned i = 0; i < matchLength; ++i) {
1266        searchHash += searchCharacters[delta + i];
1267        matchHash += matchCharacters[i];
1268    }
1269
1270    // keep looping until we match
1271    while (searchHash != matchHash || !equal(searchCharacters + delta, matchCharacters, matchLength)) {
1272        if (!delta)
1273            return notFound;
1274        --delta;
1275        searchHash -= searchCharacters[delta + matchLength];
1276        searchHash += searchCharacters[delta];
1277    }
1278    return delta;
1279}
1280
1281size_t StringImpl::reverseFind(StringImpl* matchString, unsigned index)
1282{
1283    // Check for null or empty string to match against
1284    if (!matchString)
1285        return notFound;
1286    unsigned matchLength = matchString->length();
1287    unsigned ourLength = length();
1288    if (!matchLength)
1289        return std::min(index, ourLength);
1290
1291    // Optimization 1: fast case for strings of length 1.
1292    if (matchLength == 1) {
1293        if (is8Bit())
1294            return WTF::reverseFind(characters8(), ourLength, (*matchString)[0], index);
1295        return WTF::reverseFind(characters16(), ourLength, (*matchString)[0], index);
1296    }
1297
1298    // Check index & matchLength are in range.
1299    if (matchLength > ourLength)
1300        return notFound;
1301
1302    if (is8Bit()) {
1303        if (matchString->is8Bit())
1304            return reverseFindInner(characters8(), matchString->characters8(), index, ourLength, matchLength);
1305        return reverseFindInner(characters8(), matchString->characters16(), index, ourLength, matchLength);
1306    }
1307
1308    if (matchString->is8Bit())
1309        return reverseFindInner(characters16(), matchString->characters8(), index, ourLength, matchLength);
1310
1311    return reverseFindInner(characters16(), matchString->characters16(), index, ourLength, matchLength);
1312}
1313
1314template <typename SearchCharacterType, typename MatchCharacterType>
1315ALWAYS_INLINE static size_t reverseFindIgnoringCaseInner(const SearchCharacterType* searchCharacters, const MatchCharacterType* matchCharacters, unsigned index, unsigned length, unsigned matchLength)
1316{
1317    // delta is the number of additional times to test; delta == 0 means test only once.
1318    unsigned delta = std::min(index, length - matchLength);
1319
1320    // keep looping until we match
1321    while (!equalIgnoringCase(searchCharacters + delta, matchCharacters, matchLength)) {
1322        if (!delta)
1323            return notFound;
1324        --delta;
1325    }
1326    return delta;
1327}
1328
1329size_t StringImpl::reverseFindIgnoringCase(StringImpl* matchString, unsigned index)
1330{
1331    // Check for null or empty string to match against
1332    if (!matchString)
1333        return notFound;
1334    unsigned matchLength = matchString->length();
1335    unsigned ourLength = length();
1336    if (!matchLength)
1337        return std::min(index, ourLength);
1338
1339    // Check index & matchLength are in range.
1340    if (matchLength > ourLength)
1341        return notFound;
1342
1343    if (is8Bit()) {
1344        if (matchString->is8Bit())
1345            return reverseFindIgnoringCaseInner(characters8(), matchString->characters8(), index, ourLength, matchLength);
1346        return reverseFindIgnoringCaseInner(characters8(), matchString->characters16(), index, ourLength, matchLength);
1347    }
1348
1349    if (matchString->is8Bit())
1350        return reverseFindIgnoringCaseInner(characters16(), matchString->characters8(), index, ourLength, matchLength);
1351
1352    return reverseFindIgnoringCaseInner(characters16(), matchString->characters16(), index, ourLength, matchLength);
1353}
1354
1355ALWAYS_INLINE static bool equalInner(const StringImpl* stringImpl, unsigned startOffset, const char* matchString, unsigned matchLength, bool caseSensitive)
1356{
1357    ASSERT(stringImpl);
1358    ASSERT(matchLength <= stringImpl->length());
1359    ASSERT(startOffset + matchLength <= stringImpl->length());
1360
1361    if (caseSensitive) {
1362        if (stringImpl->is8Bit())
1363            return equal(stringImpl->characters8() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1364        return equal(stringImpl->characters16() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1365    }
1366    if (stringImpl->is8Bit())
1367        return equalIgnoringCase(stringImpl->characters8() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1368    return equalIgnoringCase(stringImpl->characters16() + startOffset, reinterpret_cast<const LChar*>(matchString), matchLength);
1369}
1370
1371bool StringImpl::startsWith(const StringImpl* str) const
1372{
1373    if (!str)
1374        return false;
1375
1376    if (str->length() > length())
1377        return false;
1378
1379    if (is8Bit()) {
1380        if (str->is8Bit())
1381            return equal(characters8(), str->characters8(), str->length());
1382        return equal(characters8(), str->characters16(), str->length());
1383    }
1384    if (str->is8Bit())
1385        return equal(characters16(), str->characters8(), str->length());
1386    return equal(characters16(), str->characters16(), str->length());
1387}
1388
1389bool StringImpl::startsWith(UChar character) const
1390{
1391    return m_length && (*this)[0] == character;
1392}
1393
1394bool StringImpl::startsWith(const char* matchString, unsigned matchLength, bool caseSensitive) const
1395{
1396    ASSERT(matchLength);
1397    if (matchLength > length())
1398        return false;
1399    return equalInner(this, 0, matchString, matchLength, caseSensitive);
1400}
1401
1402bool StringImpl::endsWith(StringImpl* matchString, bool caseSensitive)
1403{
1404    ASSERT(matchString);
1405    if (m_length >= matchString->m_length) {
1406        unsigned start = m_length - matchString->m_length;
1407        return (caseSensitive ? find(matchString, start) : findIgnoringCase(matchString, start)) == start;
1408    }
1409    return false;
1410}
1411
1412bool StringImpl::endsWith(UChar character) const
1413{
1414    return m_length && (*this)[m_length - 1] == character;
1415}
1416
1417bool StringImpl::endsWith(const char* matchString, unsigned matchLength, bool caseSensitive) const
1418{
1419    ASSERT(matchLength);
1420    if (matchLength > length())
1421        return false;
1422    unsigned startOffset = length() - matchLength;
1423    return equalInner(this, startOffset, matchString, matchLength, caseSensitive);
1424}
1425
1426PassRef<StringImpl> StringImpl::replace(UChar oldC, UChar newC)
1427{
1428    if (oldC == newC)
1429        return *this;
1430    unsigned i;
1431    for (i = 0; i != m_length; ++i) {
1432        UChar c = is8Bit() ? m_data8[i] : m_data16[i];
1433        if (c == oldC)
1434            break;
1435    }
1436    if (i == m_length)
1437        return *this;
1438
1439    if (is8Bit()) {
1440        if (oldC > 0xff)
1441            // Looking for a 16 bit char in an 8 bit string, we're done.
1442            return *this;
1443
1444        if (newC <= 0xff) {
1445            LChar* data;
1446            LChar oldChar = static_cast<LChar>(oldC);
1447            LChar newChar = static_cast<LChar>(newC);
1448
1449            auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1450
1451            for (i = 0; i != m_length; ++i) {
1452                LChar ch = m_data8[i];
1453                if (ch == oldChar)
1454                    ch = newChar;
1455                data[i] = ch;
1456            }
1457            return newImpl;
1458        }
1459
1460        // There is the possibility we need to up convert from 8 to 16 bit,
1461        // create a 16 bit string for the result.
1462        UChar* data;
1463        auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1464
1465        for (i = 0; i != m_length; ++i) {
1466            UChar ch = m_data8[i];
1467            if (ch == oldC)
1468                ch = newC;
1469            data[i] = ch;
1470        }
1471
1472        return newImpl;
1473    }
1474
1475    UChar* data;
1476    auto newImpl = createUninitializedInternalNonEmpty(m_length, data);
1477
1478    for (i = 0; i != m_length; ++i) {
1479        UChar ch = m_data16[i];
1480        if (ch == oldC)
1481            ch = newC;
1482        data[i] = ch;
1483    }
1484    return newImpl;
1485}
1486
1487PassRef<StringImpl> StringImpl::replace(unsigned position, unsigned lengthToReplace, StringImpl* str)
1488{
1489    position = std::min(position, length());
1490    lengthToReplace = std::min(lengthToReplace, length() - position);
1491    unsigned lengthToInsert = str ? str->length() : 0;
1492    if (!lengthToReplace && !lengthToInsert)
1493        return *this;
1494
1495    if ((length() - lengthToReplace) >= (std::numeric_limits<unsigned>::max() - lengthToInsert))
1496        CRASH();
1497
1498    if (is8Bit() && (!str || str->is8Bit())) {
1499        LChar* data;
1500        auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data);
1501        memcpy(data, m_data8, position * sizeof(LChar));
1502        if (str)
1503            memcpy(data + position, str->m_data8, lengthToInsert * sizeof(LChar));
1504        memcpy(data + position + lengthToInsert, m_data8 + position + lengthToReplace,
1505               (length() - position - lengthToReplace) * sizeof(LChar));
1506        return newImpl;
1507    }
1508    UChar* data;
1509    auto newImpl = createUninitialized(length() - lengthToReplace + lengthToInsert, data);
1510    if (is8Bit())
1511        for (unsigned i = 0; i < position; ++i)
1512            data[i] = m_data8[i];
1513    else
1514        memcpy(data, m_data16, position * sizeof(UChar));
1515    if (str) {
1516        if (str->is8Bit())
1517            for (unsigned i = 0; i < lengthToInsert; ++i)
1518                data[i + position] = str->m_data8[i];
1519        else
1520            memcpy(data + position, str->m_data16, lengthToInsert * sizeof(UChar));
1521    }
1522    if (is8Bit()) {
1523        for (unsigned i = 0; i < length() - position - lengthToReplace; ++i)
1524            data[i + position + lengthToInsert] = m_data8[i + position + lengthToReplace];
1525    } else {
1526        memcpy(data + position + lengthToInsert, characters16() + position + lengthToReplace,
1527            (length() - position - lengthToReplace) * sizeof(UChar));
1528    }
1529    return newImpl;
1530}
1531
1532PassRef<StringImpl> StringImpl::replace(UChar pattern, StringImpl* replacement)
1533{
1534    if (!replacement)
1535        return *this;
1536
1537    if (replacement->is8Bit())
1538        return replace(pattern, replacement->m_data8, replacement->length());
1539
1540    return replace(pattern, replacement->m_data16, replacement->length());
1541}
1542
1543PassRef<StringImpl> StringImpl::replace(UChar pattern, const LChar* replacement, unsigned repStrLength)
1544{
1545    ASSERT(replacement);
1546
1547    size_t srcSegmentStart = 0;
1548    unsigned matchCount = 0;
1549
1550    // Count the matches.
1551    while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1552        ++matchCount;
1553        ++srcSegmentStart;
1554    }
1555
1556    // If we have 0 matches then we don't have to do any more work.
1557    if (!matchCount)
1558        return *this;
1559
1560    if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength)
1561        CRASH();
1562
1563    unsigned replaceSize = matchCount * repStrLength;
1564    unsigned newSize = m_length - matchCount;
1565    if (newSize >= (std::numeric_limits<unsigned>::max() - replaceSize))
1566        CRASH();
1567
1568    newSize += replaceSize;
1569
1570    // Construct the new data.
1571    size_t srcSegmentEnd;
1572    unsigned srcSegmentLength;
1573    srcSegmentStart = 0;
1574    unsigned dstOffset = 0;
1575
1576    if (is8Bit()) {
1577        LChar* data;
1578        auto newImpl = createUninitialized(newSize, data);
1579
1580        while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1581            srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1582            memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar));
1583            dstOffset += srcSegmentLength;
1584            memcpy(data + dstOffset, replacement, repStrLength * sizeof(LChar));
1585            dstOffset += repStrLength;
1586            srcSegmentStart = srcSegmentEnd + 1;
1587        }
1588
1589        srcSegmentLength = m_length - srcSegmentStart;
1590        memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar));
1591
1592        ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1593
1594        return newImpl;
1595    }
1596
1597    UChar* data;
1598    auto newImpl = createUninitialized(newSize, data);
1599
1600    while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1601        srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1602        memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1603
1604        dstOffset += srcSegmentLength;
1605        for (unsigned i = 0; i < repStrLength; ++i)
1606            data[i + dstOffset] = replacement[i];
1607
1608        dstOffset += repStrLength;
1609        srcSegmentStart = srcSegmentEnd + 1;
1610    }
1611
1612    srcSegmentLength = m_length - srcSegmentStart;
1613    memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1614
1615    ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1616
1617    return newImpl;
1618}
1619
1620PassRef<StringImpl> StringImpl::replace(UChar pattern, const UChar* replacement, unsigned repStrLength)
1621{
1622    ASSERT(replacement);
1623
1624    size_t srcSegmentStart = 0;
1625    unsigned matchCount = 0;
1626
1627    // Count the matches.
1628    while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1629        ++matchCount;
1630        ++srcSegmentStart;
1631    }
1632
1633    // If we have 0 matches then we don't have to do any more work.
1634    if (!matchCount)
1635        return *this;
1636
1637    if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength)
1638        CRASH();
1639
1640    unsigned replaceSize = matchCount * repStrLength;
1641    unsigned newSize = m_length - matchCount;
1642    if (newSize >= (std::numeric_limits<unsigned>::max() - replaceSize))
1643        CRASH();
1644
1645    newSize += replaceSize;
1646
1647    // Construct the new data.
1648    size_t srcSegmentEnd;
1649    unsigned srcSegmentLength;
1650    srcSegmentStart = 0;
1651    unsigned dstOffset = 0;
1652
1653    if (is8Bit()) {
1654        UChar* data;
1655        auto newImpl = createUninitialized(newSize, data);
1656
1657        while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1658            srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1659            for (unsigned i = 0; i < srcSegmentLength; ++i)
1660                data[i + dstOffset] = m_data8[i + srcSegmentStart];
1661
1662            dstOffset += srcSegmentLength;
1663            memcpy(data + dstOffset, replacement, repStrLength * sizeof(UChar));
1664
1665            dstOffset += repStrLength;
1666            srcSegmentStart = srcSegmentEnd + 1;
1667        }
1668
1669        srcSegmentLength = m_length - srcSegmentStart;
1670        for (unsigned i = 0; i < srcSegmentLength; ++i)
1671            data[i + dstOffset] = m_data8[i + srcSegmentStart];
1672
1673        ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1674
1675        return newImpl;
1676    }
1677
1678    UChar* data;
1679    auto newImpl = createUninitialized(newSize, data);
1680
1681    while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1682        srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1683        memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1684
1685        dstOffset += srcSegmentLength;
1686        memcpy(data + dstOffset, replacement, repStrLength * sizeof(UChar));
1687
1688        dstOffset += repStrLength;
1689        srcSegmentStart = srcSegmentEnd + 1;
1690    }
1691
1692    srcSegmentLength = m_length - srcSegmentStart;
1693    memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1694
1695    ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1696
1697    return newImpl;
1698}
1699
1700PassRef<StringImpl> StringImpl::replace(StringImpl* pattern, StringImpl* replacement)
1701{
1702    if (!pattern || !replacement)
1703        return *this;
1704
1705    unsigned patternLength = pattern->length();
1706    if (!patternLength)
1707        return *this;
1708
1709    unsigned repStrLength = replacement->length();
1710    size_t srcSegmentStart = 0;
1711    unsigned matchCount = 0;
1712
1713    // Count the matches.
1714    while ((srcSegmentStart = find(pattern, srcSegmentStart)) != notFound) {
1715        ++matchCount;
1716        srcSegmentStart += patternLength;
1717    }
1718
1719    // If we have 0 matches, we don't have to do any more work
1720    if (!matchCount)
1721        return *this;
1722
1723    unsigned newSize = m_length - matchCount * patternLength;
1724    if (repStrLength && matchCount > std::numeric_limits<unsigned>::max() / repStrLength)
1725        CRASH();
1726
1727    if (newSize > (std::numeric_limits<unsigned>::max() - matchCount * repStrLength))
1728        CRASH();
1729
1730    newSize += matchCount * repStrLength;
1731
1732
1733    // Construct the new data
1734    size_t srcSegmentEnd;
1735    unsigned srcSegmentLength;
1736    srcSegmentStart = 0;
1737    unsigned dstOffset = 0;
1738    bool srcIs8Bit = is8Bit();
1739    bool replacementIs8Bit = replacement->is8Bit();
1740
1741    // There are 4 cases:
1742    // 1. This and replacement are both 8 bit.
1743    // 2. This and replacement are both 16 bit.
1744    // 3. This is 8 bit and replacement is 16 bit.
1745    // 4. This is 16 bit and replacement is 8 bit.
1746    if (srcIs8Bit && replacementIs8Bit) {
1747        // Case 1
1748        LChar* data;
1749        auto newImpl = createUninitialized(newSize, data);
1750        while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1751            srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1752            memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar));
1753            dstOffset += srcSegmentLength;
1754            memcpy(data + dstOffset, replacement->m_data8, repStrLength * sizeof(LChar));
1755            dstOffset += repStrLength;
1756            srcSegmentStart = srcSegmentEnd + patternLength;
1757        }
1758
1759        srcSegmentLength = m_length - srcSegmentStart;
1760        memcpy(data + dstOffset, m_data8 + srcSegmentStart, srcSegmentLength * sizeof(LChar));
1761
1762        ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1763
1764        return newImpl;
1765    }
1766
1767    UChar* data;
1768    auto newImpl = createUninitialized(newSize, data);
1769    while ((srcSegmentEnd = find(pattern, srcSegmentStart)) != notFound) {
1770        srcSegmentLength = srcSegmentEnd - srcSegmentStart;
1771        if (srcIs8Bit) {
1772            // Case 3.
1773            for (unsigned i = 0; i < srcSegmentLength; ++i)
1774                data[i + dstOffset] = m_data8[i + srcSegmentStart];
1775        } else {
1776            // Case 2 & 4.
1777            memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1778        }
1779        dstOffset += srcSegmentLength;
1780        if (replacementIs8Bit) {
1781            // Cases 2 & 3.
1782            for (unsigned i = 0; i < repStrLength; ++i)
1783                data[i + dstOffset] = replacement->m_data8[i];
1784        } else {
1785            // Case 4
1786            memcpy(data + dstOffset, replacement->m_data16, repStrLength * sizeof(UChar));
1787        }
1788        dstOffset += repStrLength;
1789        srcSegmentStart = srcSegmentEnd + patternLength;
1790    }
1791
1792    srcSegmentLength = m_length - srcSegmentStart;
1793    if (srcIs8Bit) {
1794        // Case 3.
1795        for (unsigned i = 0; i < srcSegmentLength; ++i)
1796            data[i + dstOffset] = m_data8[i + srcSegmentStart];
1797    } else {
1798        // Cases 2 & 4.
1799        memcpy(data + dstOffset, m_data16 + srcSegmentStart, srcSegmentLength * sizeof(UChar));
1800    }
1801
1802    ASSERT(dstOffset + srcSegmentLength == newImpl.get().length());
1803
1804    return newImpl;
1805}
1806
1807static ALWAYS_INLINE bool stringImplContentEqual(const StringImpl& a, const StringImpl& b)
1808{
1809    unsigned aLength = a.length();
1810    unsigned bLength = b.length();
1811    if (aLength != bLength)
1812        return false;
1813
1814    if (a.is8Bit()) {
1815        if (b.is8Bit())
1816            return equal(a.characters8(), b.characters8(), aLength);
1817
1818        return equal(a.characters8(), b.characters16(), aLength);
1819    }
1820
1821    if (b.is8Bit())
1822        return equal(a.characters16(), b.characters8(), aLength);
1823
1824    return equal(a.characters16(), b.characters16(), aLength);
1825}
1826
1827bool equal(const StringImpl* a, const StringImpl* b)
1828{
1829    if (a == b)
1830        return true;
1831    if (!a || !b)
1832        return false;
1833
1834    return stringImplContentEqual(*a, *b);
1835}
1836
1837template <typename CharType>
1838inline bool equalInternal(const StringImpl* a, const CharType* b, unsigned length)
1839{
1840    if (!a)
1841        return !b;
1842    if (!b)
1843        return false;
1844
1845    if (a->length() != length)
1846        return false;
1847    if (a->is8Bit())
1848        return equal(a->characters8(), b, length);
1849    return equal(a->characters16(), b, length);
1850}
1851
1852bool equal(const StringImpl* a, const LChar* b, unsigned length)
1853{
1854    return equalInternal(a, b, length);
1855}
1856
1857bool equal(const StringImpl* a, const UChar* b, unsigned length)
1858{
1859    return equalInternal(a, b, length);
1860}
1861
1862bool equal(const StringImpl* a, const LChar* b)
1863{
1864    if (!a)
1865        return !b;
1866    if (!b)
1867        return !a;
1868
1869    unsigned length = a->length();
1870
1871    if (a->is8Bit()) {
1872        const LChar* aPtr = a->characters8();
1873        for (unsigned i = 0; i != length; ++i) {
1874            LChar bc = b[i];
1875            LChar ac = aPtr[i];
1876            if (!bc)
1877                return false;
1878            if (ac != bc)
1879                return false;
1880        }
1881
1882        return !b[length];
1883    }
1884
1885    const UChar* aPtr = a->characters16();
1886    for (unsigned i = 0; i != length; ++i) {
1887        LChar bc = b[i];
1888        if (!bc)
1889            return false;
1890        if (aPtr[i] != bc)
1891            return false;
1892    }
1893
1894    return !b[length];
1895}
1896
1897bool equal(const StringImpl& a, const StringImpl& b)
1898{
1899    if (&a == &b)
1900        return true;
1901
1902    return stringImplContentEqual(a, b);
1903}
1904
1905bool equalIgnoringCase(const StringImpl* a, const StringImpl* b)
1906{
1907    if (a == b)
1908        return true;
1909    if (!a || !b)
1910        return false;
1911
1912    return CaseFoldingHash::equal(a, b);
1913}
1914
1915bool equalIgnoringCase(const StringImpl* a, const LChar* b)
1916{
1917    if (!a)
1918        return !b;
1919    if (!b)
1920        return !a;
1921
1922    unsigned length = a->length();
1923
1924    // Do a faster loop for the case where all the characters are ASCII.
1925    UChar ored = 0;
1926    bool equal = true;
1927    if (a->is8Bit()) {
1928        const LChar* as = a->characters8();
1929        for (unsigned i = 0; i != length; ++i) {
1930            LChar bc = b[i];
1931            if (!bc)
1932                return false;
1933            UChar ac = as[i];
1934            ored |= ac;
1935            equal = equal && (toASCIILower(ac) == toASCIILower(bc));
1936        }
1937
1938        // Do a slower implementation for cases that include non-ASCII characters.
1939        if (ored & ~0x7F) {
1940            equal = true;
1941            for (unsigned i = 0; i != length; ++i)
1942                equal = equal && u_foldCase(as[i], U_FOLD_CASE_DEFAULT) == u_foldCase(b[i], U_FOLD_CASE_DEFAULT);
1943        }
1944
1945        return equal && !b[length];
1946    }
1947
1948    const UChar* as = a->characters16();
1949    for (unsigned i = 0; i != length; ++i) {
1950        LChar bc = b[i];
1951        if (!bc)
1952            return false;
1953        UChar ac = as[i];
1954        ored |= ac;
1955        equal = equal && (toASCIILower(ac) == toASCIILower(bc));
1956    }
1957
1958    // Do a slower implementation for cases that include non-ASCII characters.
1959    if (ored & ~0x7F) {
1960        equal = true;
1961        for (unsigned i = 0; i != length; ++i) {
1962            equal = equal && u_foldCase(as[i], U_FOLD_CASE_DEFAULT) == u_foldCase(b[i], U_FOLD_CASE_DEFAULT);
1963        }
1964    }
1965
1966    return equal && !b[length];
1967}
1968
1969bool equalIgnoringCaseNonNull(const StringImpl* a, const StringImpl* b)
1970{
1971    ASSERT(a && b);
1972    if (a == b)
1973        return true;
1974
1975    unsigned length = a->length();
1976    if (length != b->length())
1977        return false;
1978
1979    if (a->is8Bit()) {
1980        if (b->is8Bit())
1981            return equalIgnoringCase(a->characters8(), b->characters8(), length);
1982
1983        return equalIgnoringCase(b->characters16(), a->characters8(), length);
1984    }
1985
1986    if (b->is8Bit())
1987        return equalIgnoringCase(a->characters16(), b->characters8(), length);
1988
1989    return equalIgnoringCase(a->characters16(), b->characters16(), length);
1990}
1991
1992bool equalIgnoringNullity(StringImpl* a, StringImpl* b)
1993{
1994    if (!a && b && !b->length())
1995        return true;
1996    if (!b && a && !a->length())
1997        return true;
1998    return equal(a, b);
1999}
2000
2001UCharDirection StringImpl::defaultWritingDirection(bool* hasStrongDirectionality)
2002{
2003    for (unsigned i = 0; i < m_length; ++i) {
2004        UCharDirection charDirection = u_charDirection(is8Bit() ? m_data8[i] : m_data16[i]);
2005        if (charDirection == U_LEFT_TO_RIGHT) {
2006            if (hasStrongDirectionality)
2007                *hasStrongDirectionality = true;
2008            return U_LEFT_TO_RIGHT;
2009        }
2010        if (charDirection == U_RIGHT_TO_LEFT || charDirection == U_RIGHT_TO_LEFT_ARABIC) {
2011            if (hasStrongDirectionality)
2012                *hasStrongDirectionality = true;
2013            return U_RIGHT_TO_LEFT;
2014        }
2015    }
2016    if (hasStrongDirectionality)
2017        *hasStrongDirectionality = false;
2018    return U_LEFT_TO_RIGHT;
2019}
2020
2021PassRef<StringImpl> StringImpl::adopt(StringBuffer<LChar>& buffer)
2022{
2023    unsigned length = buffer.length();
2024    if (!length)
2025        return *empty();
2026    return adoptRef(*new StringImpl(buffer.release(), length));
2027}
2028
2029PassRef<StringImpl> StringImpl::adopt(StringBuffer<UChar>& buffer)
2030{
2031    unsigned length = buffer.length();
2032    if (!length)
2033        return *empty();
2034    return adoptRef(*new StringImpl(buffer.release(), length));
2035}
2036
2037size_t StringImpl::sizeInBytes() const
2038{
2039    // FIXME: support substrings
2040    size_t size = length();
2041    if (!is8Bit())
2042        size *= 2;
2043    return size + sizeof(*this);
2044}
2045
2046// Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
2047static inline void putUTF8Triple(char*& buffer, UChar ch)
2048{
2049    ASSERT(ch >= 0x0800);
2050    *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
2051    *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
2052    *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
2053}
2054
2055bool StringImpl::utf8Impl(const UChar* characters, unsigned length, char*& buffer, size_t bufferSize, ConversionMode mode)
2056{
2057    if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) {
2058        const UChar* charactersEnd = characters + length;
2059        char* bufferEnd = buffer + bufferSize;
2060        while (characters < charactersEnd) {
2061            // Use strict conversion to detect unpaired surrogates.
2062            ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true);
2063            ASSERT(result != targetExhausted);
2064            // Conversion fails when there is an unpaired surrogate.
2065            // Put replacement character (U+FFFD) instead of the unpaired surrogate.
2066            if (result != conversionOK) {
2067                ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
2068                // There should be room left, since one UChar hasn't been converted.
2069                ASSERT((buffer + 3) <= bufferEnd);
2070                putUTF8Triple(buffer, replacementCharacter);
2071                ++characters;
2072            }
2073        }
2074    } else {
2075        bool strict = mode == StrictConversion;
2076        const UChar* originalCharacters = characters;
2077        ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferSize, strict);
2078        ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
2079
2080        // Only produced from strict conversion.
2081        if (result == sourceIllegal) {
2082            ASSERT(strict);
2083            return false;
2084        }
2085
2086        // Check for an unconverted high surrogate.
2087        if (result == sourceExhausted) {
2088            if (strict)
2089                return false;
2090            // This should be one unpaired high surrogate. Treat it the same
2091            // was as an unpaired high surrogate would have been handled in
2092            // the middle of a string with non-strict conversion - which is
2093            // to say, simply encode it to UTF-8.
2094            ASSERT_UNUSED(
2095                originalCharacters, (characters + 1) == (originalCharacters + length));
2096            ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
2097            // There should be room left, since one UChar hasn't been converted.
2098            ASSERT((buffer + 3) <= (buffer + bufferSize));
2099            putUTF8Triple(buffer, *characters);
2100        }
2101    }
2102
2103    return true;
2104}
2105
2106CString StringImpl::utf8ForCharacters(const UChar* characters, unsigned length, ConversionMode mode)
2107{
2108    if (!length)
2109        return CString("", 0);
2110    if (length > std::numeric_limits<unsigned>::max() / 3)
2111        return CString();
2112    Vector<char, 1024> bufferVector(length * 3);
2113    char* buffer = bufferVector.data();
2114    if (!utf8Impl(characters, length, buffer, bufferVector.size(), mode))
2115        return CString();
2116    return CString(bufferVector.data(), buffer - bufferVector.data());
2117}
2118
2119CString StringImpl::utf8ForRange(unsigned offset, unsigned length, ConversionMode mode) const
2120{
2121    ASSERT(offset <= this->length());
2122    ASSERT(offset + length <= this->length());
2123
2124    if (!length)
2125        return CString("", 0);
2126
2127    // Allocate a buffer big enough to hold all the characters
2128    // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
2129    // Optimization ideas, if we find this function is hot:
2130    //  * We could speculatively create a CStringBuffer to contain 'length'
2131    //    characters, and resize if necessary (i.e. if the buffer contains
2132    //    non-ascii characters). (Alternatively, scan the buffer first for
2133    //    ascii characters, so we know this will be sufficient).
2134    //  * We could allocate a CStringBuffer with an appropriate size to
2135    //    have a good chance of being able to write the string into the
2136    //    buffer without reallocing (say, 1.5 x length).
2137    if (length > std::numeric_limits<unsigned>::max() / 3)
2138        return CString();
2139    Vector<char, 1024> bufferVector(length * 3);
2140
2141    char* buffer = bufferVector.data();
2142
2143    if (is8Bit()) {
2144        const LChar* characters = this->characters8() + offset;
2145
2146        ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
2147        ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
2148    } else {
2149        if (!utf8Impl(this->characters16() + offset, length, buffer, bufferVector.size(), mode))
2150            return CString();
2151    }
2152
2153    return CString(bufferVector.data(), buffer - bufferVector.data());
2154}
2155
2156CString StringImpl::utf8(ConversionMode mode) const
2157{
2158    return utf8ForRange(0, length(), mode);
2159}
2160
2161// Table is based on ftp://ftp.unicode.org/Public/UNIDATA/CaseFolding.txt
2162const UChar StringImpl::latin1CaseFoldTable[256] = {
2163    0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f,
2164    0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f,
2165    0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f,
2166    0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f,
2167    0x0040, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
2168    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f,
2169    0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f,
2170    0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f,
2171    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f,
2172    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f,
2173    0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
2174    0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x03bc, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
2175    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
2176    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00d7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00df,
2177    0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
2178    0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
2179};
2180
2181bool equalIgnoringNullity(const UChar* a, size_t aLength, StringImpl* b)
2182{
2183    if (!b)
2184        return !aLength;
2185    if (aLength != b->length())
2186        return false;
2187    if (b->is8Bit()) {
2188        const LChar* bCharacters = b->characters8();
2189        for (unsigned i = 0; i < aLength; ++i) {
2190            if (a[i] != bCharacters[i])
2191                return false;
2192        }
2193        return true;
2194    }
2195    return !memcmp(a, b->characters16(), b->length() * sizeof(UChar));
2196}
2197
2198} // namespace WTF
2199