1/*
2 * (C) 1999 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB.  If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22#include "config.h"
23#include "TextBreakIterator.h"
24
25#include "LineBreakIteratorPoolICU.h"
26#include "UTextProviderLatin1.h"
27#include "UTextProviderUTF16.h"
28#include <mutex>
29#include <wtf/Atomics.h>
30#include <wtf/text/StringView.h>
31
32namespace WebCore {
33
34// Iterator initialization
35
36static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID())
37{
38    UErrorCode openStatus = U_ZERO_ERROR;
39    TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, locale, 0, 0, &openStatus));
40    ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
41    return iterator;
42}
43
44#if !PLATFORM(IOS)
45
46static TextBreakIterator* initializeIteratorWithRules(const char* breakRules)
47{
48    UParseError parseStatus;
49    UErrorCode openStatus = U_ZERO_ERROR;
50    unsigned length = strlen(breakRules);
51    auto upconvertedCharacters = StringView(reinterpret_cast<const LChar*>(breakRules), length).upconvertedCharacters();
52    TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(upconvertedCharacters, length, 0, 0, &parseStatus, &openStatus));
53    ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
54    return iterator;
55}
56
57#endif
58
59
60// Iterator text setting
61
62static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string)
63{
64    if (string.is8Bit()) {
65        UTextWithBuffer textLocal;
66        textLocal.text = UTEXT_INITIALIZER;
67        textLocal.text.extraSize = sizeof(textLocal.buffer);
68        textLocal.text.pExtra = textLocal.buffer;
69
70        UErrorCode openStatus = U_ZERO_ERROR;
71        UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus);
72        if (U_FAILURE(openStatus)) {
73            LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus);
74            return nullptr;
75        }
76
77        UErrorCode setTextStatus = U_ZERO_ERROR;
78        ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
79        if (U_FAILURE(setTextStatus)) {
80            LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
81            return nullptr;
82        }
83
84        utext_close(text);
85    } else {
86        UErrorCode setTextStatus = U_ZERO_ERROR;
87        ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus);
88        if (U_FAILURE(setTextStatus))
89            return nullptr;
90    }
91
92    return &iterator;
93}
94
95static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength)
96{
97    if (string.is8Bit()) {
98        UTextWithBuffer textLocal;
99        textLocal.text = UTEXT_INITIALIZER;
100        textLocal.text.extraSize = sizeof(textLocal.buffer);
101        textLocal.text.pExtra = textLocal.buffer;
102
103        UErrorCode openStatus = U_ZERO_ERROR;
104        UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus);
105        if (U_FAILURE(openStatus)) {
106            LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus);
107            return nullptr;
108        }
109
110        UErrorCode setTextStatus = U_ZERO_ERROR;
111        ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
112        if (U_FAILURE(setTextStatus)) {
113            LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
114            return nullptr;
115        }
116
117        utext_close(text);
118    } else {
119        UText textLocal = UTEXT_INITIALIZER;
120
121        UErrorCode openStatus = U_ZERO_ERROR;
122        UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus);
123        if (U_FAILURE(openStatus)) {
124            LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus);
125            return 0;
126        }
127
128        UErrorCode setTextStatus = U_ZERO_ERROR;
129        ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus);
130        if (U_FAILURE(setTextStatus)) {
131            LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
132            return nullptr;
133        }
134
135        utext_close(text);
136    }
137
138    return &iterator;
139}
140
141
142// Static iterators
143
144TextBreakIterator* wordBreakIterator(StringView string)
145{
146    static TextBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD);
147    if (!staticWordBreakIterator)
148        return nullptr;
149
150    return setTextForIterator(*staticWordBreakIterator, string);
151}
152
153TextBreakIterator* sentenceBreakIterator(StringView string)
154{
155    static TextBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE);
156    if (!staticSentenceBreakIterator)
157        return nullptr;
158
159    return setTextForIterator(*staticSentenceBreakIterator, string);
160}
161
162TextBreakIterator* cursorMovementIterator(StringView string)
163{
164#if !PLATFORM(IOS)
165    // This rule set is based on character-break iterator rules of ICU 4.0
166    // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
167    // The major differences from the original ones are listed below:
168    // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
169    // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
170    // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
171    // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
172    // * Added rules for regional indicator symbols.
173    static const char* kRules =
174        "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
175        "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
176        "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
177        "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
178        "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
179        "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
180        "$L       = [\\p{Grapheme_Cluster_Break = L}];"
181        "$V       = [\\p{Grapheme_Cluster_Break = V}];"
182        "$T       = [\\p{Grapheme_Cluster_Break = T}];"
183        "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
184        "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
185        "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
186        "$HinV    = \\u094D;"              // Devanagari Sign Virama
187        "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
188        "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
189        "$BenV    = \\u09CD;"              // Bengali Sign Virama
190        "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
191        "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
192        "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
193        "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
194        "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
195        "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
196        "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
197        "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
198        "$OriV    = \\u0B4D;"              // Oriya Sign Virama
199        "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
200        "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
201        "$TelV    = \\u0C4D;"              // Telugu Sign Virama
202        "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
203        "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
204        "$KanV    = \\u0CCD;"              // Kannada Sign Virama
205        "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
206        "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
207        "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
208        "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
209        "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
210        "!!chain;"
211        "!!forward;"
212        "$CR $LF;"
213        "$L ($L | $V | $LV | $LVT);"
214        "($LV | $V) ($V | $T);"
215        "($LVT | $T) $T;"
216        "[^$Control $CR $LF] $Extend;"
217        "[^$Control $CR $LF] $SpacingMark;"
218        "$RI $RI / $RI;"
219        "$RI $RI;"
220        "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
221        "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
222        "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
223        "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
224        "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
225        "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
226        "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
227        "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
228        "!!reverse;"
229        "$LF $CR;"
230        "($L | $V | $LV | $LVT) $L;"
231        "($V | $T) ($LV | $V);"
232        "$T ($LVT | $T);"
233        "$Extend      [^$Control $CR $LF];"
234        "$SpacingMark [^$Control $CR $LF];"
235        "$RI $RI / $RI $RI;"
236        "$RI $RI;"
237        "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
238        "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
239        "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
240        "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
241        "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
242        "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
243        "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
244        "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
245        "!!safe_reverse;"
246        "!!safe_forward;";
247    static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules);
248#else // PLATFORM(IOS)
249    // Use the special Thai character break iterator for all locales
250    static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th");
251#endif // !PLATFORM(IOS)
252
253    if (!staticCursorMovementIterator)
254        return nullptr;
255
256    return setTextForIterator(*staticCursorMovementIterator, string);
257}
258
259TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
260{
261    TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(LineBreakIteratorPool::sharedPool().take(locale));
262    if (!iterator)
263        return nullptr;
264
265    return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength);
266}
267
268void releaseLineBreakIterator(TextBreakIterator* iterator)
269{
270    ASSERT_ARG(iterator, iterator);
271
272    LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator));
273}
274
275static TextBreakIterator* nonSharedCharacterBreakIterator;
276
277static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
278{
279#if ENABLE(COMPARE_AND_SWAP)
280    return WTF::weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue);
281#else
282    DEPRECATED_DEFINE_STATIC_LOCAL(std::mutex, nonSharedCharacterBreakIteratorMutex, ());
283    std::lock_guard<std::mutex> locker(nonSharedCharacterBreakIteratorMutex);
284    if (nonSharedCharacterBreakIterator != expected)
285        return false;
286    nonSharedCharacterBreakIterator = newValue;
287    return true;
288#endif
289}
290
291NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string)
292{
293    m_iterator = nonSharedCharacterBreakIterator;
294
295    bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
296    if (!createdIterator)
297        m_iterator = initializeIterator(UBRK_CHARACTER);
298    if (!m_iterator)
299        return;
300
301    m_iterator = setTextForIterator(*m_iterator, string);
302}
303
304NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
305{
306    if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
307        ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator));
308}
309
310
// Iterator implementation.
312
313int textBreakFirst(TextBreakIterator* iterator)
314{
315    return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
316}
317
318int textBreakLast(TextBreakIterator* iterator)
319{
320    return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
321}
322
323int textBreakNext(TextBreakIterator* iterator)
324{
325    return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
326}
327
328int textBreakPrevious(TextBreakIterator* iterator)
329{
330    return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
331}
332
333int textBreakPreceding(TextBreakIterator* iterator, int pos)
334{
335    return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
336}
337
338int textBreakFollowing(TextBreakIterator* iterator, int pos)
339{
340    return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
341}
342
343int textBreakCurrent(TextBreakIterator* iterator)
344{
345    return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
346}
347
348bool isTextBreak(TextBreakIterator* iterator, int position)
349{
350    return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
351}
352
353bool isWordTextBreak(TextBreakIterator* iterator)
354{
355    int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator));
356    return ruleStatus != UBRK_WORD_NONE;
357}
358
359unsigned numGraphemeClusters(const String& s)
360{
361    unsigned stringLength = s.length();
362
363    if (!stringLength)
364        return 0;
365
366    // The only Latin-1 Extended Grapheme Cluster is CR LF
367    if (s.is8Bit() && !s.contains('\r'))
368        return stringLength;
369
370    NonSharedCharacterBreakIterator it(s);
371    if (!it)
372        return stringLength;
373
374    unsigned num = 0;
375    while (textBreakNext(it) != TextBreakDone)
376        ++num;
377    return num;
378}
379
380unsigned numCharactersInGraphemeClusters(const String& s, unsigned numGraphemeClusters)
381{
382    unsigned stringLength = s.length();
383
384    if (!stringLength)
385        return 0;
386
387    // The only Latin-1 Extended Grapheme Cluster is CR LF
388    if (s.is8Bit() && !s.contains('\r'))
389        return std::min(stringLength, numGraphemeClusters);
390
391    NonSharedCharacterBreakIterator it(s);
392    if (!it)
393        return std::min(stringLength, numGraphemeClusters);
394
395    for (unsigned i = 0; i < numGraphemeClusters; ++i) {
396        if (textBreakNext(it) == TextBreakDone)
397            return stringLength;
398    }
399    return textBreakCurrent(it);
400}
401
402} // namespace WebCore
403