1/* 2 * (C) 1999 Lars Knoll (knoll@kde.org) 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved. 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 */ 21 22#include "config.h" 23#include "TextBreakIterator.h" 24 25#include "LineBreakIteratorPoolICU.h" 26#include "UTextProviderLatin1.h" 27#include "UTextProviderUTF16.h" 28#include <mutex> 29#include <wtf/Atomics.h> 30#include <wtf/text/StringView.h> 31 32namespace WebCore { 33 34// Iterator initialization 35 36static TextBreakIterator* initializeIterator(UBreakIteratorType type, const char* locale = currentTextBreakLocaleID()) 37{ 38 UErrorCode openStatus = U_ZERO_ERROR; 39 TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, locale, 0, 0, &openStatus)); 40 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 41 return iterator; 42} 43 44#if !PLATFORM(IOS) 45 46static TextBreakIterator* initializeIteratorWithRules(const char* breakRules) 47{ 48 UParseError parseStatus; 49 UErrorCode openStatus = U_ZERO_ERROR; 50 unsigned length = strlen(breakRules); 51 auto upconvertedCharacters = StringView(reinterpret_cast<const LChar*>(breakRules), length).upconvertedCharacters(); 52 TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(upconvertedCharacters, length, 0, 0, &parseStatus, &openStatus)); 53 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 54 return iterator; 55} 56 57#endif 58 59 60// Iterator text setting 61 62static TextBreakIterator* setTextForIterator(TextBreakIterator& iterator, StringView string) 63{ 64 if (string.is8Bit()) { 65 UTextWithBuffer textLocal; 66 textLocal.text = UTEXT_INITIALIZER; 67 textLocal.text.extraSize = sizeof(textLocal.buffer); 68 textLocal.text.pExtra = textLocal.buffer; 69 70 UErrorCode openStatus = U_ZERO_ERROR; 71 UText* text = openLatin1UTextProvider(&textLocal, string.characters8(), string.length(), &openStatus); 72 if (U_FAILURE(openStatus)) { 73 LOG_ERROR("uTextOpenLatin1 failed with status %d", openStatus); 74 return nullptr; 75 } 76 77 UErrorCode setTextStatus = U_ZERO_ERROR; 78 ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus); 79 if (U_FAILURE(setTextStatus)) { 80 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 81 return nullptr; 82 } 83 84 utext_close(text); 85 } else { 86 UErrorCode setTextStatus = U_ZERO_ERROR; 87 ubrk_setText(reinterpret_cast<UBreakIterator*>(&iterator), string.characters16(), string.length(), &setTextStatus); 88 if (U_FAILURE(setTextStatus)) 89 return nullptr; 90 } 91 92 return &iterator; 93} 94 95static TextBreakIterator* setContextAwareTextForIterator(TextBreakIterator& iterator, StringView string, const UChar* priorContext, unsigned priorContextLength) 96{ 97 if (string.is8Bit()) { 98 UTextWithBuffer textLocal; 99 textLocal.text = UTEXT_INITIALIZER; 100 textLocal.text.extraSize = sizeof(textLocal.buffer); 101 textLocal.text.pExtra = textLocal.buffer; 102 103 UErrorCode openStatus = U_ZERO_ERROR; 104 UText* text = openLatin1ContextAwareUTextProvider(&textLocal, string.characters8(), string.length(), priorContext, priorContextLength, &openStatus); 105 if (U_FAILURE(openStatus)) { 106 LOG_ERROR("openLatin1ContextAwareUTextProvider failed with status %d", openStatus); 107 return nullptr; 108 } 109 110 UErrorCode setTextStatus = U_ZERO_ERROR; 111 ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus); 112 if (U_FAILURE(setTextStatus)) { 113 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 114 return nullptr; 115 } 116 117 utext_close(text); 118 } else { 119 UText textLocal = UTEXT_INITIALIZER; 120 121 UErrorCode openStatus = U_ZERO_ERROR; 122 UText* text = openUTF16ContextAwareUTextProvider(&textLocal, string.characters16(), string.length(), priorContext, priorContextLength, &openStatus); 123 if (U_FAILURE(openStatus)) { 124 LOG_ERROR("openUTF16ContextAwareUTextProvider failed with status %d", openStatus); 125 return 0; 126 } 127 128 UErrorCode setTextStatus = U_ZERO_ERROR; 129 ubrk_setUText(reinterpret_cast<UBreakIterator*>(&iterator), text, &setTextStatus); 130 if (U_FAILURE(setTextStatus)) { 131 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 132 return nullptr; 133 } 134 135 utext_close(text); 136 } 137 138 return &iterator; 139} 140 141 142// Static iterators 143 144TextBreakIterator* wordBreakIterator(StringView string) 145{ 146 static TextBreakIterator* staticWordBreakIterator = initializeIterator(UBRK_WORD); 147 if (!staticWordBreakIterator) 148 return nullptr; 149 150 return setTextForIterator(*staticWordBreakIterator, string); 151} 152 153TextBreakIterator* sentenceBreakIterator(StringView string) 154{ 155 static TextBreakIterator* staticSentenceBreakIterator = initializeIterator(UBRK_SENTENCE); 156 if (!staticSentenceBreakIterator) 157 return nullptr; 158 159 return setTextForIterator(*staticSentenceBreakIterator, string); 160} 161 162TextBreakIterator* cursorMovementIterator(StringView string) 163{ 164#if !PLATFORM(IOS) 165 // This rule set is based on character-break iterator rules of ICU 4.0 166 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 167 // The major differences from the original ones are listed below: 168 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 169 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 170 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 171 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 172 // * Added rules for regional indicator symbols. 173 static const char* kRules = 174 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 175 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 176 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 177 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 178 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 179 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 180 "$L = [\\p{Grapheme_Cluster_Break = L}];" 181 "$V = [\\p{Grapheme_Cluster_Break = V}];" 182 "$T = [\\p{Grapheme_Cluster_Break = T}];" 183 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 184 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 185 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 186 "$HinV = \\u094D;" // Devanagari Sign Virama 187 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 188 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 189 "$BenV = \\u09CD;" // Bengali Sign Virama 190 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 191 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 192 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 193 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 194 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 195 "$GujV = \\u0ACD;" // Gujarati Sign Virama 196 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 197 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 198 "$OriV = \\u0B4D;" // Oriya Sign Virama 199 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 200 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 201 "$TelV = \\u0C4D;" // Telugu Sign Virama 202 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 203 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 204 "$KanV = \\u0CCD;" // Kannada Sign Virama 205 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 206 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 207 "$MalV = \\u0D4D;" // Malayalam Sign Virama 208 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 209 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators 210 "!!chain;" 211 "!!forward;" 212 "$CR $LF;" 213 "$L ($L | $V | $LV | $LVT);" 214 "($LV | $V) ($V | $T);" 215 "($LVT | $T) $T;" 216 "[^$Control $CR $LF] $Extend;" 217 "[^$Control $CR $LF] $SpacingMark;" 218 "$RI $RI / $RI;" 219 "$RI $RI;" 220 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 221 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 222 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 223 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 224 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 225 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 226 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 227 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 228 "!!reverse;" 229 "$LF $CR;" 230 "($L | $V | $LV | $LVT) $L;" 231 "($V | $T) ($LV | $V);" 232 "$T ($LVT | $T);" 233 "$Extend [^$Control $CR $LF];" 234 "$SpacingMark [^$Control $CR $LF];" 235 "$RI $RI / $RI $RI;" 236 "$RI $RI;" 237 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 238 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 239 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 240 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 241 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 242 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 243 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 244 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 245 "!!safe_reverse;" 246 "!!safe_forward;"; 247 static TextBreakIterator* staticCursorMovementIterator = initializeIteratorWithRules(kRules); 248#else // PLATFORM(IOS) 249 // Use the special Thai character break iterator for all locales 250 static TextBreakIterator* staticCursorMovementIterator = initializeIterator(UBRK_CHARACTER, "th"); 251#endif // !PLATFORM(IOS) 252 253 if (!staticCursorMovementIterator) 254 return nullptr; 255 256 return setTextForIterator(*staticCursorMovementIterator, string); 257} 258 259TextBreakIterator* acquireLineBreakIterator(StringView string, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 260{ 261 TextBreakIterator* iterator = reinterpret_cast<TextBreakIterator*>(LineBreakIteratorPool::sharedPool().take(locale)); 262 if (!iterator) 263 return nullptr; 264 265 return setContextAwareTextForIterator(*iterator, string, priorContext, priorContextLength); 266} 267 268void releaseLineBreakIterator(TextBreakIterator* iterator) 269{ 270 ASSERT_ARG(iterator, iterator); 271 272 LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator)); 273} 274 275static TextBreakIterator* nonSharedCharacterBreakIterator; 276 277static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue) 278{ 279#if ENABLE(COMPARE_AND_SWAP) 280 return WTF::weakCompareAndSwap(reinterpret_cast<void**>(&nonSharedCharacterBreakIterator), expected, newValue); 281#else 282 DEPRECATED_DEFINE_STATIC_LOCAL(std::mutex, nonSharedCharacterBreakIteratorMutex, ()); 283 std::lock_guard<std::mutex> locker(nonSharedCharacterBreakIteratorMutex); 284 if (nonSharedCharacterBreakIterator != expected) 285 return false; 286 nonSharedCharacterBreakIterator = newValue; 287 return true; 288#endif 289} 290 291NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(StringView string) 292{ 293 m_iterator = nonSharedCharacterBreakIterator; 294 295 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0); 296 if (!createdIterator) 297 m_iterator = initializeIterator(UBRK_CHARACTER); 298 if (!m_iterator) 299 return; 300 301 m_iterator = setTextForIterator(*m_iterator, string); 302} 303 304NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() 305{ 306 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator)) 307 ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator)); 308} 309 310 311// Iterator implemenation. 312 313int textBreakFirst(TextBreakIterator* iterator) 314{ 315 return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator)); 316} 317 318int textBreakLast(TextBreakIterator* iterator) 319{ 320 return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator)); 321} 322 323int textBreakNext(TextBreakIterator* iterator) 324{ 325 return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator)); 326} 327 328int textBreakPrevious(TextBreakIterator* iterator) 329{ 330 return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator)); 331} 332 333int textBreakPreceding(TextBreakIterator* iterator, int pos) 334{ 335 return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos); 336} 337 338int textBreakFollowing(TextBreakIterator* iterator, int pos) 339{ 340 return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos); 341} 342 343int textBreakCurrent(TextBreakIterator* iterator) 344{ 345 return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator)); 346} 347 348bool isTextBreak(TextBreakIterator* iterator, int position) 349{ 350 return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position); 351} 352 353bool isWordTextBreak(TextBreakIterator* iterator) 354{ 355 int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator)); 356 return ruleStatus != UBRK_WORD_NONE; 357} 358 359unsigned numGraphemeClusters(const String& s) 360{ 361 unsigned stringLength = s.length(); 362 363 if (!stringLength) 364 return 0; 365 366 // The only Latin-1 Extended Grapheme Cluster is CR LF 367 if (s.is8Bit() && !s.contains('\r')) 368 return stringLength; 369 370 NonSharedCharacterBreakIterator it(s); 371 if (!it) 372 return stringLength; 373 374 unsigned num = 0; 375 while (textBreakNext(it) != TextBreakDone) 376 ++num; 377 return num; 378} 379 380unsigned numCharactersInGraphemeClusters(const String& s, unsigned numGraphemeClusters) 381{ 382 unsigned stringLength = s.length(); 383 384 if (!stringLength) 385 return 0; 386 387 // The only Latin-1 Extended Grapheme Cluster is CR LF 388 if (s.is8Bit() && !s.contains('\r')) 389 return std::min(stringLength, numGraphemeClusters); 390 391 NonSharedCharacterBreakIterator it(s); 392 if (!it) 393 return std::min(stringLength, numGraphemeClusters); 394 395 for (unsigned i = 0; i < numGraphemeClusters; ++i) { 396 if (textBreakNext(it) == TextBreakDone) 397 return stringLength; 398 } 399 return textBreakCurrent(it); 400} 401 402} // namespace WebCore 403