/*
 * Copyright (C) 2004, 2006, 2014 Apple Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#import "config.h"
#import "TextBoundaries.h"

#import "TextBreakIterator.h"
#import "TextBreakIteratorInternalICU.h"
#import <CoreFoundation/CFStringTokenizer.h>
#import <Foundation/Foundation.h>
#import <unicode/ubrk.h>
#import <unicode/uchar.h>
#import <unicode/ustring.h>
#import <unicode/utypes.h>
#import <wtf/RetainPtr.h>
#import <wtf/text/StringView.h>
#import <wtf/unicode/CharacterNames.h>

namespace WebCore {

#if !USE(APPKIT)

// True for no-break space, newline, the punctuation marks . , ! ? ; : and anything
// u_isspace() accepts.
static bool isSkipCharacter(UChar32 c)
{
    switch (c) {
    case 0xA0:
    case '\n':
    case '.':
    case ',':
    case '!':
    case '?':
    case ';':
    case ':':
        return true;
    default:
        return u_isspace(c);
    }
}

// True for no-break space, newline and anything u_isspace() accepts.
static bool isWhitespaceCharacter(UChar32 c)
{
    if (c == 0xA0 || c == '\n')
        return true;
    return u_isspace(c);
}

// True when c is not in the predefined alphanumeric character set and is not '&'.
static bool isWordDelimitingCharacter(UChar32 c)
{
    // Ampersand is an exception added to treat AT&T as a single word (see <rdar://problem/5022264>).
    if (c == '&')
        return false;

    CFCharacterSetRef alphaNumerics = CFCharacterSetGetPredefined(kCFCharacterSetAlphaNumeric);
    return !CFCharacterSetIsLongCharacterMember(alphaNumerics, c);
}

// True when c belongs to the predefined symbol character set.
static bool isSymbolCharacter(UChar32 c)
{
    CFCharacterSetRef symbols = CFCharacterSetGetPredefined(kCFCharacterSetSymbol);
    return CFCharacterSetIsLongCharacterMember(symbols, c);
}

// True for the characters that can behave as word boundaries but can also appear
// within words: apostrophe, right single quotation mark and Hebrew gershayim.
static bool isAmbiguousBoundaryCharacter(UChar32 character)
{
    if (character == '\'')
        return true;
    if (character == rightSingleQuotationMark)
        return true;
    return character == hebrewPunctuationGershayim;
}

// Returns a word-boundary tokenizer set up over the whole of str, or nullptr when the
// locale could not be created. The locale and the tokenizer are function-local statics:
// each is created on first successful use and reused by later calls, with the tokenizer
// simply being re-pointed at the new string.
static CFStringTokenizerRef tokenizerForString(CFStringRef str)
{
    static CFLocaleRef cachedLocale = nullptr;
    static CFStringTokenizerRef sharedTokenizer = nullptr;

    if (!cachedLocale) {
        const char* localeName = currentTextBreakLocaleID();
        const UInt8* localeBytes = reinterpret_cast<const UInt8*>(localeName);
        RetainPtr<CFStringRef> localeIdentifier = adoptCF(CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, localeBytes, strlen(localeName), kCFStringEncodingASCII, false, kCFAllocatorNull));
        cachedLocale = CFLocaleCreate(kCFAllocatorDefault, localeIdentifier.get());
        if (!cachedLocale)
            return nullptr;
    }

    CFRange wholeString = CFRangeMake(0, CFStringGetLength(str));

    if (sharedTokenizer) {
        CFStringTokenizerSetString(sharedTokenizer, str, wholeString);
        return sharedTokenizer;
    }

    sharedTokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, str, wholeString, kCFStringTokenizerUnitWordBoundary, cachedLocale);
    return sharedTokenizer;
}

// Simple case: A word is a stream of characters delimited by a special set of word-delimiting characters.
// Writes the word's starting offset to *start and the offset just past its end to *end
// (both are UTF-16 code unit offsets into text). A delimiter accepted by
// isAmbiguousBoundaryCharacter() is kept inside the word when the character on its far
// side is not itself a delimiter.
static void findSimpleWordBoundary(StringView text, int position, int* start, int* end)
{
    ASSERT(position >= 0);
    // position == text.length() is allowed: findWordBoundary() forwards the caller's
    // original position (it only clamps its own local copy), and for that value the
    // backward scan starts from the last character while the forward scan never runs.
    ASSERT(static_cast<unsigned>(position) <= text.length());

    const unsigned length = text.length();

    // Walk backwards from position until a real word delimiter precedes startPos.
    unsigned startPos = position;
    while (startPos > 0) {
        int i = startPos;
        UChar32 characterBeforeStartPos;
        U16_PREV(text, 0, i, characterBeforeStartPos);
        if (isWordDelimitingCharacter(characterBeforeStartPos)) {
            ASSERT(i >= 0);
            // A delimiter that is the first character of the text always ends the scan.
            if (!i)
                break;

            if (!isAmbiguousBoundaryCharacter(characterBeforeStartPos))
                break;

            // An ambiguous delimiter only stays in the word if a word character precedes it.
            UChar32 characterBeforeBeforeStartPos;
            U16_PREV(text, 0, i, characterBeforeBeforeStartPos);
            if (isWordDelimitingCharacter(characterBeforeBeforeStartPos))
                break;
        }
        U16_BACK_1(text, 0, startPos);
    }

    // Walk forwards from position until endPos sits on a real word delimiter.
    unsigned endPos = position;
    while (endPos < length) {
        UChar32 character;
        U16_GET(text, 0, endPos, length, character);
        if (isWordDelimitingCharacter(character)) {
            unsigned i = endPos;
            U16_FWD_1(text, i, length);
            ASSERT(i <= length);
            // A delimiter that is the last character of the text always ends the scan.
            if (i == length)
                break;

            if (!isAmbiguousBoundaryCharacter(character))
                break;

            // An ambiguous delimiter only stays in the word if a word character follows it.
            UChar32 characterAfterEndPos;
            U16_NEXT(text, i, length, characterAfterEndPos);
            if (isWordDelimitingCharacter(characterAfterEndPos))
                break;
        }
        U16_FWD_1(text, endPos, length);
    }

    // The text may consist of all delimiter characters (e.g. "++++++++" or a series of emoji), and returning an empty range
    // makes no sense (and doesn't match findComplexWordBoundary() behavior).
    if (startPos == endPos && endPos < length) {
        UChar32 character;
        U16_GET(text, 0, endPos, length, character);
        if (isSymbolCharacter(character))
            U16_FWD_1(text, endPos, length);
    }

    *start = startPos;
    *end = endPos;
}

// Complex case: use CFStringTokenizer to find word boundary.
// Writes the range of the tokenizer's token at position to *start / *end. Falls back to
// findSimpleWordBoundary() when no tokenizer is available, and reports the whole text
// when the tokenizer finds no token at position.
static void findComplexWordBoundary(StringView text, int position, int* start, int* end)
{
    RetainPtr<CFStringRef> charString = text.createCFStringWithoutCopying();

    CFStringTokenizerRef tokenizer = tokenizerForString(charString.get());
    if (!tokenizer) {
        // Error creating tokenizer, so just use simple function.
        findSimpleWordBoundary(text, position, start, end);
        return;
    }

    CFStringTokenizerTokenType token = CFStringTokenizerGoToTokenAtIndex(tokenizer, position);
    if (token == kCFStringTokenizerTokenNone) {
        // No token found: select entire block.
        // NB: I never hit this section in all my testing.
        *start = 0;
        *end = text.length();
        return;
    }

    CFRange result = CFStringTokenizerGetCurrentTokenRange(tokenizer);
    *start = result.location;
    *end = result.location + result.length;
}

#endif

// Finds the word surrounding position in text and writes its starting offset to *start
// and the offset just past its end to *end. Empty text yields the empty range (0, 0).
void findWordBoundary(StringView text, int position, int* start, int* end)
{
    // There is no character to inspect in an empty string: "text.length() - 1" below would
    // wrap around, and the U16_NEXT reads would index past the end of the view.
    if (!text.length()) {
        *start = 0;
        *end = 0;
        return;
    }

#if USE(APPKIT)
    // RetainPtr owns the attributed string so it is released on every exit path.
    RetainPtr<NSAttributedString> attributedString = adoptNS([[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()]);
    NSRange range = [attributedString.get() doubleClickAtIndex:std::min<unsigned>(position, text.length() - 1)];
    *start = range.location;
    *end = range.location + range.length;
#else
    // Only the character used for complexity detection is clamped to the last character;
    // the boundary finders still receive the caller's original position.
    unsigned pos = position;
    if (pos == text.length() && pos)
        --pos;

    // For complex text (Thai, Japanese, Chinese), visible_units will pass the text in as a
    // single contiguous run of characters, providing as much context as is possible.
    // We only need one character to determine if the text is complex.
    UChar32 ch;
    unsigned i = pos;
    U16_NEXT(text, i, text.length(), ch);
    bool isComplex = requiresContextForWordBoundary(ch);

    // FIXME: This check improves our word boundary behavior, but doesn't actually go far enough.
    // See <rdar://problem/8853951> Take complex word boundary finding path when necessary
    if (!isComplex) {
        // Check again for complex text, at the start of the run.
        i = 0;
        U16_NEXT(text, i, text.length(), ch);
        isComplex = requiresContextForWordBoundary(ch);
    }

    if (isComplex)
        findComplexWordBoundary(text, position, start, end);
    else
        findSimpleWordBoundary(text, position, start, end);

#define LOG_WORD_BREAK 0
#if LOG_WORD_BREAK
    auto uniString = text.createCFStringWithoutCopying();
    auto foundWord = text.substring(*start, *end - *start).createCFStringWithoutCopying();
    NSLog(@"%s_BREAK '%@' (%d,%d) in '%@' (%p) at %d, length=%d", isComplex ? "COMPLEX" : "SIMPLE", foundWord.get(), *start, *end, uniString.get(), uniString.get(), position, text.length());
#endif

#endif
}

// Like findWordBoundary(), but reports only the end offset of the word around position.
void findEndWordBoundary(StringView text, int position, int* end)
{
    int start;
    findWordBoundary(text, position, &start, end);
}

// Returns the offset of the next word from position, scanning forwards when forward is
// true and backwards otherwise. On the non-AppKit path, position is returned unchanged
// if no word break iterator is available.
int findNextWordFromIndex(StringView text, int position, bool forward)
{
#if USE(APPKIT)
    // RetainPtr owns the attributed string so it is released on every exit path.
    RetainPtr<NSAttributedString> attributedString = adoptNS([[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()]);
    int result = [attributedString.get() nextWordFromIndex:position forward:forward];
    return result;
#else
    // This very likely won't behave exactly like the non-iPhone version, but it works
    // for the contexts in which it is used on iPhone, and in the future will be
    // tuned to improve the iPhone-specific behavior for the keyboard and text editing.
    int pos = position;
    TextBreakIterator* boundary = wordBreakIterator(text);
    if (!boundary)
        return pos;

    if (forward) {
        // Keep advancing while the break lands on a skip character that does not
        // itself follow a skip character; running out of breaks stops at the end of text.
        do {
            pos = textBreakFollowing(boundary, pos);
            if (pos == UBRK_DONE)
                pos = text.length();
        } while (static_cast<unsigned>(pos) < text.length() && (pos == 0 || !isSkipCharacter(text[pos - 1])) && isSkipCharacter(text[pos]));
    } else {
        // Keep retreating while the break lands on a skip character that is not
        // preceded by whitespace; running out of breaks stops at the start of text.
        do {
            pos = textBreakPreceding(boundary, pos);
            if (pos == UBRK_DONE)
                pos = 0;
        } while (pos > 0 && isSkipCharacter(text[pos]) && !isWhitespaceCharacter(text[pos - 1]));
    }
    return pos;
#endif
}

} // namespace WebCore