1/*
2 * Copyright (C) 2004, 2006, 2014 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#import "config.h"
27#import "TextBoundaries.h"
28
29#import "TextBreakIterator.h"
30#import "TextBreakIteratorInternalICU.h"
31#import <CoreFoundation/CFStringTokenizer.h>
32#import <Foundation/Foundation.h>
33#import <unicode/ubrk.h>
34#import <unicode/uchar.h>
35#import <unicode/ustring.h>
36#import <unicode/utypes.h>
37#import <wtf/RetainPtr.h>
38#import <wtf/text/StringView.h>
39#import <wtf/unicode/CharacterNames.h>
40
41namespace WebCore {
42
43#if !USE(APPKIT)
44
45static bool isSkipCharacter(UChar32 c)
46{
47    return c == 0xA0 || c == '\n' || c == '.' || c == ',' || c == '!'  || c == '?' || c == ';' || c == ':' || u_isspace(c);
48}
49
50static bool isWhitespaceCharacter(UChar32 c)
51{
52    return c == 0xA0 || c == '\n' || u_isspace(c);
53}
54
55static bool isWordDelimitingCharacter(UChar32 c)
56{
57    // Ampersand is an exception added to treat AT&T as a single word (see <rdar://problem/5022264>).
58    return !CFCharacterSetIsLongCharacterMember(CFCharacterSetGetPredefined(kCFCharacterSetAlphaNumeric), c) && c != '&';
59}
60
61static bool isSymbolCharacter(UChar32 c)
62{
63    return CFCharacterSetIsLongCharacterMember(CFCharacterSetGetPredefined(kCFCharacterSetSymbol), c);
64}
65
66static bool isAmbiguousBoundaryCharacter(UChar32 character)
67{
68    // These are characters that can behave as word boundaries, but can appear within words.
69    return character == '\'' || character == rightSingleQuotationMark || character == hebrewPunctuationGershayim;
70}
71
72static CFStringTokenizerRef tokenizerForString(CFStringRef str)
73{
74    static CFLocaleRef locale = nullptr;
75    if (!locale) {
76        const char* temp = currentTextBreakLocaleID();
77        RetainPtr<CFStringRef> currentLocaleID = adoptCF(CFStringCreateWithBytesNoCopy(kCFAllocatorDefault, reinterpret_cast<const UInt8*>(temp), strlen(temp), kCFStringEncodingASCII, false, kCFAllocatorNull));
78        locale = CFLocaleCreate(kCFAllocatorDefault, currentLocaleID.get());
79        if (!locale)
80            return nullptr;
81    }
82
83    CFRange entireRange = CFRangeMake(0, CFStringGetLength(str));
84
85    static CFStringTokenizerRef tokenizer = nullptr;
86    if (!tokenizer)
87        tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, str, entireRange, kCFStringTokenizerUnitWordBoundary, locale);
88    else
89        CFStringTokenizerSetString(tokenizer, str, entireRange);
90    return tokenizer;
91}
92
93// Simple case: A word is a stream of characters delimited by a special set of word-delimiting characters.
94static void findSimpleWordBoundary(StringView text, int position, int* start, int* end)
95{
96    ASSERT(position >= 0);
97    ASSERT(static_cast<unsigned>(position) < text.length());
98
99    unsigned startPos = position;
100    while (startPos > 0) {
101        int i = startPos;
102        UChar32 characterBeforeStartPos;
103        U16_PREV(text, 0, i, characterBeforeStartPos);
104        if (isWordDelimitingCharacter(characterBeforeStartPos)) {
105            ASSERT(i >= 0);
106            if (!i)
107                break;
108
109            if (!isAmbiguousBoundaryCharacter(characterBeforeStartPos))
110                break;
111
112            UChar32 characterBeforeBeforeStartPos;
113            U16_PREV(text, 0, i, characterBeforeBeforeStartPos);
114            if (isWordDelimitingCharacter(characterBeforeBeforeStartPos))
115                break;
116        }
117        U16_BACK_1(text, 0, startPos);
118    }
119
120    unsigned endPos = position;
121    while (endPos < text.length()) {
122        UChar32 character;
123        U16_GET(text, 0, endPos, text.length(), character);
124        if (isWordDelimitingCharacter(character)) {
125            unsigned i = endPos;
126            U16_FWD_1(text, i, text.length());
127            ASSERT(i <= text.length());
128            if (i == text.length())
129                break;
130            UChar32 characterAfterEndPos;
131            U16_NEXT(text, i, text.length(), characterAfterEndPos);
132            if (!isAmbiguousBoundaryCharacter(character))
133                break;
134            if (isWordDelimitingCharacter(characterAfterEndPos))
135                break;
136        }
137        U16_FWD_1(text, endPos, text.length());
138    }
139
140    // The text may consist of all delimiter characters (e.g. "++++++++" or a series of emoji), and returning an empty range
141    // makes no sense (and doesn't match findComplexWordBoundary() behavior).
142    if (startPos == endPos && endPos < text.length()) {
143        UChar32 character;
144        U16_GET(text, 0, endPos, text.length(), character);
145        if (isSymbolCharacter(character))
146            U16_FWD_1(text, endPos, text.length());
147    }
148
149    *start = startPos;
150    *end = endPos;
151}
152
153// Complex case: use CFStringTokenizer to find word boundary.
154static void findComplexWordBoundary(StringView text, int position, int* start, int* end)
155{
156    RetainPtr<CFStringRef> charString = text.createCFStringWithoutCopying();
157
158    CFStringTokenizerRef tokenizer = tokenizerForString(charString.get());
159    if (!tokenizer) {
160        // Error creating tokenizer, so just use simple function.
161        findSimpleWordBoundary(text, position, start, end);
162        return;
163    }
164
165    CFStringTokenizerTokenType  token = CFStringTokenizerGoToTokenAtIndex(tokenizer, position);
166    if (token == kCFStringTokenizerTokenNone) {
167        // No token found: select entire block.
168        // NB: I never hit this section in all my testing.
169        *start = 0;
170        *end = text.length();
171        return;
172    }
173
174    CFRange result = CFStringTokenizerGetCurrentTokenRange(tokenizer);
175    *start = result.location;
176    *end = result.location + result.length;
177}
178
179#endif
180
// Finds the word at |position| in |text| and stores its half-open UTF-16 range in [*start, *end).
// With AppKit this is NSAttributedString's double-click range; otherwise the range comes from
// findComplexWordBoundary() or findSimpleWordBoundary(), chosen by inspecting the text.
// Empty text yields the empty range [0, 0).
void findWordBoundary(StringView text, int position, int* start, int* end)
{
    // Both paths below assume at least one character: the AppKit path computes text.length() - 1
    // (which would wrap around), and the other path decodes the character at index 0.
    if (!text.length()) {
        *start = 0;
        *end = 0;
        return;
    }

#if USE(APPKIT)
    // Held in a RetainPtr so the string is released even if -doubleClickAtIndex: raises.
    auto attributedString = adoptNS([[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()]);
    // Clamp to the last character so that a position at the very end still selects the final word.
    NSRange range = [attributedString.get() doubleClickAtIndex:std::min<unsigned>(position, text.length() - 1)];
    *start = range.location;
    *end = range.location + range.length;
#else
    // |pos| is only used to sample a character; the unclamped |position| is what gets forwarded.
    // The text is non-empty here, so stepping back from the end cannot underflow.
    unsigned pos = position;
    if (pos == text.length())
        --pos;

    // For complex text (Thai, Japanese, Chinese), visible_units will pass the text in as a
    // single contiguous run of characters, providing as much context as is possible.
    // We only need one character to determine if the text is complex.
    UChar32 ch;
    unsigned i = pos;
    U16_NEXT(text, i, text.length(), ch);
    bool isComplex = requiresContextForWordBoundary(ch);

    // FIXME: This check improves our word boundary behavior, but doesn't actually go far enough.
    // See <rdar://problem/8853951> Take complex word boundary finding path when necessary
    if (!isComplex) {
        // Check again for complex text, at the start of the run.
        i = 0;
        U16_NEXT(text, i, text.length(), ch);
        isComplex = requiresContextForWordBoundary(ch);
    }

    if (isComplex)
        findComplexWordBoundary(text, position, start, end);
    else
        findSimpleWordBoundary(text, position, start, end);

#define LOG_WORD_BREAK 0
#if LOG_WORD_BREAK
    auto uniString = text.createCFStringWithoutCopying();
    auto foundWord = text.substring(*start, *end - *start).createCFStringWithoutCopying();
    NSLog(@"%s_BREAK '%@' (%d,%d) in '%@' (%p) at %d, length=%d", isComplex ? "COMPLEX" : "SIMPLE", foundWord.get(), *start, *end, uniString.get(), uniString.get(), position, text.length());
#endif

#endif
}
225
226void findEndWordBoundary(StringView text, int position, int* end)
227{
228    int start;
229    findWordBoundary(text, position, &start, end);
230}
231
// Returns the offset of the next word position after (|forward| == true) or before
// (|forward| == false) |position| in |text|. With AppKit the answer comes straight from
// NSAttributedString; otherwise a word break iterator is stepped until it lands on an
// acceptable boundary. If no iterator is available, |position| is returned unchanged.
int findNextWordFromIndex(StringView text, int position, bool forward)
{
#if USE(APPKIT)
    NSAttributedString *attributedText = [[NSAttributedString alloc] initWithString:text.createNSStringWithoutCopying().get()];
    int nextWordIndex = [attributedText nextWordFromIndex:position forward:forward];
    [attributedText release];
    return nextWordIndex;
#else
    // This very likely won't behave exactly like the non-iPhone version, but it works
    // for the contexts in which it is used on iPhone, and in the future will be
    // tuned to improve the iPhone-specific behavior for the keyboard and text editing.
    int index = position;
    TextBreakIterator* iterator = wordBreakIterator(text);
    if (!iterator)
        return index;

    const unsigned length = text.length();

    if (forward) {
        for (;;) {
            index = textBreakFollowing(iterator, index);
            if (index == UBRK_DONE)
                index = length;
            if (static_cast<unsigned>(index) >= length)
                break;

            // Step again only when this boundary sits at the start of a run of skip characters:
            // the character here is skippable and the one before it (if any) is not.
            bool precededBySkipCharacter = index && isSkipCharacter(text[index - 1]);
            if (precededBySkipCharacter || !isSkipCharacter(text[index]))
                break;
        }
        return index;
    }

    for (;;) {
        index = textBreakPreceding(iterator, index);
        if (index == UBRK_DONE)
            index = 0;
        if (index <= 0)
            break;

        // Step again only when this boundary is on a skip character that is not preceded by whitespace.
        if (!isSkipCharacter(text[index]) || isWhitespaceCharacter(text[index - 1]))
            break;
    }
    return index;
#endif
}
264
265}
266