1/**
2 *******************************************************************************
3 * Copyright (C) 2006-2014, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 *******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "brkeng.h"
13#include "dictbe.h"
14#include "unicode/uniset.h"
15#include "unicode/chariter.h"
16#include "unicode/ubrk.h"
17#include "uvector.h"
18#include "uassert.h"
19#include "unicode/normlzr.h"
20#include "cmemory.h"
21#include "dictionarydata.h"
22
23U_NAMESPACE_BEGIN
24
25/*
26 ******************************************************************
27 */
28
29DictionaryBreakEngine::DictionaryBreakEngine(uint32_t breakTypes) {
30    fTypes = breakTypes;
31}
32
33DictionaryBreakEngine::~DictionaryBreakEngine() {
34}
35
36UBool
37DictionaryBreakEngine::handles(UChar32 c, int32_t breakType) const {
38    return (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)
39            && fSet.contains(c));
40}
41
42int32_t
43DictionaryBreakEngine::findBreaks( UText *text,
44                                 int32_t startPos,
45                                 int32_t endPos,
46                                 UBool reverse,
47                                 int32_t breakType,
48                                 UStack &foundBreaks ) const {
49    int32_t result = 0;
50
51    // Find the span of characters included in the set.
52    //   The span to break begins at the current position in the text, and
53    //   extends towards the start or end of the text, depending on 'reverse'.
54
55    int32_t start = (int32_t)utext_getNativeIndex(text);
56    int32_t current;
57    int32_t rangeStart;
58    int32_t rangeEnd;
59    UChar32 c = utext_current32(text);
60    if (reverse) {
61        UBool   isDict = fSet.contains(c);
62        while((current = (int32_t)utext_getNativeIndex(text)) > startPos && isDict) {
63            c = utext_previous32(text);
64            isDict = fSet.contains(c);
65        }
66        rangeStart = (current < startPos) ? startPos : current+(isDict ? 0 : 1);
67        rangeEnd = start + 1;
68    }
69    else {
70        while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
71            utext_next32(text);         // TODO:  recast loop for postincrement
72            c = utext_current32(text);
73        }
74        rangeStart = start;
75        rangeEnd = current;
76    }
77    if (breakType >= 0 && breakType < 32 && (((uint32_t)1 << breakType) & fTypes)) {
78        result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
79        utext_setNativeIndex(text, current);
80    }
81
82    return result;
83}
84
85void
86DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
87    fSet = set;
88    // Compact for caching
89    fSet.compact();
90}
91
92/*
93 ******************************************************************
94 * PossibleWord
95 */
96
97// Helper class for improving readability of the Thai/Lao/Khmer word break
98// algorithm. The implementation is completely inline.
99
100// List size, limited by the maximum number of words in the dictionary
101// that form a nested sequence.
102#define POSSIBLE_WORD_LIST_MAX 20
103
104class PossibleWord {
105private:
106    // list of word candidate lengths, in increasing length order
107    int32_t   lengths[POSSIBLE_WORD_LIST_MAX];
108    int32_t   count;      // Count of candidates
109    int32_t   prefix;     // The longest match with a dictionary word
110    int32_t   offset;     // Offset in the text of these candidates
111    int       mark;       // The preferred candidate's offset
112    int       current;    // The candidate we're currently looking at
113
114public:
115    PossibleWord();
116    ~PossibleWord();
117
118    // Fill the list of candidates if needed, select the longest, and return the number found
119    int       candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
120
121    // Select the currently marked candidate, point after it in the text, and invalidate self
122    int32_t   acceptMarked( UText *text );
123
124    // Back up from the current candidate to the next shorter one; return TRUE if that exists
125    // and point the text after it
126    UBool     backUp( UText *text );
127
128    // Return the longest prefix this candidate location shares with a dictionary word
129    int32_t   longestPrefix();
130
131    // Mark the current candidate as the one we like
132    void      markCurrent();
133};
134
135inline
136PossibleWord::PossibleWord() {
137    offset = -1;
138}
139
140inline
141PossibleWord::~PossibleWord() {
142}
143
144inline int
145PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
146    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
147    int32_t start = (int32_t)utext_getNativeIndex(text);
148    if (start != offset) {
149        offset = start;
150        prefix = dict->matches(text, rangeEnd-start, lengths, count, sizeof(lengths)/sizeof(lengths[0]));
151        // Dictionary leaves text after longest prefix, not longest word. Back up.
152        if (count <= 0) {
153            utext_setNativeIndex(text, start);
154        }
155    }
156    if (count > 0) {
157        utext_setNativeIndex(text, start+lengths[count-1]);
158    }
159    current = count-1;
160    mark = current;
161    return count;
162}
163
164inline int32_t
165PossibleWord::acceptMarked( UText *text ) {
166    utext_setNativeIndex(text, offset + lengths[mark]);
167    return lengths[mark];
168}
169
170inline UBool
171PossibleWord::backUp( UText *text ) {
172    if (current > 0) {
173        utext_setNativeIndex(text, offset + lengths[--current]);
174        return TRUE;
175    }
176    return FALSE;
177}
178
179inline int32_t
180PossibleWord::longestPrefix() {
181    return prefix;
182}
183
184inline void
185PossibleWord::markCurrent() {
186    mark = current;
187}
188
189/*
190 ******************************************************************
191 * ThaiBreakEngine
192 */
193
194// How many words in a row are "good enough"?
195#define THAI_LOOKAHEAD 3
196
197// Will not combine a non-word with a preceding dictionary word longer than this
198#define THAI_ROOT_COMBINE_THRESHOLD 3
199
200// Will not combine a non-word that shares at least this much prefix with a
201// dictionary word, with a preceding word
202#define THAI_PREFIX_COMBINE_THRESHOLD 3
203
204// Ellision character
205#define THAI_PAIYANNOI 0x0E2F
206
207// Repeat character
208#define THAI_MAIYAMOK 0x0E46
209
210// Minimum word size
211#define THAI_MIN_WORD 2
212
213// Minimum number of characters for two words
214#define THAI_MIN_WORD_SPAN (THAI_MIN_WORD * 2)
215
216ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
217    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
218      fDictionary(adoptDictionary)
219{
220    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
221    if (U_SUCCESS(status)) {
222        setCharacters(fThaiWordSet);
223    }
224    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
225    fMarkSet.add(0x0020);
226    fEndWordSet = fThaiWordSet;
227    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
228    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
229    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
230    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
231    fSuffixSet.add(THAI_PAIYANNOI);
232    fSuffixSet.add(THAI_MAIYAMOK);
233
234    // Compact for caching.
235    fMarkSet.compact();
236    fEndWordSet.compact();
237    fBeginWordSet.compact();
238    fSuffixSet.compact();
239}
240
241ThaiBreakEngine::~ThaiBreakEngine() {
242    delete fDictionary;
243}
244
245int32_t
246ThaiBreakEngine::divideUpDictionaryRange( UText *text,
247                                                int32_t rangeStart,
248                                                int32_t rangeEnd,
249                                                UStack &foundBreaks ) const {
250    if ((rangeEnd - rangeStart) < THAI_MIN_WORD_SPAN) {
251        return 0;       // Not enough characters for two words
252    }
253
254    uint32_t wordsFound = 0;
255    int32_t wordLength;
256    int32_t current;
257    UErrorCode status = U_ZERO_ERROR;
258    PossibleWord words[THAI_LOOKAHEAD];
259    UChar32 uc;
260
261    utext_setNativeIndex(text, rangeStart);
262
263    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
264        wordLength = 0;
265
266        // Look for candidate words at the current position
267        int candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
268
269        // If we found exactly one, use that
270        if (candidates == 1) {
271            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
272            wordsFound += 1;
273        }
274        // If there was more than one, see which one can take us forward the most words
275        else if (candidates > 1) {
276            // If we're already at the end of the range, we're done
277            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
278                goto foundBest;
279            }
280            do {
281                int wordsMatched = 1;
282                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
283                    if (wordsMatched < 2) {
284                        // Followed by another dictionary word; mark first word as a good candidate
285                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
286                        wordsMatched = 2;
287                    }
288
289                    // If we're already at the end of the range, we're done
290                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
291                        goto foundBest;
292                    }
293
294                    // See if any of the possible second words is followed by a third word
295                    do {
296                        // If we find a third word, stop right away
297                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
298                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
299                            goto foundBest;
300                        }
301                    }
302                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
303                }
304            }
305            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
306foundBest:
307            wordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
308            wordsFound += 1;
309        }
310
311        // We come here after having either found a word or not. We look ahead to the
312        // next word. If it's not a dictionary word, we will combine it withe the word we
313        // just found (if there is one), but only if the preceding word does not exceed
314        // the threshold.
315        // The text iterator should now be positioned at the end of the word we found.
316        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < THAI_ROOT_COMBINE_THRESHOLD) {
317            // if it is a dictionary word, do nothing. If it isn't, then if there is
318            // no preceding word, or the non-word shares less than the minimum threshold
319            // of characters with a dictionary word, then scan to resynchronize
320            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
321                  && (wordLength == 0
322                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
323                // Look for a plausible word boundary
324                //TODO: This section will need a rework for UText.
325                int32_t remaining = rangeEnd - (current+wordLength);
326                UChar32 pc = utext_current32(text);
327                int32_t chars = 0;
328                for (;;) {
329                    utext_next32(text);
330                    uc = utext_current32(text);
331                    // TODO: Here we're counting on the fact that the SA languages are all
332                    // in the BMP. This should get fixed with the UText rework.
333                    chars += 1;
334                    if (--remaining <= 0) {
335                        break;
336                    }
337                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
338                        // Maybe. See if it's in the dictionary.
339                        // NOTE: In the original Apple code, checked that the next
340                        // two characters after uc were not 0x0E4C THANTHAKHAT before
341                        // checking the dictionary. That is just a performance filter,
342                        // but it's not clear it's faster than checking the trie.
343                        int candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
344                        utext_setNativeIndex(text, current + wordLength + chars);
345                        if (candidates > 0) {
346                            break;
347                        }
348                    }
349                    pc = uc;
350                }
351
352                // Bump the word count if there wasn't already one
353                if (wordLength <= 0) {
354                    wordsFound += 1;
355                }
356
357                // Update the length with the passed-over characters
358                wordLength += chars;
359            }
360            else {
361                // Back up to where we were for next iteration
362                utext_setNativeIndex(text, current+wordLength);
363            }
364        }
365
366        // Never stop before a combining mark.
367        int32_t currPos;
368        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
369            utext_next32(text);
370            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
371        }
372
373        // Look ahead for possible suffixes if a dictionary word does not follow.
374        // We do this in code rather than using a rule so that the heuristic
375        // resynch continues to function. For example, one of the suffix characters
376        // could be a typo in the middle of a word.
377        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
378            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
379                && fSuffixSet.contains(uc = utext_current32(text))) {
380                if (uc == THAI_PAIYANNOI) {
381                    if (!fSuffixSet.contains(utext_previous32(text))) {
382                        // Skip over previous end and PAIYANNOI
383                        utext_next32(text);
384                        utext_next32(text);
385                        wordLength += 1;            // Add PAIYANNOI to word
386                        uc = utext_current32(text);     // Fetch next character
387                    }
388                    else {
389                        // Restore prior position
390                        utext_next32(text);
391                    }
392                }
393                if (uc == THAI_MAIYAMOK) {
394                    if (utext_previous32(text) != THAI_MAIYAMOK) {
395                        // Skip over previous end and MAIYAMOK
396                        utext_next32(text);
397                        utext_next32(text);
398                        wordLength += 1;            // Add MAIYAMOK to word
399                    }
400                    else {
401                        // Restore prior position
402                        utext_next32(text);
403                    }
404                }
405            }
406            else {
407                utext_setNativeIndex(text, current+wordLength);
408            }
409        }
410
411        // Did we find a word on this iteration? If so, push it on the break stack
412        if (wordLength > 0) {
413            foundBreaks.push((current+wordLength), status);
414        }
415    }
416
417    // Don't return a break for the end of the dictionary range if there is one there.
418    if (foundBreaks.peeki() >= rangeEnd) {
419        (void) foundBreaks.popi();
420        wordsFound -= 1;
421    }
422
423    return wordsFound;
424}
425
426/*
427 ******************************************************************
428 * LaoBreakEngine
429 */
430
431// How many words in a row are "good enough"?
432#define LAO_LOOKAHEAD 3
433
434// Will not combine a non-word with a preceding dictionary word longer than this
435#define LAO_ROOT_COMBINE_THRESHOLD 3
436
437// Will not combine a non-word that shares at least this much prefix with a
438// dictionary word, with a preceding word
439#define LAO_PREFIX_COMBINE_THRESHOLD 3
440
441// Minimum word size
442#define LAO_MIN_WORD 2
443
444// Minimum number of characters for two words
445#define LAO_MIN_WORD_SPAN (LAO_MIN_WORD * 2)
446
447LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
448    : DictionaryBreakEngine((1<<UBRK_WORD) | (1<<UBRK_LINE)),
449      fDictionary(adoptDictionary)
450{
451    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
452    if (U_SUCCESS(status)) {
453        setCharacters(fLaoWordSet);
454    }
455    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
456    fMarkSet.add(0x0020);
457    fEndWordSet = fLaoWordSet;
458    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
459    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
460    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
461    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
462
463    // Compact for caching.
464    fMarkSet.compact();
465    fEndWordSet.compact();
466    fBeginWordSet.compact();
467}
468
469LaoBreakEngine::~LaoBreakEngine() {
470    delete fDictionary;
471}
472
473int32_t
474LaoBreakEngine::divideUpDictionaryRange( UText *text,
475                                                int32_t rangeStart,
476                                                int32_t rangeEnd,
477                                                UStack &foundBreaks ) const {
478    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
479        return 0;       // Not enough characters for two words
480    }
481
482    uint32_t wordsFound = 0;
483    int32_t wordLength;
484    int32_t current;
485    UErrorCode status = U_ZERO_ERROR;
486    PossibleWord words[LAO_LOOKAHEAD];
487    UChar32 uc;
488
489    utext_setNativeIndex(text, rangeStart);
490
491    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
492        wordLength = 0;
493
494        // Look for candidate words at the current position
495        int candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
496
497        // If we found exactly one, use that
498        if (candidates == 1) {
499            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
500            wordsFound += 1;
501        }
502        // If there was more than one, see which one can take us forward the most words
503        else if (candidates > 1) {
504            // If we're already at the end of the range, we're done
505            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
506                goto foundBest;
507            }
508            do {
509                int wordsMatched = 1;
510                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
511                    if (wordsMatched < 2) {
512                        // Followed by another dictionary word; mark first word as a good candidate
513                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
514                        wordsMatched = 2;
515                    }
516
517                    // If we're already at the end of the range, we're done
518                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
519                        goto foundBest;
520                    }
521
522                    // See if any of the possible second words is followed by a third word
523                    do {
524                        // If we find a third word, stop right away
525                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
526                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
527                            goto foundBest;
528                        }
529                    }
530                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
531                }
532            }
533            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
534foundBest:
535            wordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
536            wordsFound += 1;
537        }
538
539        // We come here after having either found a word or not. We look ahead to the
540        // next word. If it's not a dictionary word, we will combine it withe the word we
541        // just found (if there is one), but only if the preceding word does not exceed
542        // the threshold.
543        // The text iterator should now be positioned at the end of the word we found.
544        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < LAO_ROOT_COMBINE_THRESHOLD) {
545            // if it is a dictionary word, do nothing. If it isn't, then if there is
546            // no preceding word, or the non-word shares less than the minimum threshold
547            // of characters with a dictionary word, then scan to resynchronize
548            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
549                  && (wordLength == 0
550                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
551                // Look for a plausible word boundary
552                //TODO: This section will need a rework for UText.
553                int32_t remaining = rangeEnd - (current+wordLength);
554                UChar32 pc = utext_current32(text);
555                int32_t chars = 0;
556                for (;;) {
557                    utext_next32(text);
558                    uc = utext_current32(text);
559                    // TODO: Here we're counting on the fact that the SA languages are all
560                    // in the BMP. This should get fixed with the UText rework.
561                    chars += 1;
562                    if (--remaining <= 0) {
563                        break;
564                    }
565                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
566                        // Maybe. See if it's in the dictionary.
567                        int candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
568                        utext_setNativeIndex(text, current + wordLength + chars);
569                        if (candidates > 0) {
570                            break;
571                        }
572                    }
573                    pc = uc;
574                }
575
576                // Bump the word count if there wasn't already one
577                if (wordLength <= 0) {
578                    wordsFound += 1;
579                }
580
581                // Update the length with the passed-over characters
582                wordLength += chars;
583            }
584            else {
585                // Back up to where we were for next iteration
586                utext_setNativeIndex(text, current+wordLength);
587            }
588        }
589
590        // Never stop before a combining mark.
591        int32_t currPos;
592        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
593            utext_next32(text);
594            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
595        }
596
597        // Look ahead for possible suffixes if a dictionary word does not follow.
598        // We do this in code rather than using a rule so that the heuristic
599        // resynch continues to function. For example, one of the suffix characters
600        // could be a typo in the middle of a word.
601        // NOT CURRENTLY APPLICABLE TO LAO
602
603        // Did we find a word on this iteration? If so, push it on the break stack
604        if (wordLength > 0) {
605            foundBreaks.push((current+wordLength), status);
606        }
607    }
608
609    // Don't return a break for the end of the dictionary range if there is one there.
610    if (foundBreaks.peeki() >= rangeEnd) {
611        (void) foundBreaks.popi();
612        wordsFound -= 1;
613    }
614
615    return wordsFound;
616}
617
618/*
619 ******************************************************************
620 * KhmerBreakEngine
621 */
622
623// How many words in a row are "good enough"?
624#define KHMER_LOOKAHEAD 3
625
626// Will not combine a non-word with a preceding dictionary word longer than this
627#define KHMER_ROOT_COMBINE_THRESHOLD 3
628
629// Will not combine a non-word that shares at least this much prefix with a
630// dictionary word, with a preceding word
631#define KHMER_PREFIX_COMBINE_THRESHOLD 3
632
633// Minimum word size
634#define KHMER_MIN_WORD 2
635
636// Minimum number of characters for two words
637#define KHMER_MIN_WORD_SPAN (KHMER_MIN_WORD * 2)
638
639KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
640    : DictionaryBreakEngine((1 << UBRK_WORD) | (1 << UBRK_LINE)),
641      fDictionary(adoptDictionary)
642{
643    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
644    if (U_SUCCESS(status)) {
645        setCharacters(fKhmerWordSet);
646    }
647    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
648    fMarkSet.add(0x0020);
649    fEndWordSet = fKhmerWordSet;
650    fBeginWordSet.add(0x1780, 0x17B3);
651    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
652    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
653    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
654    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
655    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
656//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
657//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
658//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
659//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
660//    fSuffixSet.add(THAI_PAIYANNOI);
661//    fSuffixSet.add(THAI_MAIYAMOK);
662
663    // Compact for caching.
664    fMarkSet.compact();
665    fEndWordSet.compact();
666    fBeginWordSet.compact();
667//    fSuffixSet.compact();
668}
669
670KhmerBreakEngine::~KhmerBreakEngine() {
671    delete fDictionary;
672}
673
674int32_t
675KhmerBreakEngine::divideUpDictionaryRange( UText *text,
676                                                int32_t rangeStart,
677                                                int32_t rangeEnd,
678                                                UStack &foundBreaks ) const {
679    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
680        return 0;       // Not enough characters for two words
681    }
682
683    uint32_t wordsFound = 0;
684    int32_t wordLength;
685    int32_t current;
686    UErrorCode status = U_ZERO_ERROR;
687    PossibleWord words[KHMER_LOOKAHEAD];
688    UChar32 uc;
689
690    utext_setNativeIndex(text, rangeStart);
691
692    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
693        wordLength = 0;
694
695        // Look for candidate words at the current position
696        int candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
697
698        // If we found exactly one, use that
699        if (candidates == 1) {
700            wordLength = words[wordsFound%KHMER_LOOKAHEAD].acceptMarked(text);
701            wordsFound += 1;
702        }
703
704        // If there was more than one, see which one can take us forward the most words
705        else if (candidates > 1) {
706            // If we're already at the end of the range, we're done
707            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
708                goto foundBest;
709            }
710            do {
711                int wordsMatched = 1;
712                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
713                    if (wordsMatched < 2) {
714                        // Followed by another dictionary word; mark first word as a good candidate
715                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
716                        wordsMatched = 2;
717                    }
718
719                    // If we're already at the end of the range, we're done
720                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
721                        goto foundBest;
722                    }
723
724                    // See if any of the possible second words is followed by a third word
725                    do {
726                        // If we find a third word, stop right away
727                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
728                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
729                            goto foundBest;
730                        }
731                    }
732                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
733                }
734            }
735            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
736foundBest:
737            wordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
738            wordsFound += 1;
739        }
740
741        // We come here after having either found a word or not. We look ahead to the
742        // next word. If it's not a dictionary word, we will combine it with the word we
743        // just found (if there is one), but only if the preceding word does not exceed
744        // the threshold.
745        // The text iterator should now be positioned at the end of the word we found.
746        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
747            // if it is a dictionary word, do nothing. If it isn't, then if there is
748            // no preceding word, or the non-word shares less than the minimum threshold
749            // of characters with a dictionary word, then scan to resynchronize
750            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
751                  && (wordLength == 0
752                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
753                // Look for a plausible word boundary
754                //TODO: This section will need a rework for UText.
755                int32_t remaining = rangeEnd - (current+wordLength);
756                UChar32 pc = utext_current32(text);
757                int32_t chars = 0;
758                for (;;) {
759                    utext_next32(text);
760                    uc = utext_current32(text);
761                    // TODO: Here we're counting on the fact that the SA languages are all
762                    // in the BMP. This should get fixed with the UText rework.
763                    chars += 1;
764                    if (--remaining <= 0) {
765                        break;
766                    }
767                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
768                        // Maybe. See if it's in the dictionary.
769                        int candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
770                        utext_setNativeIndex(text, current+wordLength+chars);
771                        if (candidates > 0) {
772                            break;
773                        }
774                    }
775                    pc = uc;
776                }
777
778                // Bump the word count if there wasn't already one
779                if (wordLength <= 0) {
780                    wordsFound += 1;
781                }
782
783                // Update the length with the passed-over characters
784                wordLength += chars;
785            }
786            else {
787                // Back up to where we were for next iteration
788                utext_setNativeIndex(text, current+wordLength);
789            }
790        }
791
792        // Never stop before a combining mark.
793        int32_t currPos;
794        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
795            utext_next32(text);
796            wordLength += (int32_t)utext_getNativeIndex(text) - currPos;
797        }
798
799        // Look ahead for possible suffixes if a dictionary word does not follow.
800        // We do this in code rather than using a rule so that the heuristic
801        // resynch continues to function. For example, one of the suffix characters
802        // could be a typo in the middle of a word.
803//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
804//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
805//                && fSuffixSet.contains(uc = utext_current32(text))) {
806//                if (uc == KHMER_PAIYANNOI) {
807//                    if (!fSuffixSet.contains(utext_previous32(text))) {
808//                        // Skip over previous end and PAIYANNOI
809//                        utext_next32(text);
810//                        utext_next32(text);
811//                        wordLength += 1;            // Add PAIYANNOI to word
812//                        uc = utext_current32(text);     // Fetch next character
813//                    }
814//                    else {
815//                        // Restore prior position
816//                        utext_next32(text);
817//                    }
818//                }
819//                if (uc == KHMER_MAIYAMOK) {
820//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
821//                        // Skip over previous end and MAIYAMOK
822//                        utext_next32(text);
823//                        utext_next32(text);
824//                        wordLength += 1;            // Add MAIYAMOK to word
825//                    }
826//                    else {
827//                        // Restore prior position
828//                        utext_next32(text);
829//                    }
830//                }
831//            }
832//            else {
833//                utext_setNativeIndex(text, current+wordLength);
834//            }
835//        }
836
837        // Did we find a word on this iteration? If so, push it on the break stack
838        if (wordLength > 0) {
839            foundBreaks.push((current+wordLength), status);
840        }
841    }
842
843    // Don't return a break for the end of the dictionary range if there is one there.
844    if (foundBreaks.peeki() >= rangeEnd) {
845        (void) foundBreaks.popi();
846        wordsFound -= 1;
847    }
848
849    return wordsFound;
850}
851
852#if !UCONFIG_NO_NORMALIZATION
853/*
854 ******************************************************************
855 * CjkBreakEngine
856 */
857static const uint32_t kuint32max = 0xFFFFFFFF;
858CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
859: DictionaryBreakEngine(1 << UBRK_WORD), fDictionary(adoptDictionary) {
860    // Korean dictionary only includes Hangul syllables
861    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
862    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
863    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
864    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
865
866    if (U_SUCCESS(status)) {
867        // handle Korean and Japanese/Chinese using different dictionaries
868        if (type == kKorean) {
869            setCharacters(fHangulWordSet);
870        } else { //Chinese and Japanese
871            UnicodeSet cjSet;
872            cjSet.addAll(fHanWordSet);
873            cjSet.addAll(fKatakanaWordSet);
874            cjSet.addAll(fHiraganaWordSet);
875            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
876            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
877            setCharacters(cjSet);
878        }
879    }
880}
881
882CjkBreakEngine::~CjkBreakEngine(){
883    delete fDictionary;
884}
885
886// The katakanaCost values below are based on the length frequencies of all
887// katakana phrases in the dictionary
888static const int kMaxKatakanaLength = 8;
889static const int kMaxKatakanaGroupLength = 20;
890static const uint32_t maxSnlp = 255;
891
892static inline uint32_t getKatakanaCost(int wordLength){
893    //TODO: fill array with actual values from dictionary!
894    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
895                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
896    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
897}
898
899static inline bool isKatakana(uint16_t value) {
900    return (value >= 0x30A1u && value <= 0x30FEu && value != 0x30FBu) ||
901            (value >= 0xFF66u && value <= 0xFF9fu);
902}
903
904// A very simple helper class to streamline the buffer handling in
905// divideUpDictionaryRange.
906template<class T, size_t N>
907class AutoBuffer {
908public:
909    AutoBuffer(size_t size) : buffer(stackBuffer), capacity(N) {
910        if (size > N) {
911            buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
912            capacity = size;
913        }
914    }
915    ~AutoBuffer() {
916        if (buffer != stackBuffer)
917            uprv_free(buffer);
918    }
919
920    T* elems() {
921        return buffer;
922    }
923
924    const T& operator[] (size_t i) const {
925        return buffer[i];
926    }
927
928    T& operator[] (size_t i) {
929        return buffer[i];
930    }
931
932    // resize without copy
933    void resize(size_t size) {
934        if (size <= capacity)
935            return;
936        if (buffer != stackBuffer)
937            uprv_free(buffer);
938        buffer = reinterpret_cast<T*>(uprv_malloc(sizeof(T)*size));
939        capacity = size;
940    }
941
942private:
943    T stackBuffer[N];
944    T* buffer;
945    AutoBuffer();
946    size_t capacity;
947};
948
949
950/*
951 * @param text A UText representing the text
952 * @param rangeStart The start of the range of dictionary characters
953 * @param rangeEnd The end of the range of dictionary characters
954 * @param foundBreaks Output of C array of int32_t break positions, or 0
955 * @return The number of breaks found
956 */
957int32_t
958CjkBreakEngine::divideUpDictionaryRange( UText *text,
959        int32_t rangeStart,
960        int32_t rangeEnd,
961        UStack &foundBreaks ) const {
962    if (rangeStart >= rangeEnd) {
963        return 0;
964    }
965
966    const size_t defaultInputLength = 80;
967    size_t inputLength = rangeEnd - rangeStart;
968    // TODO: Replace by UnicodeString.
969    AutoBuffer<UChar, defaultInputLength> charString(inputLength);
970
971    // Normalize the input string and put it in normalizedText.
972    // The map from the indices of the normalized input to the raw
973    // input is kept in charPositions.
974    UErrorCode status = U_ZERO_ERROR;
975    utext_extract(text, rangeStart, rangeEnd, charString.elems(), inputLength, &status);
976    if (U_FAILURE(status)) {
977        return 0;
978    }
979
980    UnicodeString inputString(charString.elems(), inputLength);
981    // TODO: Use Normalizer2.
982    UNormalizationMode norm_mode = UNORM_NFKC;
983    UBool isNormalized =
984        Normalizer::quickCheck(inputString, norm_mode, status) == UNORM_YES ||
985        Normalizer::isNormalized(inputString, norm_mode, status);
986
987    // TODO: Replace by UVector32.
988    AutoBuffer<int32_t, defaultInputLength> charPositions(inputLength + 1);
989    int numChars = 0;
990    UText normalizedText = UTEXT_INITIALIZER;
991    // Needs to be declared here because normalizedText holds onto its buffer.
992    UnicodeString normalizedString;
993    if (isNormalized) {
994        int32_t index = 0;
995        charPositions[0] = 0;
996        while(index < inputString.length()) {
997            index = inputString.moveIndex32(index, 1);
998            charPositions[++numChars] = index;
999        }
1000        utext_openUnicodeString(&normalizedText, &inputString, &status);
1001    }
1002    else {
1003        Normalizer::normalize(inputString, norm_mode, 0, normalizedString, status);
1004        if (U_FAILURE(status)) {
1005            return 0;
1006        }
1007        charPositions.resize(normalizedString.length() + 1);
1008        Normalizer normalizer(charString.elems(), inputLength, norm_mode);
1009        int32_t index = 0;
1010        charPositions[0] = 0;
1011        while(index < normalizer.endIndex()){
1012            /* UChar32 uc = */ normalizer.next();
1013            charPositions[++numChars] = index = normalizer.getIndex();
1014        }
1015        utext_openUnicodeString(&normalizedText, &normalizedString, &status);
1016    }
1017
1018    if (U_FAILURE(status)) {
1019        return 0;
1020    }
1021
1022    // From this point on, all the indices refer to the indices of
1023    // the normalized input string.
1024
1025    // bestSnlp[i] is the snlp of the best segmentation of the first i
1026    // characters in the range to be matched.
1027    // TODO: Replace by UVector32.
1028    AutoBuffer<uint32_t, defaultInputLength> bestSnlp(numChars + 1);
1029    bestSnlp[0] = 0;
1030    for(int i = 1; i <= numChars; i++) {
1031        bestSnlp[i] = kuint32max;
1032    }
1033
1034    // prev[i] is the index of the last CJK character in the previous word in
1035    // the best segmentation of the first i characters.
1036    // TODO: Replace by UVector32.
1037    AutoBuffer<int, defaultInputLength> prev(numChars + 1);
1038    for(int i = 0; i <= numChars; i++){
1039        prev[i] = -1;
1040    }
1041
1042    const size_t maxWordSize = 20;
1043    // TODO: Replace both with UVector32.
1044    AutoBuffer<int32_t, maxWordSize> values(numChars);
1045    AutoBuffer<int32_t, maxWordSize> lengths(numChars);
1046
1047    // Dynamic programming to find the best segmentation.
1048    bool is_prev_katakana = false;
1049    for (int32_t i = 0; i < numChars; ++i) {
1050        //utext_setNativeIndex(text, rangeStart + i);
1051        utext_setNativeIndex(&normalizedText, i);
1052        if (bestSnlp[i] == kuint32max)
1053            continue;
1054
1055        int32_t count;
1056        // limit maximum word length matched to size of current substring
1057        int32_t maxSearchLength = (i + maxWordSize < (size_t) numChars)? maxWordSize : (numChars - i);
1058
1059        fDictionary->matches(&normalizedText, maxSearchLength, lengths.elems(), count, maxSearchLength, values.elems());
1060
1061        // if there are no single character matches found in the dictionary
1062        // starting with this charcter, treat character as a 1-character word
1063        // with the highest value possible, i.e. the least likely to occur.
1064        // Exclude Korean characters from this treatment, as they should be left
1065        // together by default.
1066        if((count == 0 || lengths[0] != 1) &&
1067                !fHangulWordSet.contains(utext_current32(&normalizedText))) {
1068            values[count] = maxSnlp;
1069            lengths[count++] = 1;
1070        }
1071
1072        for (int j = 0; j < count; j++) {
1073            uint32_t newSnlp = bestSnlp[i] + values[j];
1074            if (newSnlp < bestSnlp[lengths[j] + i]) {
1075                bestSnlp[lengths[j] + i] = newSnlp;
1076                prev[lengths[j] + i] = i;
1077            }
1078        }
1079
1080        // In Japanese,
1081        // Katakana word in single character is pretty rare. So we apply
1082        // the following heuristic to Katakana: any continuous run of Katakana
1083        // characters is considered a candidate word with a default cost
1084        // specified in the katakanaCost table according to its length.
1085        //utext_setNativeIndex(text, rangeStart + i);
1086        utext_setNativeIndex(&normalizedText, i);
1087        bool is_katakana = isKatakana(utext_current32(&normalizedText));
1088        if (!is_prev_katakana && is_katakana) {
1089            int j = i + 1;
1090            utext_next32(&normalizedText);
1091            // Find the end of the continuous run of Katakana characters
1092            while (j < numChars && (j - i) < kMaxKatakanaGroupLength &&
1093                    isKatakana(utext_current32(&normalizedText))) {
1094                utext_next32(&normalizedText);
1095                ++j;
1096            }
1097            if ((j - i) < kMaxKatakanaGroupLength) {
1098                uint32_t newSnlp = bestSnlp[i] + getKatakanaCost(j - i);
1099                if (newSnlp < bestSnlp[j]) {
1100                    bestSnlp[j] = newSnlp;
1101                    prev[j] = i;
1102                }
1103            }
1104        }
1105        is_prev_katakana = is_katakana;
1106    }
1107
1108    // Start pushing the optimal offset index into t_boundary (t for tentative).
1109    // prev[numChars] is guaranteed to be meaningful.
1110    // We'll first push in the reverse order, i.e.,
1111    // t_boundary[0] = numChars, and afterwards do a swap.
1112    // TODO: Replace by UVector32.
1113    AutoBuffer<int, maxWordSize> t_boundary(numChars + 1);
1114
1115    int numBreaks = 0;
1116    // No segmentation found, set boundary to end of range
1117    if (bestSnlp[numChars] == kuint32max) {
1118        t_boundary[numBreaks++] = numChars;
1119    } else {
1120        for (int i = numChars; i > 0; i = prev[i]) {
1121            t_boundary[numBreaks++] = i;
1122        }
1123        U_ASSERT(prev[t_boundary[numBreaks - 1]] == 0);
1124    }
1125
1126    // Reverse offset index in t_boundary.
1127    // Don't add a break for the start of the dictionary range if there is one
1128    // there already.
1129    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1130        t_boundary[numBreaks++] = 0;
1131    }
1132
1133    // Now that we're done, convert positions in t_bdry[] (indices in
1134    // the normalized input string) back to indices in the raw input string
1135    // while reversing t_bdry and pushing values to foundBreaks.
1136    for (int i = numBreaks-1; i >= 0; i--) {
1137        foundBreaks.push(charPositions[t_boundary[i]] + rangeStart, status);
1138    }
1139
1140    utext_close(&normalizedText);
1141    return numBreaks;
1142}
1143#endif
1144
1145U_NAMESPACE_END
1146
1147#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1148
1149