1/*
2***************************************************************************
3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.                 *
4***************************************************************************
5*/
6
7#include "unicode/utypes.h"
8
9#if !UCONFIG_NO_BREAK_ITERATION
10
11#include "rbtok.h"
12#include "unicode/ustring.h"
13#include "unicode/utext.h"
14#include "rbbidata.h"
15
16U_NAMESPACE_BEGIN
17
18
19#if defined(__GNUC__) && (__GNUC__ >= 4)
20#pragma GCC optimization_level 3
21#endif
22
// State number the state machine is placed in at the start of every token.
static const int16_t START_STATE = 1;

// State-transition value meaning "stop": no longer match is possible.
static const int16_t STOP_STATE = 0;
25
26int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags)
27{
28    RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens;
29    RuleBasedTokenRange *outTokenP = outTokenRanges;
30    int32_t             state;
31    int16_t             category;
32
33    const RBBIStateTableRow  *row;
34    const RBBIStateTableRow  *const startRow = fStartRow;
35
36    int32_t             lastAcceptingState = 0;
37    UChar32             c = 0;
38    signed long         prev;
39    signed long         result;
40    const char         *const tableData       = fData->fForwardTable->fTableData;
41    const uint32_t            tableRowLen     = fData->fForwardTable->fRowLen;
42    UText *text = fText;
43
44    #ifdef RBBI_DEBUG
45        if (fTrace) {
46            RBBIDebugPuts("Handle Next   pos   char  state category");
47        }
48    #endif
49
50    fLastStatusIndexValid = FALSE;
51
52    // if we're already at the end of the text, return DONE.
53    prev = (signed long)UTEXT_GETNATIVEINDEX(text);
54
55    // loop until we reach the end of the text or transition to state 0
56    //
57    const UTrie         *trie = &fData->fTrie;
58    while (outTokenP < outTokenLimit) {
59        c               = UTEXT_NEXT32(text);
60        if (c == U_SENTINEL)
61        {
62            goto exitTokenizer;
63        }
64        //  Set the initial state for the state machine
65        state = START_STATE;
66        row = startRow;
67
68        // if we have cached break positions and we're still in the range
69        // covered by them, just move one step forward in the cache
70        if (fCachedBreakPositions != NULL) {
71            if (fPositionInCache < fNumCachedBreakPositions - 1) {
72                ++fPositionInCache;
73                result = fCachedBreakPositions[fPositionInCache];
74                goto emitToken;
75            }
76            else {
77                reset();
78            }
79        }
80
81        while (c != U_SENTINEL) {
82            //
83            // Get the char category.  An incoming category of 1 or 2 means that
84            //      we are preset for doing the beginning or end of input, and
85            //      that we shouldn't get a category from an actual text input character.
86            //
87                // look up the current character's character category, which tells us
88                // which column in the state table to look at.
89                // Note:  the 16 in UTRIE_GET16 refers to the size of the data being returned,
90                //        not the size of the character going in, which is a UChar32.
91                //
92                if (__builtin_expect((c < 0x100), 1))
93                    category = fLatin1Cat[c];
94                else
95                    UTRIE_GET16(trie, c, category);
96
97                // Check the dictionary bit in the character's category.
98                //    Counter is only used by dictionary based iterators (subclasses).
99                //    Chars that need to be handled by a dictionary have a flag bit set
100                //    in their category values.
101                //
102                if (__builtin_expect((category & 0x4000) != 0, 0))  {
103                    fDictionaryCharCount++;
104                    //  And off the dictionary flag bit.
105                    category &= ~0x4000;
106                }
107
108            #ifdef RBBI_DEBUG
109                if (fTrace) {
110                    RBBIDebugPrintf("             %4d   ", utext_getNativeIndex(fText));
111                    if (0x20<=c && c<0x7f) {
112                        RBBIDebugPrintf("\"%c\"  ", c);
113                    } else {
114                        RBBIDebugPrintf("%5x  ", c);
115                    }
116                    RBBIDebugPrintf("%3d  %3d\n", state, category);
117                }
118            #endif
119
120            // State Transition - move machine to its next state
121            //
122            state = row->fNextState[category];
123            row = (const RBBIStateTableRow *) (tableData + tableRowLen * state);
124
125            if (row->fAccepting == -1) {
126                // Match found, common case.
127                    result = (signed long)UTEXT_GETNATIVEINDEX(text);
128                //fLastRuleStatusIndex = row->fTagIdx;   // Remember the break status (tag) values.
129                //lastStatusRow = row;
130                lastAcceptingState = state;
131            }
132
133            if (state == STOP_STATE) {
134                // This is the normal exit from the lookup state machine.
135                // We have advanced through the string until it is certain that no
136                //   longer match is possible, no matter what characters follow.
137                break;
138            }
139
140            // Advance to the next character.
141            // If this is a beginning-of-input loop iteration, don't advance
142            //    the input position.  The next iteration will be processing the
143            //    first real input character.
144                c = UTEXT_NEXT32(text);
145        }
146
147        if (fDictionaryCharCount > 0) {
148            result = (signed long) checkDictionary(prev, (int32_t) result, FALSE);
149        }
150
151emitToken:
152        // The state machine is done.  Check whether it found a match...
153
154        // Leave the iterator at our result position.
155        UTEXT_SETNATIVEINDEX(text, result);
156
157        RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)};
158        int32_t flags = fStateFlags[lastAcceptingState];
159
160        if (flags == -1)
161            goto skipToken;
162
163        *outTokenP++ = range;
164        if (outTokenFlags)
165        {
166            *outTokenFlags++ = (unsigned long) flags;
167        }
168
169        if (flags & 0x40000000)
170            goto exitTokenizer;
171
172skipToken:
173        prev = result;
174    }
175
176exitTokenizer:
177    return (outTokenP - outTokenRanges);
178}
179
180#if defined (__GNUC__) && (__GNUC__ >= 4)
181#pragma GCC optimization_level reset
182#endif
183
184void
185RuleBasedTokenizer::init()
186{
187    const RBBIStateTable *statetable = fData->fForwardTable;
188    setBreakType(UBRK_WORD);
189    fStartRow = (const RBBIStateTableRow *)
190        (statetable->fTableData + (statetable->fRowLen * START_STATE));
191    UChar i;
192    const UTrie         *trie = &fData->fTrie;
193    int16_t category;
194    fLatin1Cat = new int16_t[256];
195    for (i = 0; i < 256; ++i)
196    {
197        //UTRIE_GET16(trie, i, category);
198        //fLatin1Cat[i] = category;
199        fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i);
200    }
201    fStateFlags = new int32_t[statetable->fNumStates];
202    for (i = 0; i < statetable->fNumStates; ++i)
203    {
204        const RBBIStateTableRow *row = (const RBBIStateTableRow *)
205            (statetable->fTableData + (statetable->fRowLen * i));
206        int32_t flags = 0;
207        if (row->fAccepting == -1)
208        {
209            const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx);
210            const int32_t *valLimit = vals + 1;
211            valLimit += *vals++;
212            while (vals < valLimit)
213            {
214                int32_t val = *vals++;
215                if (val == 0)
216                {
217                    break;
218                }
219                else if (val > 0)
220                {
221                    flags |= val;
222                }
223                else
224                {
225                    flags = val;
226                    break;
227                }
228            }
229        }
230        fStateFlags[i] = flags;
231    }
232}
233
234RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err)
235    : RuleBasedBreakIterator(rules, parseErr, err)
236{
237    init();
238}
239
240RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status)
241    : RuleBasedBreakIterator((RBBIDataHeader *)data, status)
242{
243    init();
244}
245
246RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status)
247    : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status)
248{
249    init();
250}
251
252RuleBasedTokenizer::~RuleBasedTokenizer() {
253    delete [] fStateFlags;
254    delete [] fLatin1Cat;
255}
256
257U_NAMESPACE_END
258
259#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
260