1/* 2*************************************************************************** 3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. * 4*************************************************************************** 5*/ 6 7#include "unicode/utypes.h" 8 9#if !UCONFIG_NO_BREAK_ITERATION 10 11#include "rbtok.h" 12#include "unicode/ustring.h" 13#include "unicode/utext.h" 14#include "rbbidata.h" 15 16U_NAMESPACE_BEGIN 17 18 19#if defined(__GNUC__) && (__GNUC__ >= 4) 20#pragma GCC optimization_level 3 21#endif 22 23static const int16_t START_STATE = 1; // The state number of the starting state 24static const int16_t STOP_STATE = 0; // The state-transition value indicating "stop" 25 26int32_t RuleBasedTokenizer::tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags) 27{ 28 RuleBasedTokenRange *outTokenLimit = outTokenRanges + maxTokens; 29 RuleBasedTokenRange *outTokenP = outTokenRanges; 30 int32_t state; 31 int16_t category; 32 33 const RBBIStateTableRow *row; 34 const RBBIStateTableRow *const startRow = fStartRow; 35 36 int32_t lastAcceptingState = 0; 37 UChar32 c = 0; 38 signed long prev; 39 signed long result; 40 const char *const tableData = fData->fForwardTable->fTableData; 41 const uint32_t tableRowLen = fData->fForwardTable->fRowLen; 42 UText *text = fText; 43 44 #ifdef RBBI_DEBUG 45 if (fTrace) { 46 RBBIDebugPuts("Handle Next pos char state category"); 47 } 48 #endif 49 50 fLastStatusIndexValid = FALSE; 51 52 // if we're already at the end of the text, return DONE. 53 prev = (signed long)UTEXT_GETNATIVEINDEX(text); 54 55 // loop until we reach the end of the text or transition to state 0 56 // 57 const UTrie *trie = &fData->fTrie; 58 while (outTokenP < outTokenLimit) { 59 c = UTEXT_NEXT32(text); 60 if (c == U_SENTINEL) 61 { 62 goto exitTokenizer; 63 } 64 // Set the initial state for the state machine 65 state = START_STATE; 66 row = startRow; 67 68 // if we have cached break positions and we're still in the range 69 // covered by them, just move one step forward in the cache 70 if (fCachedBreakPositions != NULL) { 71 if (fPositionInCache < fNumCachedBreakPositions - 1) { 72 ++fPositionInCache; 73 result = fCachedBreakPositions[fPositionInCache]; 74 goto emitToken; 75 } 76 else { 77 reset(); 78 } 79 } 80 81 while (c != U_SENTINEL) { 82 // 83 // Get the char category. An incoming category of 1 or 2 means that 84 // we are preset for doing the beginning or end of input, and 85 // that we shouldn't get a category from an actual text input character. 86 // 87 // look up the current character's character category, which tells us 88 // which column in the state table to look at. 89 // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, 90 // not the size of the character going in, which is a UChar32. 91 // 92 if (__builtin_expect((c < 0x100), 1)) 93 category = fLatin1Cat[c]; 94 else 95 UTRIE_GET16(trie, c, category); 96 97 // Check the dictionary bit in the character's category. 98 // Counter is only used by dictionary based iterators (subclasses). 99 // Chars that need to be handled by a dictionary have a flag bit set 100 // in their category values. 101 // 102 if (__builtin_expect((category & 0x4000) != 0, 0)) { 103 fDictionaryCharCount++; 104 // And off the dictionary flag bit. 105 category &= ~0x4000; 106 } 107 108 #ifdef RBBI_DEBUG 109 if (fTrace) { 110 RBBIDebugPrintf(" %4d ", utext_getNativeIndex(fText)); 111 if (0x20<=c && c<0x7f) { 112 RBBIDebugPrintf("\"%c\" ", c); 113 } else { 114 RBBIDebugPrintf("%5x ", c); 115 } 116 RBBIDebugPrintf("%3d %3d\n", state, category); 117 } 118 #endif 119 120 // State Transition - move machine to its next state 121 // 122 state = row->fNextState[category]; 123 row = (const RBBIStateTableRow *) (tableData + tableRowLen * state); 124 125 if (row->fAccepting == -1) { 126 // Match found, common case. 127 result = (signed long)UTEXT_GETNATIVEINDEX(text); 128 //fLastRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. 129 //lastStatusRow = row; 130 lastAcceptingState = state; 131 } 132 133 if (state == STOP_STATE) { 134 // This is the normal exit from the lookup state machine. 135 // We have advanced through the string until it is certain that no 136 // longer match is possible, no matter what characters follow. 137 break; 138 } 139 140 // Advance to the next character. 141 // If this is a beginning-of-input loop iteration, don't advance 142 // the input position. The next iteration will be processing the 143 // first real input character. 144 c = UTEXT_NEXT32(text); 145 } 146 147 if (fDictionaryCharCount > 0) { 148 result = (signed long) checkDictionary(prev, (int32_t) result, FALSE); 149 } 150 151emitToken: 152 // The state machine is done. Check whether it found a match... 153 154 // Leave the iterator at our result position. 155 UTEXT_SETNATIVEINDEX(text, result); 156 157 RuleBasedTokenRange range = {(signed long)prev, (signed long) (result-prev)}; 158 int32_t flags = fStateFlags[lastAcceptingState]; 159 160 if (flags == -1) 161 goto skipToken; 162 163 *outTokenP++ = range; 164 if (outTokenFlags) 165 { 166 *outTokenFlags++ = (unsigned long) flags; 167 } 168 169 if (flags & 0x40000000) 170 goto exitTokenizer; 171 172skipToken: 173 prev = result; 174 } 175 176exitTokenizer: 177 return (outTokenP - outTokenRanges); 178} 179 180#if defined (__GNUC__) && (__GNUC__ >= 4) 181#pragma GCC optimization_level reset 182#endif 183 184void 185RuleBasedTokenizer::init() 186{ 187 const RBBIStateTable *statetable = fData->fForwardTable; 188 setBreakType(UBRK_WORD); 189 fStartRow = (const RBBIStateTableRow *) 190 (statetable->fTableData + (statetable->fRowLen * START_STATE)); 191 UChar i; 192 const UTrie *trie = &fData->fTrie; 193 int16_t category; 194 fLatin1Cat = new int16_t[256]; 195 for (i = 0; i < 256; ++i) 196 { 197 //UTRIE_GET16(trie, i, category); 198 //fLatin1Cat[i] = category; 199 fLatin1Cat[i] = _UTRIE_GET_RAW(trie, index, 0, i); 200 } 201 fStateFlags = new int32_t[statetable->fNumStates]; 202 for (i = 0; i < statetable->fNumStates; ++i) 203 { 204 const RBBIStateTableRow *row = (const RBBIStateTableRow *) 205 (statetable->fTableData + (statetable->fRowLen * i)); 206 int32_t flags = 0; 207 if (row->fAccepting == -1) 208 { 209 const int32_t *vals = (fData->fRuleStatusTable) + (row->fTagIdx); 210 const int32_t *valLimit = vals + 1; 211 valLimit += *vals++; 212 while (vals < valLimit) 213 { 214 int32_t val = *vals++; 215 if (val == 0) 216 { 217 break; 218 } 219 else if (val > 0) 220 { 221 flags |= val; 222 } 223 else 224 { 225 flags = val; 226 break; 227 } 228 } 229 } 230 fStateFlags[i] = flags; 231 } 232} 233 234RuleBasedTokenizer::RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &err) 235 : RuleBasedBreakIterator(rules, parseErr, err) 236{ 237 init(); 238} 239 240RuleBasedTokenizer::RuleBasedTokenizer(uint8_t *data, UErrorCode &status) 241 : RuleBasedBreakIterator((RBBIDataHeader *)data, status) 242{ 243 init(); 244} 245 246RuleBasedTokenizer::RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt, UErrorCode &status) 247 : RuleBasedBreakIterator((const RBBIDataHeader *)data, RuleBasedBreakIterator::kDontAdopt, status) 248{ 249 init(); 250} 251 252RuleBasedTokenizer::~RuleBasedTokenizer() { 253 delete [] fStateFlags; 254 delete [] fLatin1Cat; 255} 256 257U_NAMESPACE_END 258 259#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 260