1/* 2*************************************************************************** 3* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved. * 4*************************************************************************** 5*/ 6 7#ifndef RBTOK_H 8#define RBTOK_H 9 10#include "unicode/utypes.h" 11 12/** 13 * \file 14 * \brief C++ API: Rule Based Tokenizer 15 */ 16 17#if !UCONFIG_NO_BREAK_ITERATION 18 19#include "unicode/urbtok.h" 20#include "unicode/rbbi.h" 21#include "unicode/parseerr.h" 22 23 24U_NAMESPACE_BEGIN 25 26/** @internal */ 27struct RBBIDataHeader; 28struct RBBIStateTableRow; 29 30 31/** 32 * 33 * A subclass of RuleBasedBreakIterator that adds tokenization functionality. 34 35 * <p>This class is for internal use only by Apple Computer, Inc.</p> 36 * 37 */ 38class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator { 39 40private: 41 /** 42 * The row corresponding to the start state 43 * @internal 44 */ 45 const RBBIStateTableRow *fStartRow; 46 47 /** 48 * The merged flag results for accepting states 49 * @internal 50 */ 51 int32_t *fStateFlags; 52 53 /** 54 * Character categories for the Latin1 subset of Unicode 55 * @internal 56 */ 57 int16_t *fLatin1Cat; 58 59public: 60 /** 61 * Construct a RuleBasedTokenizer from a set of rules supplied as a string. 62 * @param rules The break rules to be used. 63 * @param parseError In the event of a syntax error in the rules, provides the location 64 * within the rules of the problem. 65 * @param status Information on any errors encountered. 66 * @internal 67 */ 68 RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status); 69 70 /** 71 * Constructor from a flattened set of RBBI data in uprv_malloc'd memory. 72 * RulesBasedBreakIterators built from a custom set of rules 73 * are created via this constructor; the rules are compiled 74 * into memory, then the break iterator is constructed here. 75 * 76 * The break iterator adopts the memory, and will 77 * free it when done. 78 * @internal 79 */ 80 RuleBasedTokenizer(uint8_t *data, UErrorCode &status); 81 82 /** 83 * Constructor from a flattened set of RBBI data in umemory which need not 84 * be malloced (e.g. it may be a memory-mapped file, etc.). 85 * 86 * This version does not adopt the memory, and does not 87 * free it when done. 88 * @internal 89 */ 90 enum EDontAdopt { 91 kDontAdopt 92 }; 93 RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status); 94 95 /** 96 * Destructor 97 * @internal 98 */ 99 virtual ~RuleBasedTokenizer(); 100 101 /** 102 * Fetch the next set of tokens. 103 * @param maxTokens The maximum number of tokens to return. 104 * @param outTokenRanges Pointer to output array of token ranges. 105 * @param outTokenFlags (optional) pointer to output array of token flags. 106 * @internal 107 */ 108 int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags); 109 110private: 111 /** 112 * Common initialization function, used by constructors. 113 * @internal 114 */ 115 void init(); 116}; 117 118U_NAMESPACE_END 119 120#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 121 122#endif 123