/*
***************************************************************************
* Copyright (C) 2006-2008 Apple Inc. All Rights Reserved.                 *
***************************************************************************
*/

7#ifndef RBTOK_H
8#define RBTOK_H
9
10#include "unicode/utypes.h"
11
12/**
13 * \file
14 * \brief C++ API: Rule Based Tokenizer
15 */
16
17#if !UCONFIG_NO_BREAK_ITERATION
18
19#include "unicode/urbtok.h"
20#include "unicode/rbbi.h"
21#include "unicode/parseerr.h"
22
23
24U_NAMESPACE_BEGIN
25
26/** @internal */
27struct RBBIDataHeader;
28struct RBBIStateTableRow;
29
30
31/**
32 *
33 * A subclass of RuleBasedBreakIterator that adds tokenization functionality.
34
35 * <p>This class is for internal use only by Apple Computer, Inc.</p>
36 *
37 */
38class U_COMMON_API RuleBasedTokenizer : public RuleBasedBreakIterator {
39
40private:
41    /**
42     * The row corresponding to the start state
43     * @internal
44     */
45    const RBBIStateTableRow *fStartRow;
46
47    /**
48     * The merged flag results for accepting states
49     * @internal
50     */
51    int32_t *fStateFlags;
52
53    /**
54     * Character categories for the Latin1 subset of Unicode
55     * @internal
56     */
57    int16_t *fLatin1Cat;
58
59public:
60    /**
61     * Construct a RuleBasedTokenizer from a set of rules supplied as a string.
62     * @param rules The break rules to be used.
63     * @param parseError  In the event of a syntax error in the rules, provides the location
64     *                    within the rules of the problem.
65     * @param status Information on any errors encountered.
66     * @internal
67     */
68    RuleBasedTokenizer(const UnicodeString &rules, UParseError &parseErr, UErrorCode &status);
69
70    /**
71     * Constructor from a flattened set of RBBI data in uprv_malloc'd memory.
72     *             RulesBasedBreakIterators built from a custom set of rules
73     *             are created via this constructor; the rules are compiled
74     *             into memory, then the break iterator is constructed here.
75     *
76     *             The break iterator adopts the memory, and will
77     *             free it when done.
78     * @internal
79     */
80    RuleBasedTokenizer(uint8_t *data, UErrorCode &status);
81
82    /**
83     * Constructor from a flattened set of RBBI data in umemory which need not
84     *             be malloced (e.g. it may be a memory-mapped file, etc.).
85       *
86     *             This version does not adopt the memory, and does not
87     *             free it when done.
88     * @internal
89     */
90    enum EDontAdopt {
91        kDontAdopt
92    };
93    RuleBasedTokenizer(const uint8_t *data, enum EDontAdopt dontAdopt, UErrorCode &status);
94
95    /**
96     * Destructor
97     *  @internal
98     */
99    virtual ~RuleBasedTokenizer();
100
101    /**
102     * Fetch the next set of tokens.
103     * @param maxTokens The maximum number of tokens to return.
104     * @param outTokenRanges Pointer to output array of token ranges.
105     * @param outTokenFlags (optional) pointer to output array of token flags.
106     * @internal
107     */
108    int32_t tokenize(int32_t maxTokens, RuleBasedTokenRange *outTokenRanges, unsigned long *outTokenFlags);
109
110private:
111    /**
112      * Common initialization function, used by constructors.
113      * @internal
114      */
115    void init();
116};
117
118U_NAMESPACE_END
119
120#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
121
122#endif
123