1/*
2*******************************************************************************
3*   Copyright (C) 2010-2012,2014, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  stringtriebuilder.h
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010dec24
12*   created by: Markus W. Scherer
13*/
14
15#ifndef __STRINGTRIEBUILDER_H__
16#define __STRINGTRIEBUILDER_H__
17
18#include "unicode/utypes.h"
19#include "unicode/uobject.h"
20
21/**
22 * \file
23 * \brief C++ API: Builder API for trie builders
24 */
25
26// Forward declaration.
27struct UHashtable;
28typedef struct UHashtable UHashtable;
29
30/**
31 * Build options for BytesTrieBuilder and CharsTrieBuilder.
32 * @stable ICU 4.8
33 */
34enum UStringTrieBuildOption {
35    /**
36     * Builds a trie quickly.
37     * @stable ICU 4.8
38     */
39    USTRINGTRIE_BUILD_FAST,
40    /**
41     * Builds a trie more slowly, attempting to generate
42     * a shorter but equivalent serialization.
43     * This build option also uses more memory.
44     *
45     * This option can be effective when many integer values are the same
46     * and string/byte sequence suffixes can be shared.
47     * Runtime speed is not expected to improve.
48     * @stable ICU 4.8
49     */
50    USTRINGTRIE_BUILD_SMALL
51};
52
53U_NAMESPACE_BEGIN
54
55/**
56 * Base class for string trie builder classes.
57 *
58 * This class is not intended for public subclassing.
59 * @stable ICU 4.8
60 */
61class U_COMMON_API StringTrieBuilder : public UObject {
62public:
63#ifndef U_HIDE_INTERNAL_API
64    /** @internal */
65    static UBool hashNode(const void *node);
66    /** @internal */
67    static UBool equalNodes(const void *left, const void *right);
68#endif  /* U_HIDE_INTERNAL_API */
69
70protected:
71    // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72    // or else the compiler will create a public default constructor.
73    /** @internal */
74    StringTrieBuilder();
75    /** @internal */
76    virtual ~StringTrieBuilder();
77
78#ifndef U_HIDE_INTERNAL_API
79    /** @internal */
80    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
81    /** @internal */
82    void deleteCompactBuilder();
83
84    /** @internal */
85    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86
87    /** @internal */
88    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
89    /** @internal */
90    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91#endif  /* U_HIDE_INTERNAL_API */
92
93    class Node;
94
95#ifndef U_HIDE_INTERNAL_API
96    /** @internal */
97    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
98    /** @internal */
99    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100                            int32_t length, UErrorCode &errorCode);
101#endif  /* U_HIDE_INTERNAL_API */
102
103    /** @internal */
104    virtual int32_t getElementStringLength(int32_t i) const = 0;
105    /** @internal */
106    virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
107    /** @internal */
108    virtual int32_t getElementValue(int32_t i) const = 0;
109
110    // Finds the first unit index after this one where
111    // the first and last element have different units again.
112    /** @internal */
113    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114
115    // Number of different units at unitIndex.
116    /** @internal */
117    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
118    /** @internal */
119    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
120    /** @internal */
121    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122
123    /** @internal */
124    virtual UBool matchNodesCanHaveValues() const = 0;
125
126    /** @internal */
127    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
128    /** @internal */
129    virtual int32_t getMinLinearMatch() const = 0;
130    /** @internal */
131    virtual int32_t getMaxLinearMatchLength() const = 0;
132
133#ifndef U_HIDE_INTERNAL_API
134    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
135    /** @internal */
136    static const int32_t kMaxBranchLinearSubNodeLength=5;
137
138    // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
140    /** @internal */
141    static const int32_t kMaxSplitBranchLevels=14;
142
143    /**
144     * Makes sure that there is only one unique node registered that is
145     * equivalent to newNode.
146     * @param newNode Input node. The builder takes ownership.
147     * @param errorCode ICU in/out UErrorCode.
148                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149     * @return newNode if it is the first of its kind, or
150     *         an equivalent node if newNode is a duplicate.
151     * @internal
152     */
153    Node *registerNode(Node *newNode, UErrorCode &errorCode);
154    /**
155     * Makes sure that there is only one unique FinalValueNode registered
156     * with this value.
157     * Avoids creating a node if the value is a duplicate.
158     * @param value A final value.
159     * @param errorCode ICU in/out UErrorCode.
160                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161     * @return A FinalValueNode with the given value.
162     * @internal
163     */
164    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165#endif  /* U_HIDE_INTERNAL_API */
166
167    /*
168     * C++ note:
169     * registerNode() and registerFinalValue() take ownership of their input nodes,
170     * and only return owned nodes.
171     * If they see a failure UErrorCode, they will delete the input node.
172     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
173     * If there is a failure, they return NULL.
174     *
175     * NULL Node pointers can be safely passed into other Nodes because
176     * they call the static Node::hashCode() which checks for a NULL pointer first.
177     *
178     * Therefore, as long as builder functions register a new node,
179     * they need to check for failures only before explicitly dereferencing
180     * a Node pointer, or before setting a new UErrorCode.
181     */
182
183    // Hash set of nodes, maps from nodes to integer 1.
184    /** @internal */
185    UHashtable *nodes;
186
187#ifndef U_HIDE_INTERNAL_API
188    /** @internal */
189    class Node : public UObject {
190    public:
191        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
192        inline int32_t hashCode() const { return hash; }
193        // Handles node==NULL.
194        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
195        // Base class operator==() compares the actual class types.
196        virtual UBool operator==(const Node &other) const;
197        inline UBool operator!=(const Node &other) const { return !operator==(other); }
198        /**
199         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
200         * This is to avoid writing a duplicate node twice.
201         *
202         * Branch nodes in this trie data structure are not symmetric.
203         * Most branch edges "jump" to other nodes but the rightmost branch edges
204         * just continue without a jump.
205         * Therefore, write() must write the rightmost branch edge last
206         * (trie units are written backwards), and must write it at that point even if
207         * it is a duplicate of a node previously written elsewhere.
208         *
209         * This function visits and marks right branch edges first.
210         * Edges are numbered with increasingly negative values because we share the
211         * offset field which gets positive values when nodes are written.
212         * A branch edge also remembers the first number for any of its edges.
213         *
214         * When a further-left branch edge has a number in the range of the rightmost
215         * edge's numbers, then it will be written as part of the required right edge
216         * and we can avoid writing it first.
217         *
218         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
219         * edge numbers.
220         *
221         * @param edgeNumber The first edge number for this node and its sub-nodes.
222         * @return An edge number that is at least the maximum-negative
223         *         of the input edge number and the numbers of this node and all of its sub-nodes.
224         */
225        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
226        // write() must set the offset to a positive value.
227        virtual void write(StringTrieBuilder &builder) = 0;
228        // See markRightEdgesFirst.
229        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
230                                               StringTrieBuilder &builder) {
231            // Note: Edge numbers are negative, lastRight<=firstRight.
232            // If offset>0 then this node and its sub-nodes have been written already
233            // and we need not write them again.
234            // If this node is part of the unwritten right branch edge,
235            // then we wait until that is written.
236            if(offset<0 && (offset<lastRight || firstRight<offset)) {
237                write(builder);
238            }
239        }
240        inline int32_t getOffset() const { return offset; }
241    protected:
242        int32_t hash;
243        int32_t offset;
244    };
245
246    // This class should not be overridden because
247    // registerFinalValue() compares a stack-allocated FinalValueNode
248    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
249    // with the input node, and the
250    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
251    // will be false if the typeid's are different.
252    /** @internal */
253    class FinalValueNode : public Node {
254    public:
255        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
256        virtual UBool operator==(const Node &other) const;
257        virtual void write(StringTrieBuilder &builder);
258    protected:
259        int32_t value;
260    };
261
262    /**
263     * @internal
264     */
265    class ValueNode : public Node {
266    public:
267        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
268        virtual UBool operator==(const Node &other) const;
269        void setValue(int32_t v) {
270            hasValue=TRUE;
271            value=v;
272            hash=hash*37+v;
273        }
274    protected:
275        UBool hasValue;
276        int32_t value;
277    };
278
279    /**
280     * @internal
281     */
282    class IntermediateValueNode : public ValueNode {
283    public:
284        IntermediateValueNode(int32_t v, Node *nextNode)
285                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
286        virtual UBool operator==(const Node &other) const;
287        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
288        virtual void write(StringTrieBuilder &builder);
289    protected:
290        Node *next;
291    };
292
293    /**
294     * @internal
295     */
296    class LinearMatchNode : public ValueNode {
297    public:
298        LinearMatchNode(int32_t len, Node *nextNode)
299                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
300                  length(len), next(nextNode) {}
301        virtual UBool operator==(const Node &other) const;
302        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
303    protected:
304        int32_t length;
305        Node *next;
306    };
307
308    /**
309     * @internal
310     */
311    class BranchNode : public Node {
312    public:
313        BranchNode(int32_t initialHash) : Node(initialHash) {}
314    protected:
315        int32_t firstEdgeNumber;
316    };
317
318    /**
319     * @internal
320     */
321    class ListBranchNode : public BranchNode {
322    public:
323        ListBranchNode() : BranchNode(0x444444), length(0) {}
324        virtual UBool operator==(const Node &other) const;
325        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
326        virtual void write(StringTrieBuilder &builder);
327        // Adds a unit with a final value.
328        void add(int32_t c, int32_t value) {
329            units[length]=(UChar)c;
330            equal[length]=NULL;
331            values[length]=value;
332            ++length;
333            hash=(hash*37+c)*37+value;
334        }
335        // Adds a unit which leads to another match node.
336        void add(int32_t c, Node *node) {
337            units[length]=(UChar)c;
338            equal[length]=node;
339            values[length]=0;
340            ++length;
341            hash=(hash*37+c)*37+hashCode(node);
342        }
343    protected:
344        Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
345        int32_t length;
346        int32_t values[kMaxBranchLinearSubNodeLength];
347        UChar units[kMaxBranchLinearSubNodeLength];
348    };
349
350    /**
351     * @internal
352     */
353    class SplitBranchNode : public BranchNode {
354    public:
355        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
356                : BranchNode(((0x555555*37+middleUnit)*37+
357                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
358                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
359        virtual UBool operator==(const Node &other) const;
360        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
361        virtual void write(StringTrieBuilder &builder);
362    protected:
363        UChar unit;
364        Node *lessThan;
365        Node *greaterOrEqual;
366    };
367
368    // Branch head node, for writing the actual node lead unit.
369    /** @internal */
370    class BranchHeadNode : public ValueNode {
371    public:
372        BranchHeadNode(int32_t len, Node *subNode)
373                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
374                  length(len), next(subNode) {}
375        virtual UBool operator==(const Node &other) const;
376        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
377        virtual void write(StringTrieBuilder &builder);
378    protected:
379        int32_t length;
380        Node *next;  // A branch sub-node.
381    };
382#endif  /* U_HIDE_INTERNAL_API */
383
384    /** @internal */
385    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
386                                        Node *nextNode) const = 0;
387
388    /** @internal */
389    virtual int32_t write(int32_t unit) = 0;
390    /** @internal */
391    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
392    /** @internal */
393    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
394    /** @internal */
395    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
396    /** @internal */
397    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
398};
399
400U_NAMESPACE_END
401
402#endif  // __STRINGTRIEBUILDER_H__
403