1/**
2 *******************************************************************************
3 * Copyright (C) 2006,2012, International Business Machines Corporation        *
4 * and others. All Rights Reserved.                                            *
5 *******************************************************************************
6 */
7
8#ifndef DICTBE_H
9#define DICTBE_H
10
11#include "unicode/utypes.h"
12#include "unicode/uniset.h"
13#include "unicode/utext.h"
14
15#include "brkeng.h"
16
17U_NAMESPACE_BEGIN
18
19class DictionaryMatcher;
20
21/*******************************************************************
22 * DictionaryBreakEngine
23 */
24
25/**
26 * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
27 * dictionary to determine language-specific breaks.</p>
28 *
29 * <p>After it is constructed a DictionaryBreakEngine may be shared between
30 * threads without synchronization.</p>
31 */
32class DictionaryBreakEngine : public LanguageBreakEngine {
33 private:
34    /**
35     * The set of characters handled by this engine
36     * @internal
37     */
38
39  UnicodeSet    fSet;
40
41    /**
42     * The set of break types handled by this engine
43     * @internal
44     */
45
46  uint32_t      fTypes;
47
48  /**
49   * <p>Default constructor.</p>
50   *
51   */
52  DictionaryBreakEngine();
53
54 public:
55
56  /**
57   * <p>Constructor setting the break types handled.</p>
58   *
59   * @param breakTypes A bitmap of types handled by the engine.
60   */
61  DictionaryBreakEngine( uint32_t breakTypes );
62
63  /**
64   * <p>Virtual destructor.</p>
65   */
66  virtual ~DictionaryBreakEngine();
67
68  /**
69   * <p>Indicate whether this engine handles a particular character for
70   * a particular kind of break.</p>
71   *
72   * @param c A character which begins a run that the engine might handle
73   * @param breakType The type of text break which the caller wants to determine
74   * @return TRUE if this engine handles the particular character and break
75   * type.
76   */
77  virtual UBool handles( UChar32 c, int32_t breakType ) const;
78
79  /**
80   * <p>Find any breaks within a run in the supplied text.</p>
81   *
82   * @param text A UText representing the text. The iterator is left at
83   * the end of the run of characters which the engine is capable of handling
84   * that starts from the first (or last) character in the range.
85   * @param startPos The start of the run within the supplied text.
86   * @param endPos The end of the run within the supplied text.
87   * @param reverse Whether the caller is looking for breaks in a reverse
88   * direction.
89   * @param breakType The type of break desired, or -1.
90   * @param foundBreaks An allocated C array of the breaks found, if any
91   * @return The number of breaks found.
92   */
93  virtual int32_t findBreaks( UText *text,
94                              int32_t startPos,
95                              int32_t endPos,
96                              UBool reverse,
97                              int32_t breakType,
98                              UStack &foundBreaks ) const;
99
100 protected:
101
102 /**
103  * <p>Set the character set handled by this engine.</p>
104  *
105  * @param set A UnicodeSet of the set of characters handled by the engine
106  */
107  virtual void setCharacters( const UnicodeSet &set );
108
109 /**
110  * <p>Set the break types handled by this engine.</p>
111  *
112  * @param breakTypes A bitmap of types handled by the engine.
113  */
114//  virtual void setBreakTypes( uint32_t breakTypes );
115
116 /**
117  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
118  *
119  * @param text A UText representing the text
120  * @param rangeStart The start of the range of dictionary characters
121  * @param rangeEnd The end of the range of dictionary characters
122  * @param foundBreaks Output of C array of int32_t break positions, or 0
123  * @return The number of breaks found
124  */
125  virtual int32_t divideUpDictionaryRange( UText *text,
126                                           int32_t rangeStart,
127                                           int32_t rangeEnd,
128                                           UStack &foundBreaks ) const = 0;
129
130};
131
132/*******************************************************************
133 * ThaiBreakEngine
134 */
135
136/**
137 * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
138 * dictionary and heuristics to determine Thai-specific breaks.</p>
139 *
140 * <p>After it is constructed a ThaiBreakEngine may be shared between
141 * threads without synchronization.</p>
142 */
143class ThaiBreakEngine : public DictionaryBreakEngine {
144 private:
145    /**
146     * The set of characters handled by this engine
147     * @internal
148     */
149
150  UnicodeSet                fThaiWordSet;
151  UnicodeSet                fEndWordSet;
152  UnicodeSet                fBeginWordSet;
153  UnicodeSet                fSuffixSet;
154  UnicodeSet                fMarkSet;
155  DictionaryMatcher  *fDictionary;
156
157 public:
158
159  /**
160   * <p>Default constructor.</p>
161   *
162   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
163   * engine is deleted.
164   */
165  ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
166
167  /**
168   * <p>Virtual destructor.</p>
169   */
170  virtual ~ThaiBreakEngine();
171
172 protected:
173 /**
174  * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
175  *
176  * @param text A UText representing the text
177  * @param rangeStart The start of the range of dictionary characters
178  * @param rangeEnd The end of the range of dictionary characters
179  * @param foundBreaks Output of C array of int32_t break positions, or 0
180  * @return The number of breaks found
181  */
182  virtual int32_t divideUpDictionaryRange( UText *text,
183                                           int32_t rangeStart,
184                                           int32_t rangeEnd,
185                                           UStack &foundBreaks ) const;
186
187};
188
189#if !UCONFIG_NO_NORMALIZATION
190
191/*******************************************************************
192 * CjkBreakEngine
193 */
194
195//indicates language/script that the CjkBreakEngine will handle
enum LanguageType
{
    kKorean,            /**< The engine handles Korean. */
    kChineseJapanese    /**< The engine handles Chinese and Japanese. */
};
200
201/**
202 * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
203 * dictionary with costs associated with each word and
204 * Viterbi decoding to determine CJK-specific breaks.</p>
205 */
206class CjkBreakEngine : public DictionaryBreakEngine {
207 protected:
208    /**
209     * The set of characters handled by this engine
210     * @internal
211     */
212  UnicodeSet                fHangulWordSet;
213  UnicodeSet                fHanWordSet;
214  UnicodeSet                fKatakanaWordSet;
215  UnicodeSet                fHiraganaWordSet;
216
217  DictionaryMatcher  *fDictionary;
218
219 public:
220
221    /**
222     * <p>Default constructor.</p>
223     *
224     * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
225     * engine is deleted. The DictionaryMatcher must contain costs for each word
226     * in order for the dictionary to work properly.
227     */
228  CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
229
230    /**
231     * <p>Virtual destructor.</p>
232     */
233  virtual ~CjkBreakEngine();
234
235 protected:
236    /**
237     * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
238     *
239     * @param text A UText representing the text
240     * @param rangeStart The start of the range of dictionary characters
241     * @param rangeEnd The end of the range of dictionary characters
242     * @param foundBreaks Output of C array of int32_t break positions, or 0
243     * @return The number of breaks found
244     */
245  virtual int32_t divideUpDictionaryRange( UText *text,
246          int32_t rangeStart,
247          int32_t rangeEnd,
248          UStack &foundBreaks ) const;
249
250};
251
252#endif
253
254/*******************************************************************
255 * KhmerBreakEngine
256 */
257
258/**
259 * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
260 * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
261 *
262 * <p>After it is constructed a KhmerBreakEngine may be shared between
263 * threads without synchronization.</p>
264 */
265class KhmerBreakEngine : public DictionaryBreakEngine {
266 private:
267    /**
268     * The set of characters handled by this engine
269     * @internal
270     */
271
272  UnicodeSet                fKhmerWordSet;
273  UnicodeSet                fEndWordSet;
274  UnicodeSet                fBeginWordSet;
275  UnicodeSet                fMarkSet;
276  DictionaryMatcher  *fDictionary;
277
278 public:
279
280  /**
281   * <p>Default constructor.</p>
282   *
283   * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
284   * engine is deleted.
285   */
286  KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
287
288  /**
289   * <p>Virtual destructor.</p>
290   */
291  virtual ~KhmerBreakEngine();
292
293 protected:
294 /**
295  * <p>Divide up a range of known dictionary characters.</p>
296  *
297  * @param text A UText representing the text
298  * @param rangeStart The start of the range of dictionary characters
299  * @param rangeEnd The end of the range of dictionary characters
300  * @param foundBreaks Output of C array of int32_t break positions, or 0
301  * @return The number of breaks found
302  */
303  virtual int32_t divideUpDictionaryRange( UText *text,
304                                           int32_t rangeStart,
305                                           int32_t rangeEnd,
306                                           UStack &foundBreaks ) const;
307
308};
309
310
311U_NAMESPACE_END
312
313    /* DICTBE_H */
314#endif
315