/*
**********************************************************************
*   Copyright (C) 2001-2014 IBM and others. All rights reserved.
**********************************************************************
*   Date        Name        Description
*  08/13/2001   synwee      Creation.
**********************************************************************
*/
9#ifndef USRCHIMP_H
10#define USRCHIMP_H
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/normalizer2.h"
17#include "unicode/ucol.h"
18#include "unicode/ucoleitr.h"
19#include "unicode/ubrk.h"
20
/*
 * Field layout of a 32-bit collation element (CE), as implied by the masks
 * and shifts defined here:
 *   bits 31..16  primary weight
 *   bits 15..8   secondary weight
 *   bits  7..0   tertiary weight
 */

/* mask off anything but primary order */
#define UCOL_PRIMARYORDERMASK 0xffff0000
/* mask off anything but secondary order */
#define UCOL_SECONDARYORDERMASK 0x0000ff00
/* mask off anything but tertiary order */
#define UCOL_TERTIARYORDERMASK 0x000000ff
/* primary order shift */
#define UCOL_PRIMARYORDERSHIFT 16
/* secondary order shift */
#define UCOL_SECONDARYORDERSHIFT 8

/* value named for an ignorable CE (all weights zero) */
#define UCOL_IGNORABLE 0

/*
 * Get weights from a CE.
 * Each macro yields the selected weight right-aligned (shifted down to bit 0).
 * The primary extractor masks after shifting, so the result is confined to
 * 16 bits even when `order` is a negative signed value.
 */
#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff)
#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT)
#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK)

/* bit pattern tested by isContinuation() */
#define UCOL_CONTINUATION_MARKER 0xC0

/* non-zero when every bit of UCOL_CONTINUATION_MARKER is set in CE */
#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER)
42
/**
 * This indicates an error has occurred during processing or there are no more
 * CEs to be returned.
 *
 * Only defined here when no earlier header has already provided it.
 */
#ifndef UCOL_PROCESSED_NULLORDER
#define UCOL_PROCESSED_NULLORDER        ((int64_t)U_INT64_MAX)
#endif
50
51U_NAMESPACE_BEGIN
52
/* Forward declarations: only pointers/references to these are used below. */
class CollationElementIterator;
class Collator;
55
/**
 * One entry of a PCEBuffer: a 64-bit collation element value together with
 * two int32_t indices (the same three values that PCEBuffer::put() takes as
 * ce, ixLow and ixHigh).
 */
struct PCEI
{
    uint64_t ce;    /* collation element value */
    int32_t  low;   /* lower index recorded with the CE */
    int32_t  high;  /* upper index recorded with the CE */
};
62
63struct PCEBuffer
64{
65    PCEI    defaultBuffer[16];
66    PCEI   *buffer;
67    int32_t bufferIndex;
68    int32_t bufferSize;
69
70    PCEBuffer();
71    ~PCEBuffer();
72
73    void  reset();
74    UBool empty() const;
75    void  put(uint64_t ce, int32_t ixLow, int32_t ixHigh);
76    const PCEI *get();
77};
78
79class UCollationPCE : public UMemory {
80private:
81    PCEBuffer          pceBuffer;
82    CollationElementIterator *cei;
83    UCollationStrength strength;
84    UBool              toShift;
85    UBool              isShifted;
86    uint32_t           variableTop;
87
88public:
89    UCollationPCE(UCollationElements *elems);
90    UCollationPCE(CollationElementIterator *iter);
91    ~UCollationPCE();
92
93    void init(UCollationElements *elems);
94    void init(CollationElementIterator *iter);
95
96    /**
97     * Get the processed ordering priority of the next collation element in the text.
98     * A single character may contain more than one collation element.
99     *
100     * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE.
101     * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE.
102     * @param status A pointer to an UErrorCode to receive any errors.
103     * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER
104     *         if an error has occured or if the end of string has been reached
105     */
106    int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
107    /**
108     * Get the processed ordering priority of the previous collation element in the text.
109     * A single character may contain more than one collation element.
110     *
111     * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE
112     * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE
113     * @param status A pointer to an UErrorCode to receive any errors. Noteably
114     *               a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack
115     *               buffer has been exhausted.
116     * @return The previous collation elements ordering, otherwise returns
117     *         UCOL_PROCESSED_NULLORDER if an error has occured or if the start of
118     *         string has been reached.
119     */
120    int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
121
122private:
123    void init(const Collator &coll);
124    uint64_t processCE(uint32_t ce);
125};
126
127U_NAMESPACE_END
128
/* Element count of the fixed-size CE, PCE and accent arrays embedded in the search structs. */
#define INITIAL_ARRAY_SIZE_       256
/* Element count of the UPattern shift and backShift tables. */
#define MAX_TABLE_SIZE_           257
131
132struct USearch {
133    // required since collation element iterator does not have a getText API
134    const UChar              *text;
135          int32_t             textLength; // exact length
136          UBool               isOverlap;
137          UBool               isCanonicalMatch;
138          int16_t             elementComparisonType;
139          UBreakIterator     *internalBreakIter;  //internal character breakiterator
140          UBreakIterator     *breakIter;
141    // value USEARCH_DONE is the default value
142    // if we are not at the start of the text or the end of the text,
143    // depending on the iteration direction and matchedIndex is USEARCH_DONE
144    // it means that we can't find any more matches in that particular direction
145          int32_t             matchedIndex;
146          int32_t             matchedLength;
147          UBool               isForwardSearching;
148          UBool               reset;
149};
150
151struct UPattern {
152    const UChar              *text;
153          int32_t             textLength; // exact length
154          // length required for backwards ce comparison
155          int32_t             CELength;
156          int32_t            *CE;
157          int32_t             CEBuffer[INITIAL_ARRAY_SIZE_];
158          int32_t             PCELength;
159          int64_t            *PCE;
160          int64_t             PCEBuffer[INITIAL_ARRAY_SIZE_];
161          UBool               hasPrefixAccents;
162          UBool               hasSuffixAccents;
163          int16_t             defaultShiftSize;
164          int16_t             shift[MAX_TABLE_SIZE_];
165          int16_t             backShift[MAX_TABLE_SIZE_];
166};
167
168struct UStringSearch {
169    struct USearch            *search;
170    struct UPattern            pattern;
171    const  UCollator          *collator;
172    const  icu::Normalizer2   *nfd;
173    // positions within the collation element iterator is used to determine
174    // if we are at the start of the text.
175           UCollationElements *textIter;
176           icu::UCollationPCE *textProcessedIter;
177    // utility collation element, used throughout program for temporary
178    // iteration.
179           UCollationElements *utilIter;
180           UBool               ownCollator;
181           UCollationStrength  strength;
182           uint32_t            ceMask;
183           uint32_t            variableTop;
184           UBool               toShift;
185           UChar               canonicalPrefixAccents[INITIAL_ARRAY_SIZE_];
186           UChar               canonicalSuffixAccents[INITIAL_ARRAY_SIZE_];
187};
188
189/**
190* Exact matches without checking for the ends for extra accents.
191* The match after the position within the collation element iterator is to be
192* found.
193* After a match is found the offset in the collation element iterator will be
194* shifted to the start of the match.
195* Implementation note:
196* For tertiary we can't use the collator->tertiaryMask, that is a
197* preprocessed mask that takes into account case options. since we are only
198* concerned with exact matches, we don't need that.
199* Alternate handling - since only the 16 most significant digits is only used,
200* we can safely do a compare without masking if the ce is a variable, we mask
201* and get only the primary values no shifting to quartenary is required since
202* all primary values less than variabletop will need to be masked off anyway.
203* If the end character is composite and the pattern ce does not match the text
204* ce, we skip it until we find a match in the end composite character or when
205* it has passed the character. This is so that we can match pattern "a" with
206* the text "\u00e6"
207* @param strsrch string search data
208* @param status error status if any
209* @return TRUE if an exact match is found, FALSE otherwise
210*/
211U_CFUNC
212UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status);
213
214/**
215* Canonical matches.
216* According to the definition, matches found here will include the whole span
217* of beginning and ending accents if it overlaps that region.
218* @param strsrch string search data
219* @param status error status if any
220* @return TRUE if a canonical match is found, FALSE otherwise
221*/
222U_CFUNC
223UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status);
224
225/**
226* Gets the previous match.
227* Comments follows from handleNextExact
228* @param strsrch string search data
229* @param status error status if any
230* @return True if a exact math is found, FALSE otherwise.
231*/
232U_CFUNC
233UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status);
234
235/**
236* Canonical matches.
237* According to the definition, matches found here will include the whole span
238* of beginning and ending accents if it overlaps that region.
239* @param strsrch string search data
240* @param status error status if any
241* @return TRUE if a canonical match is found, FALSE otherwise
242*/
243U_CFUNC
244UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
245                                      UErrorCode    *status);
246
247#endif /* #if !UCONFIG_NO_COLLATION */
248
249#endif
250