1/* 2********************************************************************** 3* Copyright (C) 2001-2014 IBM and others. All rights reserved. 4********************************************************************** 5* Date Name Description 6* 08/13/2001 synwee Creation. 7********************************************************************** 8*/ 9#ifndef USRCHIMP_H 10#define USRCHIMP_H 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_COLLATION 15 16#include "unicode/normalizer2.h" 17#include "unicode/ucol.h" 18#include "unicode/ucoleitr.h" 19#include "unicode/ubrk.h" 20 21/* mask off anything but primary order */ 22#define UCOL_PRIMARYORDERMASK 0xffff0000 23/* mask off anything but secondary order */ 24#define UCOL_SECONDARYORDERMASK 0x0000ff00 25/* mask off anything but tertiary order */ 26#define UCOL_TERTIARYORDERMASK 0x000000ff 27/* primary order shift */ 28#define UCOL_PRIMARYORDERSHIFT 16 29/* secondary order shift */ 30#define UCOL_SECONDARYORDERSHIFT 8 31 32#define UCOL_IGNORABLE 0 33 34/* get weights from a CE */ 35#define UCOL_PRIMARYORDER(order) (((order) >> 16) & 0xffff) 36#define UCOL_SECONDARYORDER(order) (((order) & UCOL_SECONDARYORDERMASK)>> UCOL_SECONDARYORDERSHIFT) 37#define UCOL_TERTIARYORDER(order) ((order) & UCOL_TERTIARYORDERMASK) 38 39#define UCOL_CONTINUATION_MARKER 0xC0 40 41#define isContinuation(CE) (((CE) & UCOL_CONTINUATION_MARKER) == UCOL_CONTINUATION_MARKER) 42 43/** 44 * This indicates an error has occured during processing or there are no more CEs 45 * to be returned. 46 */ 47#ifndef UCOL_PROCESSED_NULLORDER 48#define UCOL_PROCESSED_NULLORDER ((int64_t)U_INT64_MAX) 49#endif 50 51U_NAMESPACE_BEGIN 52 53class CollationElementIterator; 54class Collator; 55 56struct PCEI 57{ 58 uint64_t ce; 59 int32_t low; 60 int32_t high; 61}; 62 63struct PCEBuffer 64{ 65 PCEI defaultBuffer[16]; 66 PCEI *buffer; 67 int32_t bufferIndex; 68 int32_t bufferSize; 69 70 PCEBuffer(); 71 ~PCEBuffer(); 72 73 void reset(); 74 UBool empty() const; 75 void put(uint64_t ce, int32_t ixLow, int32_t ixHigh); 76 const PCEI *get(); 77}; 78 79class UCollationPCE : public UMemory { 80private: 81 PCEBuffer pceBuffer; 82 CollationElementIterator *cei; 83 UCollationStrength strength; 84 UBool toShift; 85 UBool isShifted; 86 uint32_t variableTop; 87 88public: 89 UCollationPCE(UCollationElements *elems); 90 UCollationPCE(CollationElementIterator *iter); 91 ~UCollationPCE(); 92 93 void init(UCollationElements *elems); 94 void init(CollationElementIterator *iter); 95 96 /** 97 * Get the processed ordering priority of the next collation element in the text. 98 * A single character may contain more than one collation element. 99 * 100 * @param ixLow a pointer to an int32_t to receive the iterator index before fetching the CE. 101 * @param ixHigh a pointer to an int32_t to receive the iterator index after fetching the CE. 102 * @param status A pointer to an UErrorCode to receive any errors. 103 * @return The next collation elements ordering, otherwise returns UCOL_PROCESSED_NULLORDER 104 * if an error has occured or if the end of string has been reached 105 */ 106 int64_t nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 107 /** 108 * Get the processed ordering priority of the previous collation element in the text. 109 * A single character may contain more than one collation element. 110 * 111 * @param ixLow A pointer to an int32_t to receive the iterator index after fetching the CE 112 * @param ixHigh A pointer to an int32_t to receiver the iterator index before fetching the CE 113 * @param status A pointer to an UErrorCode to receive any errors. Noteably 114 * a U_BUFFER_OVERFLOW_ERROR is returned if the internal stack 115 * buffer has been exhausted. 116 * @return The previous collation elements ordering, otherwise returns 117 * UCOL_PROCESSED_NULLORDER if an error has occured or if the start of 118 * string has been reached. 119 */ 120 int64_t previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 121 122private: 123 void init(const Collator &coll); 124 uint64_t processCE(uint32_t ce); 125}; 126 127U_NAMESPACE_END 128 129#define INITIAL_ARRAY_SIZE_ 256 130#define MAX_TABLE_SIZE_ 257 131 132struct USearch { 133 // required since collation element iterator does not have a getText API 134 const UChar *text; 135 int32_t textLength; // exact length 136 UBool isOverlap; 137 UBool isCanonicalMatch; 138 int16_t elementComparisonType; 139 UBreakIterator *internalBreakIter; //internal character breakiterator 140 UBreakIterator *breakIter; 141 // value USEARCH_DONE is the default value 142 // if we are not at the start of the text or the end of the text, 143 // depending on the iteration direction and matchedIndex is USEARCH_DONE 144 // it means that we can't find any more matches in that particular direction 145 int32_t matchedIndex; 146 int32_t matchedLength; 147 UBool isForwardSearching; 148 UBool reset; 149}; 150 151struct UPattern { 152 const UChar *text; 153 int32_t textLength; // exact length 154 // length required for backwards ce comparison 155 int32_t CELength; 156 int32_t *CE; 157 int32_t CEBuffer[INITIAL_ARRAY_SIZE_]; 158 int32_t PCELength; 159 int64_t *PCE; 160 int64_t PCEBuffer[INITIAL_ARRAY_SIZE_]; 161 UBool hasPrefixAccents; 162 UBool hasSuffixAccents; 163 int16_t defaultShiftSize; 164 int16_t shift[MAX_TABLE_SIZE_]; 165 int16_t backShift[MAX_TABLE_SIZE_]; 166}; 167 168struct UStringSearch { 169 struct USearch *search; 170 struct UPattern pattern; 171 const UCollator *collator; 172 const icu::Normalizer2 *nfd; 173 // positions within the collation element iterator is used to determine 174 // if we are at the start of the text. 175 UCollationElements *textIter; 176 icu::UCollationPCE *textProcessedIter; 177 // utility collation element, used throughout program for temporary 178 // iteration. 179 UCollationElements *utilIter; 180 UBool ownCollator; 181 UCollationStrength strength; 182 uint32_t ceMask; 183 uint32_t variableTop; 184 UBool toShift; 185 UChar canonicalPrefixAccents[INITIAL_ARRAY_SIZE_]; 186 UChar canonicalSuffixAccents[INITIAL_ARRAY_SIZE_]; 187}; 188 189/** 190* Exact matches without checking for the ends for extra accents. 191* The match after the position within the collation element iterator is to be 192* found. 193* After a match is found the offset in the collation element iterator will be 194* shifted to the start of the match. 195* Implementation note: 196* For tertiary we can't use the collator->tertiaryMask, that is a 197* preprocessed mask that takes into account case options. since we are only 198* concerned with exact matches, we don't need that. 199* Alternate handling - since only the 16 most significant digits is only used, 200* we can safely do a compare without masking if the ce is a variable, we mask 201* and get only the primary values no shifting to quartenary is required since 202* all primary values less than variabletop will need to be masked off anyway. 203* If the end character is composite and the pattern ce does not match the text 204* ce, we skip it until we find a match in the end composite character or when 205* it has passed the character. This is so that we can match pattern "a" with 206* the text "\u00e6" 207* @param strsrch string search data 208* @param status error status if any 209* @return TRUE if an exact match is found, FALSE otherwise 210*/ 211U_CFUNC 212UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status); 213 214/** 215* Canonical matches. 216* According to the definition, matches found here will include the whole span 217* of beginning and ending accents if it overlaps that region. 218* @param strsrch string search data 219* @param status error status if any 220* @return TRUE if a canonical match is found, FALSE otherwise 221*/ 222U_CFUNC 223UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status); 224 225/** 226* Gets the previous match. 227* Comments follows from handleNextExact 228* @param strsrch string search data 229* @param status error status if any 230* @return True if a exact math is found, FALSE otherwise. 231*/ 232U_CFUNC 233UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status); 234 235/** 236* Canonical matches. 237* According to the definition, matches found here will include the whole span 238* of beginning and ending accents if it overlaps that region. 239* @param strsrch string search data 240* @param status error status if any 241* @return TRUE if a canonical match is found, FALSE otherwise 242*/ 243U_CFUNC 244UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 245 UErrorCode *status); 246 247#endif /* #if !UCONFIG_NO_COLLATION */ 248 249#endif 250