1/*
2**********************************************************************
3*   Copyright (c) 2001-2012, International Business Machines Corporation
4*   and others.  All Rights Reserved.
5**********************************************************************
6*   Date        Name        Description
7*   07/23/01    aliu        Creation.
8**********************************************************************
9*/
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_TRANSLITERATION
14
15#include "strmatch.h"
16#include "rbt_data.h"
17#include "util.h"
18#include "unicode/uniset.h"
19#include "unicode/utf16.h"
20
21U_NAMESPACE_BEGIN
22
23UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
24
25StringMatcher::StringMatcher(const UnicodeString& theString,
26                             int32_t start,
27                             int32_t limit,
28                             int32_t segmentNum,
29                             const TransliterationRuleData& theData) :
30    data(&theData),
31    segmentNumber(segmentNum),
32    matchStart(-1),
33    matchLimit(-1)
34{
35    theString.extractBetween(start, limit, pattern);
36}
37
38StringMatcher::StringMatcher(const StringMatcher& o) :
39    UnicodeFunctor(o),
40    UnicodeMatcher(o),
41    UnicodeReplacer(o),
42    pattern(o.pattern),
43    data(o.data),
44    segmentNumber(o.segmentNumber),
45    matchStart(o.matchStart),
46    matchLimit(o.matchLimit)
47{
48}
49
50/**
51 * Destructor
52 */
53StringMatcher::~StringMatcher() {
54}
55
56/**
57 * Implement UnicodeFunctor
58 */
59UnicodeFunctor* StringMatcher::clone() const {
60    return new StringMatcher(*this);
61}
62
63/**
64 * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
65 * and return the pointer.
66 */
67UnicodeMatcher* StringMatcher::toMatcher() const {
68  StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
69  UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
70
71  return nonconst_base;
72}
73
74/**
75 * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
76 * and return the pointer.
77 */
78UnicodeReplacer* StringMatcher::toReplacer() const {
79  StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
80  UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
81
82  return nonconst_base;
83}
84
85/**
86 * Implement UnicodeMatcher
87 */
88UMatchDegree StringMatcher::matches(const Replaceable& text,
89                                    int32_t& offset,
90                                    int32_t limit,
91                                    UBool incremental) {
92    int32_t i;
93    int32_t cursor = offset;
94    if (limit < cursor) {
95        // Match in the reverse direction
96        for (i=pattern.length()-1; i>=0; --i) {
97            UChar keyChar = pattern.charAt(i);
98            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
99            if (subm == 0) {
100                if (cursor > limit &&
101                    keyChar == text.charAt(cursor)) {
102                    --cursor;
103                } else {
104                    return U_MISMATCH;
105                }
106            } else {
107                UMatchDegree m =
108                    subm->matches(text, cursor, limit, incremental);
109                if (m != U_MATCH) {
110                    return m;
111                }
112            }
113        }
114        // Record the match position, but adjust for a normal
115        // forward start, limit, and only if a prior match does not
116        // exist -- we want the rightmost match.
117        if (matchStart < 0) {
118            matchStart = cursor+1;
119            matchLimit = offset+1;
120        }
121    } else {
122        for (i=0; i<pattern.length(); ++i) {
123            if (incremental && cursor == limit) {
124                // We've reached the context limit without a mismatch and
125                // without completing our match.
126                return U_PARTIAL_MATCH;
127            }
128            UChar keyChar = pattern.charAt(i);
129            UnicodeMatcher* subm = data->lookupMatcher(keyChar);
130            if (subm == 0) {
131                // Don't need the cursor < limit check if
132                // incremental is TRUE (because it's done above); do need
133                // it otherwise.
134                if (cursor < limit &&
135                    keyChar == text.charAt(cursor)) {
136                    ++cursor;
137                } else {
138                    return U_MISMATCH;
139                }
140            } else {
141                UMatchDegree m =
142                    subm->matches(text, cursor, limit, incremental);
143                if (m != U_MATCH) {
144                    return m;
145                }
146            }
147        }
148        // Record the match position
149        matchStart = offset;
150        matchLimit = cursor;
151    }
152
153    offset = cursor;
154    return U_MATCH;
155}
156
157/**
158 * Implement UnicodeMatcher
159 */
160UnicodeString& StringMatcher::toPattern(UnicodeString& result,
161                                        UBool escapeUnprintable) const
162{
163    result.truncate(0);
164    UnicodeString str, quoteBuf;
165    if (segmentNumber > 0) {
166        result.append((UChar)40); /*(*/
167    }
168    for (int32_t i=0; i<pattern.length(); ++i) {
169        UChar keyChar = pattern.charAt(i);
170        const UnicodeMatcher* m = data->lookupMatcher(keyChar);
171        if (m == 0) {
172            ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
173        } else {
174            ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
175                         TRUE, escapeUnprintable, quoteBuf);
176        }
177    }
178    if (segmentNumber > 0) {
179        result.append((UChar)41); /*)*/
180    }
181    // Flush quoteBuf out to result
182    ICU_Utility::appendToRule(result, -1,
183                              TRUE, escapeUnprintable, quoteBuf);
184    return result;
185}
186
187/**
188 * Implement UnicodeMatcher
189 */
190UBool StringMatcher::matchesIndexValue(uint8_t v) const {
191    if (pattern.length() == 0) {
192        return TRUE;
193    }
194    UChar32 c = pattern.char32At(0);
195    const UnicodeMatcher *m = data->lookupMatcher(c);
196    return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
197}
198
199/**
200 * Implement UnicodeMatcher
201 */
202void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
203    UChar32 ch;
204    for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
205        ch = pattern.char32At(i);
206        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
207        if (matcher == NULL) {
208            toUnionTo.add(ch);
209        } else {
210            matcher->addMatchSetTo(toUnionTo);
211        }
212    }
213}
214
215/**
216 * UnicodeReplacer API
217 */
218int32_t StringMatcher::replace(Replaceable& text,
219                               int32_t start,
220                               int32_t limit,
221                               int32_t& /*cursor*/) {
222
223    int32_t outLen = 0;
224
225    // Copy segment with out-of-band data
226    int32_t dest = limit;
227    // If there was no match, that means that a quantifier
228    // matched zero-length.  E.g., x (a)* y matched "xy".
229    if (matchStart >= 0) {
230        if (matchStart != matchLimit) {
231            text.copy(matchStart, matchLimit, dest);
232            outLen = matchLimit - matchStart;
233        }
234    }
235
236    text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
237
238    return outLen;
239}
240
241/**
242 * UnicodeReplacer API
243 */
244UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
245                                                UBool /*escapeUnprintable*/) const {
246    // assert(segmentNumber > 0);
247    rule.truncate(0);
248    rule.append((UChar)0x0024 /*$*/);
249    ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
250    return rule;
251}
252
253/**
254 * Remove any match info.  This must be called before performing a
255 * set of matches with this segment.
256 */
257 void StringMatcher::resetMatch() {
258    matchStart = matchLimit = -1;
259}
260
261/**
262 * Union the set of all characters that may output by this object
263 * into the given set.
264 * @param toUnionTo the set into which to union the output characters
265 */
266void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
267    // The output of this replacer varies; it is the source text between
268    // matchStart and matchLimit.  Since this varies depending on the
269    // input text, we can't compute it here.  We can either do nothing
270    // or we can add ALL characters to the set.  It's probably more useful
271    // to do nothing.
272}
273
274/**
275 * Implement UnicodeFunctor
276 */
277void StringMatcher::setData(const TransliterationRuleData* d) {
278    data = d;
279    int32_t i = 0;
280    while (i<pattern.length()) {
281        UChar32 c = pattern.char32At(i);
282        UnicodeFunctor* f = data->lookup(c);
283        if (f != NULL) {
284            f->setData(data);
285        }
286        i += U16_LENGTH(c);
287    }
288}
289
290U_NAMESPACE_END
291
292#endif /* #if !UCONFIG_NO_TRANSLITERATION */
293
294//eof
295