1/*
2*******************************************************************************
3* Copyright (C) 1996-2014, International Business Machines
4* Corporation and others.  All Rights Reserved.
5*******************************************************************************
6* collationcompare.cpp
7*
8* created on: 2012feb14 with new and old collation code
9* created by: Markus W. Scherer
10*/
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_COLLATION
15
16#include "unicode/ucol.h"
17#include "cmemory.h"
18#include "collation.h"
19#include "collationcompare.h"
20#include "collationiterator.h"
21#include "collationsettings.h"
22#include "uassert.h"
23
24U_NAMESPACE_BEGIN
25
26UCollationResult
27CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28                                        const CollationSettings &settings,
29                                        UErrorCode &errorCode) {
30    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31
32    int32_t options = settings.options;
33    uint32_t variableTop;
34    if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35        variableTop = 0;
36    } else {
37        // +1 so that we can use "<" and primary ignorables test out early.
38        variableTop = settings.variableTop + 1;
39    }
40    UBool anyVariable = FALSE;
41
42    // Fetch CEs, compare primaries, store secondary & tertiary weights.
43    U_ALIGN_CODE(16);
44    for(;;) {
45        // We fetch CEs until we get a non-ignorable primary or reach the end.
46        uint32_t leftPrimary;
47        do {
48            int64_t ce = left.nextCE(errorCode);
49            leftPrimary = (uint32_t)(ce >> 32);
50            if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
51                // Variable CE, shift it to quaternary level.
52                // Ignore all following primary ignorables, and shift further variable CEs.
53                anyVariable = TRUE;
54                do {
55                    // Store only the primary of the variable CE.
56                    left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
57                    for(;;) {
58                        ce = left.nextCE(errorCode);
59                        leftPrimary = (uint32_t)(ce >> 32);
60                        if(leftPrimary == 0) {
61                            left.setCurrentCE(0);
62                        } else {
63                            break;
64                        }
65                    }
66                } while(leftPrimary < variableTop &&
67                        leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
68            }
69        } while(leftPrimary == 0);
70
71        uint32_t rightPrimary;
72        do {
73            int64_t ce = right.nextCE(errorCode);
74            rightPrimary = (uint32_t)(ce >> 32);
75            if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
76                // Variable CE, shift it to quaternary level.
77                // Ignore all following primary ignorables, and shift further variable CEs.
78                anyVariable = TRUE;
79                do {
80                    // Store only the primary of the variable CE.
81                    right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
82                    for(;;) {
83                        ce = right.nextCE(errorCode);
84                        rightPrimary = (uint32_t)(ce >> 32);
85                        if(rightPrimary == 0) {
86                            right.setCurrentCE(0);
87                        } else {
88                            break;
89                        }
90                    }
91                } while(rightPrimary < variableTop &&
92                        rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
93            }
94        } while(rightPrimary == 0);
95
96        if(leftPrimary != rightPrimary) {
97            // Return the primary difference, with script reordering.
98            const uint8_t *reorderTable = settings.reorderTable;
99            if (reorderTable != NULL) {
100                leftPrimary = Collation::reorder(reorderTable, leftPrimary);
101                rightPrimary = Collation::reorder(reorderTable, rightPrimary);
102            }
103            return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
104        }
105        if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
106    }
107    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
108
109    // Compare the buffered secondary & tertiary weights.
110    // We might skip the secondary level but continue with the case level
111    // which is turned on separately.
112    if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
113        if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
114            int32_t leftIndex = 0;
115            int32_t rightIndex = 0;
116            for(;;) {
117                uint32_t leftSecondary;
118                do {
119                    leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
120                } while(leftSecondary == 0);
121
122                uint32_t rightSecondary;
123                do {
124                    rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
125                } while(rightSecondary == 0);
126
127                if(leftSecondary != rightSecondary) {
128                    return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
129                }
130                if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
131            }
132        } else {
133            // The backwards secondary level compares secondary weights backwards
134            // within segments separated by the merge separator (U+FFFE, weight 02).
135            int32_t leftStart = 0;
136            int32_t rightStart = 0;
137            for(;;) {
138                // Find the merge separator or the NO_CE terminator.
139                int32_t leftLimit = leftStart;
140                uint32_t leftLower32;
141                while((leftLower32 = (uint32_t)left.getCE(leftLimit)) >
142                            Collation::MERGE_SEPARATOR_LOWER32 ||
143                        leftLower32 == 0) {
144                    ++leftLimit;
145                }
146                int32_t rightLimit = rightStart;
147                uint32_t rightLower32;
148                while((rightLower32 = (uint32_t)right.getCE(rightLimit)) >
149                            Collation::MERGE_SEPARATOR_LOWER32 ||
150                        rightLower32 == 0) {
151                    ++rightLimit;
152                }
153
154                // Compare the segments.
155                int32_t leftIndex = leftLimit;
156                int32_t rightIndex = rightLimit;
157                for(;;) {
158                    int32_t leftSecondary = 0;
159                    while(leftSecondary == 0 && leftIndex > leftStart) {
160                        leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
161                    }
162
163                    int32_t rightSecondary = 0;
164                    while(rightSecondary == 0 && rightIndex > rightStart) {
165                        rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
166                    }
167
168                    if(leftSecondary != rightSecondary) {
169                        return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
170                    }
171                    if(leftSecondary == 0) { break; }
172                }
173
174                // Did we reach the end of either string?
175                // Both strings have the same number of merge separators,
176                // or else there would have been a primary-level difference.
177                U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
178                if(left.getCE(leftLimit) == Collation::NO_CE) { break; }
179                // Skip both merge separators and continue.
180                leftStart = leftLimit + 1;
181                rightStart = rightLimit + 1;
182            }
183        }
184    }
185
186    if((options & CollationSettings::CASE_LEVEL) != 0) {
187        int32_t strength = CollationSettings::getStrength(options);
188        int32_t leftIndex = 0;
189        int32_t rightIndex = 0;
190        for(;;) {
191            uint32_t leftCase, leftLower32, rightCase;
192            if(strength == UCOL_PRIMARY) {
193                // Primary+caseLevel: Ignore case level weights of primary ignorables.
194                // Otherwise we would get a-umlaut > a
195                // which is not desirable for accent-insensitive sorting.
196                // Check for (lower 32 bits) == 0 as well because variable CEs are stored
197                // with only primary weights.
198                int64_t ce;
199                do {
200                    ce = left.getCE(leftIndex++);
201                    leftCase = (uint32_t)ce;
202                } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
203                leftLower32 = leftCase;
204                leftCase &= 0xc000;
205
206                do {
207                    ce = right.getCE(rightIndex++);
208                    rightCase = (uint32_t)ce;
209                } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
210                rightCase &= 0xc000;
211            } else {
212                // Secondary+caseLevel: By analogy with the above,
213                // ignore case level weights of secondary ignorables.
214                //
215                // Note: A tertiary CE has uppercase case bits (0.0.ut)
216                // to keep tertiary+caseFirst well-formed.
217                //
218                // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
219                // Otherwise a tertiary CE's uppercase would be no greater than
220                // a primary/secondary CE's uppercase.
221                // (See UCA well-formedness condition 2.)
222                // We could construct a special case weight higher than uppercase,
223                // but it's simpler to always ignore case weights of secondary ignorables,
224                // turning 0.0.ut into 0.0.0.t.
225                // (See LDML Collation, Case Parameters.)
226                do {
227                    leftCase = (uint32_t)left.getCE(leftIndex++);
228                } while(leftCase <= 0xffff);
229                leftLower32 = leftCase;
230                leftCase &= 0xc000;
231
232                do {
233                    rightCase = (uint32_t)right.getCE(rightIndex++);
234                } while(rightCase <= 0xffff);
235                rightCase &= 0xc000;
236            }
237
238            // No need to handle NO_CE and MERGE_SEPARATOR specially:
239            // There is one case weight for each previous-level weight,
240            // so level length differences were handled there.
241            if(leftCase != rightCase) {
242                if((options & CollationSettings::UPPER_FIRST) == 0) {
243                    return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
244                } else {
245                    return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
246                }
247            }
248            if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
249        }
250    }
251    if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
252
253    uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
254
255    int32_t leftIndex = 0;
256    int32_t rightIndex = 0;
257    uint32_t anyQuaternaries = 0;
258    for(;;) {
259        uint32_t leftLower32, leftTertiary;
260        do {
261            leftLower32 = (uint32_t)left.getCE(leftIndex++);
262            anyQuaternaries |= leftLower32;
263            U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
264                     (leftLower32 & 0xc0c0) == 0);
265            leftTertiary = leftLower32 & tertiaryMask;
266        } while(leftTertiary == 0);
267
268        uint32_t rightLower32, rightTertiary;
269        do {
270            rightLower32 = (uint32_t)right.getCE(rightIndex++);
271            anyQuaternaries |= rightLower32;
272            U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
273                     (rightLower32 & 0xc0c0) == 0);
274            rightTertiary = rightLower32 & tertiaryMask;
275        } while(rightTertiary == 0);
276
277        if(leftTertiary != rightTertiary) {
278            if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
279                // Pass through NO_CE and MERGE_SEPARATOR
280                // and keep real tertiary weights larger than the MERGE_SEPARATOR.
281                // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
282                // to keep tertiary CEs well-formed.
283                // Their case+tertiary weights must be greater than those of
284                // primary and secondary CEs.
285                if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
286                    if(leftLower32 > 0xffff) {
287                        leftTertiary ^= 0xc000;
288                    } else {
289                        leftTertiary += 0x4000;
290                    }
291                }
292                if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) {
293                    if(rightLower32 > 0xffff) {
294                        rightTertiary ^= 0xc000;
295                    } else {
296                        rightTertiary += 0x4000;
297                    }
298                }
299            }
300            return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
301        }
302        if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
303    }
304    if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
305
306    if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
307        // If there are no "variable" CEs and no non-zero quaternary weights,
308        // then there are no quaternary differences.
309        return UCOL_EQUAL;
310    }
311
312    leftIndex = 0;
313    rightIndex = 0;
314    for(;;) {
315        uint32_t leftQuaternary;
316        do {
317            int64_t ce = left.getCE(leftIndex++);
318            leftQuaternary = (uint32_t)ce & 0xffff;
319            if(leftQuaternary == 0) {
320                // Variable primary or completely ignorable.
321                leftQuaternary = (uint32_t)(ce >> 32);
322            } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
323                // Leave NO_CE or MERGE_SEPARATOR as is.
324            } else {
325                // Regular CE, not tertiary ignorable.
326                // Preserve the quaternary weight in bits 7..6.
327                leftQuaternary |= 0xffffff3f;
328            }
329        } while(leftQuaternary == 0);
330
331        uint32_t rightQuaternary;
332        do {
333            int64_t ce = right.getCE(rightIndex++);
334            rightQuaternary = (uint32_t)ce & 0xffff;
335            if(rightQuaternary == 0) {
336                // Variable primary or completely ignorable.
337                rightQuaternary = (uint32_t)(ce >> 32);
338            } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) {
339                // Leave NO_CE or MERGE_SEPARATOR as is.
340            } else {
341                // Regular CE, not tertiary ignorable.
342                // Preserve the quaternary weight in bits 7..6.
343                rightQuaternary |= 0xffffff3f;
344            }
345        } while(rightQuaternary == 0);
346
347        if(leftQuaternary != rightQuaternary) {
348            // Return the difference, with script reordering.
349            const uint8_t *reorderTable = settings.reorderTable;
350            if (reorderTable != NULL) {
351                leftQuaternary = Collation::reorder(reorderTable, leftQuaternary);
352                rightQuaternary = Collation::reorder(reorderTable, rightQuaternary);
353            }
354            return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
355        }
356        if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; }
357    }
358    return UCOL_EQUAL;
359}
360
361U_NAMESPACE_END
362
363#endif  // !UCONFIG_NO_COLLATION
364