1/* 2******************************************************************************* 3* Copyright (C) 1996-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* collationcompare.cpp 7* 8* created on: 2012feb14 with new and old collation code 9* created by: Markus W. Scherer 10*/ 11 12#include "unicode/utypes.h" 13 14#if !UCONFIG_NO_COLLATION 15 16#include "unicode/ucol.h" 17#include "cmemory.h" 18#include "collation.h" 19#include "collationcompare.h" 20#include "collationiterator.h" 21#include "collationsettings.h" 22#include "uassert.h" 23 24U_NAMESPACE_BEGIN 25 26UCollationResult 27CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, 28 const CollationSettings &settings, 29 UErrorCode &errorCode) { 30 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 31 32 int32_t options = settings.options; 33 uint32_t variableTop; 34 if((options & CollationSettings::ALTERNATE_MASK) == 0) { 35 variableTop = 0; 36 } else { 37 // +1 so that we can use "<" and primary ignorables test out early. 38 variableTop = settings.variableTop + 1; 39 } 40 UBool anyVariable = FALSE; 41 42 // Fetch CEs, compare primaries, store secondary & tertiary weights. 43 U_ALIGN_CODE(16); 44 for(;;) { 45 // We fetch CEs until we get a non-ignorable primary or reach the end. 46 uint32_t leftPrimary; 47 do { 48 int64_t ce = left.nextCE(errorCode); 49 leftPrimary = (uint32_t)(ce >> 32); 50 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 51 // Variable CE, shift it to quaternary level. 52 // Ignore all following primary ignorables, and shift further variable CEs. 53 anyVariable = TRUE; 54 do { 55 // Store only the primary of the variable CE. 56 left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 57 for(;;) { 58 ce = left.nextCE(errorCode); 59 leftPrimary = (uint32_t)(ce >> 32); 60 if(leftPrimary == 0) { 61 left.setCurrentCE(0); 62 } else { 63 break; 64 } 65 } 66 } while(leftPrimary < variableTop && 67 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 68 } 69 } while(leftPrimary == 0); 70 71 uint32_t rightPrimary; 72 do { 73 int64_t ce = right.nextCE(errorCode); 74 rightPrimary = (uint32_t)(ce >> 32); 75 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 76 // Variable CE, shift it to quaternary level. 77 // Ignore all following primary ignorables, and shift further variable CEs. 78 anyVariable = TRUE; 79 do { 80 // Store only the primary of the variable CE. 81 right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 82 for(;;) { 83 ce = right.nextCE(errorCode); 84 rightPrimary = (uint32_t)(ce >> 32); 85 if(rightPrimary == 0) { 86 right.setCurrentCE(0); 87 } else { 88 break; 89 } 90 } 91 } while(rightPrimary < variableTop && 92 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 93 } 94 } while(rightPrimary == 0); 95 96 if(leftPrimary != rightPrimary) { 97 // Return the primary difference, with script reordering. 98 const uint8_t *reorderTable = settings.reorderTable; 99 if (reorderTable != NULL) { 100 leftPrimary = Collation::reorder(reorderTable, leftPrimary); 101 rightPrimary = Collation::reorder(reorderTable, rightPrimary); 102 } 103 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 104 } 105 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } 106 } 107 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 108 109 // Compare the buffered secondary & tertiary weights. 110 // We might skip the secondary level but continue with the case level 111 // which is turned on separately. 112 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 113 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { 114 int32_t leftIndex = 0; 115 int32_t rightIndex = 0; 116 for(;;) { 117 uint32_t leftSecondary; 118 do { 119 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; 120 } while(leftSecondary == 0); 121 122 uint32_t rightSecondary; 123 do { 124 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; 125 } while(rightSecondary == 0); 126 127 if(leftSecondary != rightSecondary) { 128 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 129 } 130 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } 131 } 132 } else { 133 // The backwards secondary level compares secondary weights backwards 134 // within segments separated by the merge separator (U+FFFE, weight 02). 135 int32_t leftStart = 0; 136 int32_t rightStart = 0; 137 for(;;) { 138 // Find the merge separator or the NO_CE terminator. 139 int32_t leftLimit = leftStart; 140 uint32_t leftLower32; 141 while((leftLower32 = (uint32_t)left.getCE(leftLimit)) > 142 Collation::MERGE_SEPARATOR_LOWER32 || 143 leftLower32 == 0) { 144 ++leftLimit; 145 } 146 int32_t rightLimit = rightStart; 147 uint32_t rightLower32; 148 while((rightLower32 = (uint32_t)right.getCE(rightLimit)) > 149 Collation::MERGE_SEPARATOR_LOWER32 || 150 rightLower32 == 0) { 151 ++rightLimit; 152 } 153 154 // Compare the segments. 155 int32_t leftIndex = leftLimit; 156 int32_t rightIndex = rightLimit; 157 for(;;) { 158 int32_t leftSecondary = 0; 159 while(leftSecondary == 0 && leftIndex > leftStart) { 160 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; 161 } 162 163 int32_t rightSecondary = 0; 164 while(rightSecondary == 0 && rightIndex > rightStart) { 165 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; 166 } 167 168 if(leftSecondary != rightSecondary) { 169 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 170 } 171 if(leftSecondary == 0) { break; } 172 } 173 174 // Did we reach the end of either string? 175 // Both strings have the same number of merge separators, 176 // or else there would have been a primary-level difference. 177 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); 178 if(left.getCE(leftLimit) == Collation::NO_CE) { break; } 179 // Skip both merge separators and continue. 180 leftStart = leftLimit + 1; 181 rightStart = rightLimit + 1; 182 } 183 } 184 } 185 186 if((options & CollationSettings::CASE_LEVEL) != 0) { 187 int32_t strength = CollationSettings::getStrength(options); 188 int32_t leftIndex = 0; 189 int32_t rightIndex = 0; 190 for(;;) { 191 uint32_t leftCase, leftLower32, rightCase; 192 if(strength == UCOL_PRIMARY) { 193 // Primary+caseLevel: Ignore case level weights of primary ignorables. 194 // Otherwise we would get a-umlaut > a 195 // which is not desirable for accent-insensitive sorting. 196 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 197 // with only primary weights. 198 int64_t ce; 199 do { 200 ce = left.getCE(leftIndex++); 201 leftCase = (uint32_t)ce; 202 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); 203 leftLower32 = leftCase; 204 leftCase &= 0xc000; 205 206 do { 207 ce = right.getCE(rightIndex++); 208 rightCase = (uint32_t)ce; 209 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); 210 rightCase &= 0xc000; 211 } else { 212 // Secondary+caseLevel: By analogy with the above, 213 // ignore case level weights of secondary ignorables. 214 // 215 // Note: A tertiary CE has uppercase case bits (0.0.ut) 216 // to keep tertiary+caseFirst well-formed. 217 // 218 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 219 // Otherwise a tertiary CE's uppercase would be no greater than 220 // a primary/secondary CE's uppercase. 221 // (See UCA well-formedness condition 2.) 222 // We could construct a special case weight higher than uppercase, 223 // but it's simpler to always ignore case weights of secondary ignorables, 224 // turning 0.0.ut into 0.0.0.t. 225 // (See LDML Collation, Case Parameters.) 226 do { 227 leftCase = (uint32_t)left.getCE(leftIndex++); 228 } while(leftCase <= 0xffff); 229 leftLower32 = leftCase; 230 leftCase &= 0xc000; 231 232 do { 233 rightCase = (uint32_t)right.getCE(rightIndex++); 234 } while(rightCase <= 0xffff); 235 rightCase &= 0xc000; 236 } 237 238 // No need to handle NO_CE and MERGE_SEPARATOR specially: 239 // There is one case weight for each previous-level weight, 240 // so level length differences were handled there. 241 if(leftCase != rightCase) { 242 if((options & CollationSettings::UPPER_FIRST) == 0) { 243 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 244 } else { 245 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 246 } 247 } 248 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } 249 } 250 } 251 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 252 253 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); 254 255 int32_t leftIndex = 0; 256 int32_t rightIndex = 0; 257 uint32_t anyQuaternaries = 0; 258 for(;;) { 259 uint32_t leftLower32, leftTertiary; 260 do { 261 leftLower32 = (uint32_t)left.getCE(leftIndex++); 262 anyQuaternaries |= leftLower32; 263 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 264 (leftLower32 & 0xc0c0) == 0); 265 leftTertiary = leftLower32 & tertiaryMask; 266 } while(leftTertiary == 0); 267 268 uint32_t rightLower32, rightTertiary; 269 do { 270 rightLower32 = (uint32_t)right.getCE(rightIndex++); 271 anyQuaternaries |= rightLower32; 272 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 273 (rightLower32 & 0xc0c0) == 0); 274 rightTertiary = rightLower32 & tertiaryMask; 275 } while(rightTertiary == 0); 276 277 if(leftTertiary != rightTertiary) { 278 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 279 // Pass through NO_CE and MERGE_SEPARATOR 280 // and keep real tertiary weights larger than the MERGE_SEPARATOR. 281 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 282 // to keep tertiary CEs well-formed. 283 // Their case+tertiary weights must be greater than those of 284 // primary and secondary CEs. 285 if(leftTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { 286 if(leftLower32 > 0xffff) { 287 leftTertiary ^= 0xc000; 288 } else { 289 leftTertiary += 0x4000; 290 } 291 } 292 if(rightTertiary > Collation::MERGE_SEPARATOR_WEIGHT16) { 293 if(rightLower32 > 0xffff) { 294 rightTertiary ^= 0xc000; 295 } else { 296 rightTertiary += 0x4000; 297 } 298 } 299 } 300 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 301 } 302 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } 303 } 304 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 305 306 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { 307 // If there are no "variable" CEs and no non-zero quaternary weights, 308 // then there are no quaternary differences. 309 return UCOL_EQUAL; 310 } 311 312 leftIndex = 0; 313 rightIndex = 0; 314 for(;;) { 315 uint32_t leftQuaternary; 316 do { 317 int64_t ce = left.getCE(leftIndex++); 318 leftQuaternary = (uint32_t)ce & 0xffff; 319 if(leftQuaternary == 0) { 320 // Variable primary or completely ignorable. 321 leftQuaternary = (uint32_t)(ce >> 32); 322 } else if(leftQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { 323 // Leave NO_CE or MERGE_SEPARATOR as is. 324 } else { 325 // Regular CE, not tertiary ignorable. 326 // Preserve the quaternary weight in bits 7..6. 327 leftQuaternary |= 0xffffff3f; 328 } 329 } while(leftQuaternary == 0); 330 331 uint32_t rightQuaternary; 332 do { 333 int64_t ce = right.getCE(rightIndex++); 334 rightQuaternary = (uint32_t)ce & 0xffff; 335 if(rightQuaternary == 0) { 336 // Variable primary or completely ignorable. 337 rightQuaternary = (uint32_t)(ce >> 32); 338 } else if(rightQuaternary <= Collation::MERGE_SEPARATOR_WEIGHT16) { 339 // Leave NO_CE or MERGE_SEPARATOR as is. 340 } else { 341 // Regular CE, not tertiary ignorable. 342 // Preserve the quaternary weight in bits 7..6. 343 rightQuaternary |= 0xffffff3f; 344 } 345 } while(rightQuaternary == 0); 346 347 if(leftQuaternary != rightQuaternary) { 348 // Return the difference, with script reordering. 349 const uint8_t *reorderTable = settings.reorderTable; 350 if (reorderTable != NULL) { 351 leftQuaternary = Collation::reorder(reorderTable, leftQuaternary); 352 rightQuaternary = Collation::reorder(reorderTable, rightQuaternary); 353 } 354 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 355 } 356 if(leftQuaternary == Collation::NO_CE_WEIGHT16) { break; } 357 } 358 return UCOL_EQUAL; 359} 360 361U_NAMESPACE_END 362 363#endif // !UCONFIG_NO_COLLATION 364