1/* 2******************************************************************************* 3* Copyright (C) 2013-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5******************************************************************************* 6* collationbuilder.cpp 7* 8* (replaced the former ucol_bld.cpp) 9* 10* created on: 2013may06 11* created by: Markus W. Scherer 12*/ 13 14#ifdef DEBUG_COLLATION_BUILDER 15#include <stdio.h> 16#endif 17 18#include "unicode/utypes.h" 19 20#if !UCONFIG_NO_COLLATION 21 22#include "unicode/caniter.h" 23#include "unicode/normalizer2.h" 24#include "unicode/tblcoll.h" 25#include "unicode/parseerr.h" 26#include "unicode/uchar.h" 27#include "unicode/ucol.h" 28#include "unicode/unistr.h" 29#include "unicode/usetiter.h" 30#include "unicode/utf16.h" 31#include "unicode/uversion.h" 32#include "cmemory.h" 33#include "collation.h" 34#include "collationbuilder.h" 35#include "collationdata.h" 36#include "collationdatabuilder.h" 37#include "collationfastlatin.h" 38#include "collationroot.h" 39#include "collationrootelements.h" 40#include "collationruleparser.h" 41#include "collationsettings.h" 42#include "collationtailoring.h" 43#include "collationweights.h" 44#include "normalizer2impl.h" 45#include "uassert.h" 46#include "ucol_imp.h" 47#include "utf16collationiterator.h" 48 49#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 50 51U_NAMESPACE_BEGIN 52 53namespace { 54 55class BundleImporter : public CollationRuleParser::Importer { 56public: 57 BundleImporter() : rules(NULL) {} 58 virtual ~BundleImporter(); 59 virtual const UnicodeString *getRules( 60 const char *localeID, const char *collationType, 61 const char *&errorReason, UErrorCode &errorCode); 62 63private: 64 UnicodeString *rules; 65}; 66 67BundleImporter::~BundleImporter() { 68 delete rules; 69} 70 71const UnicodeString * 72BundleImporter::getRules( 73 const char *localeID, const char *collationType, 74 const char *& /*errorReason*/, UErrorCode &errorCode) { 75 delete rules; 76 return rules = CollationLoader::loadRules(localeID, collationType, errorCode); 77} 78 79} // namespace 80 81// RuleBasedCollator implementation ---------------------------------------- *** 82 83// These methods are here, rather than in rulebasedcollator.cpp, 84// for modularization: 85// Most code using Collator does not need to build a Collator from rules. 86// By moving these constructors and helper methods to a separate file, 87// most code will not have a static dependency on the builder code. 88 89RuleBasedCollator::RuleBasedCollator() 90 : data(NULL), 91 settings(NULL), 92 tailoring(NULL), 93 validLocale(""), 94 explicitlySetAttributes(0), 95 actualLocaleIsSameAsValid(FALSE) { 96} 97 98RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) 99 : data(NULL), 100 settings(NULL), 101 tailoring(NULL), 102 validLocale(""), 103 explicitlySetAttributes(0), 104 actualLocaleIsSameAsValid(FALSE) { 105 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); 106} 107 108RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, 109 UErrorCode &errorCode) 110 : data(NULL), 111 settings(NULL), 112 tailoring(NULL), 113 validLocale(""), 114 explicitlySetAttributes(0), 115 actualLocaleIsSameAsValid(FALSE) { 116 internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); 117} 118 119RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 120 UColAttributeValue decompositionMode, 121 UErrorCode &errorCode) 122 : data(NULL), 123 settings(NULL), 124 tailoring(NULL), 125 validLocale(""), 126 explicitlySetAttributes(0), 127 actualLocaleIsSameAsValid(FALSE) { 128 internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); 129} 130 131RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 132 ECollationStrength strength, 133 UColAttributeValue decompositionMode, 134 UErrorCode &errorCode) 135 : data(NULL), 136 settings(NULL), 137 tailoring(NULL), 138 validLocale(""), 139 explicitlySetAttributes(0), 140 actualLocaleIsSameAsValid(FALSE) { 141 internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); 142} 143 144RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 145 UParseError &parseError, UnicodeString &reason, 146 UErrorCode &errorCode) 147 : data(NULL), 148 settings(NULL), 149 tailoring(NULL), 150 validLocale(""), 151 explicitlySetAttributes(0), 152 actualLocaleIsSameAsValid(FALSE) { 153 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); 154} 155 156void 157RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, 158 int32_t strength, 159 UColAttributeValue decompositionMode, 160 UParseError *outParseError, UnicodeString *outReason, 161 UErrorCode &errorCode) { 162 const CollationTailoring *base = CollationRoot::getRoot(errorCode); 163 if(U_FAILURE(errorCode)) { return; } 164 if(outReason != NULL) { outReason->remove(); } 165 CollationBuilder builder(base, errorCode); 166 UVersionInfo noVersion = { 0, 0, 0, 0 }; 167 BundleImporter importer; 168 LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, 169 &importer, 170 outParseError, errorCode)); 171 if(U_FAILURE(errorCode)) { 172 const char *reason = builder.getErrorReason(); 173 if(reason != NULL && outReason != NULL) { 174 *outReason = UnicodeString(reason, -1, US_INV); 175 } 176 return; 177 } 178 const CollationSettings &ts = *t->settings; 179 uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT]; 180 int32_t fastLatinOptions = CollationFastLatin::getOptions( 181 t->data, ts, fastLatinPrimaries, LENGTHOF(fastLatinPrimaries)); 182 if((strength != UCOL_DEFAULT && strength != ts.getStrength()) || 183 (decompositionMode != UCOL_DEFAULT && 184 decompositionMode != ts.getFlag(CollationSettings::CHECK_FCD)) || 185 fastLatinOptions != ts.fastLatinOptions || 186 (fastLatinOptions >= 0 && 187 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries, 188 sizeof(fastLatinPrimaries)) != 0)) { 189 CollationSettings *ownedSettings = SharedObject::copyOnWrite(t->settings); 190 if(ownedSettings == NULL) { 191 errorCode = U_MEMORY_ALLOCATION_ERROR; 192 return; 193 } 194 if(strength != UCOL_DEFAULT) { 195 ownedSettings->setStrength(strength, 0, errorCode); 196 } 197 if(decompositionMode != UCOL_DEFAULT) { 198 ownedSettings->setFlag(CollationSettings::CHECK_FCD, decompositionMode, 0, errorCode); 199 } 200 ownedSettings->fastLatinOptions = CollationFastLatin::getOptions( 201 t->data, *ownedSettings, 202 ownedSettings->fastLatinPrimaries, LENGTHOF(ownedSettings->fastLatinPrimaries)); 203 } 204 if(U_FAILURE(errorCode)) { return; } 205 t->actualLocale.setToBogus(); 206 adoptTailoring(t.orphan()); 207} 208 209// CollationBuilder implementation ----------------------------------------- *** 210 211CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) 212 : nfd(*Normalizer2::getNFDInstance(errorCode)), 213 fcd(*Normalizer2Factory::getFCDInstance(errorCode)), 214 nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 215 base(b), 216 baseData(b->data), 217 rootElements(b->data->rootElements, b->data->rootElementsLength), 218 variableTop(0), 219 dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), 220 errorReason(NULL), 221 cesLength(0), 222 rootPrimaryIndexes(errorCode), nodes(errorCode) { 223 nfcImpl.ensureCanonIterData(errorCode); 224 if(U_FAILURE(errorCode)) { 225 errorReason = "CollationBuilder fields initialization failed"; 226 return; 227 } 228 if(dataBuilder == NULL) { 229 errorCode = U_MEMORY_ALLOCATION_ERROR; 230 return; 231 } 232 dataBuilder->initForTailoring(baseData, errorCode); 233 if(U_FAILURE(errorCode)) { 234 errorReason = "CollationBuilder initialization failed"; 235 } 236} 237 238CollationBuilder::~CollationBuilder() { 239 delete dataBuilder; 240} 241 242CollationTailoring * 243CollationBuilder::parseAndBuild(const UnicodeString &ruleString, 244 const UVersionInfo rulesVersion, 245 CollationRuleParser::Importer *importer, 246 UParseError *outParseError, 247 UErrorCode &errorCode) { 248 if(U_FAILURE(errorCode)) { return NULL; } 249 if(baseData->rootElements == NULL) { 250 errorCode = U_MISSING_RESOURCE_ERROR; 251 errorReason = "missing root elements data, tailoring not supported"; 252 return NULL; 253 } 254 LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); 255 if(tailoring.isNull() || tailoring->isBogus()) { 256 errorCode = U_MEMORY_ALLOCATION_ERROR; 257 return NULL; 258 } 259 CollationRuleParser parser(baseData, errorCode); 260 if(U_FAILURE(errorCode)) { return NULL; } 261 // Note: This always bases &[last variable] and &[first regular] 262 // on the root collator's maxVariable/variableTop. 263 // If we wanted this to change after [maxVariable x], then we would keep 264 // the tailoring.settings pointer here and read its variableTop when we need it. 265 // See http://unicode.org/cldr/trac/ticket/6070 266 variableTop = base->settings->variableTop; 267 parser.setSink(this); 268 parser.setImporter(importer); 269 parser.parse(ruleString, *SharedObject::copyOnWrite(tailoring->settings), 270 outParseError, errorCode); 271 errorReason = parser.getErrorReason(); 272 if(U_FAILURE(errorCode)) { return NULL; } 273 if(dataBuilder->hasMappings()) { 274 makeTailoredCEs(errorCode); 275 closeOverComposites(errorCode); 276 finalizeCEs(errorCode); 277 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 278 optimizeSet.add(0, 0x7f); 279 optimizeSet.add(0xc0, 0xff); 280 // Hangul is decomposed on the fly during collation, 281 // and the tailoring data is always built with HANGUL_TAG specials. 282 optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 283 dataBuilder->optimize(optimizeSet, errorCode); 284 tailoring->ensureOwnedData(errorCode); 285 if(U_FAILURE(errorCode)) { return NULL; } 286 if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } 287 dataBuilder->build(*tailoring->ownedData, errorCode); 288 tailoring->builder = dataBuilder; 289 dataBuilder = NULL; 290 } else { 291 tailoring->data = baseData; 292 } 293 if(U_FAILURE(errorCode)) { return NULL; } 294 tailoring->rules = ruleString; 295 tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination 296 tailoring->setVersion(base->version, rulesVersion); 297 return tailoring.orphan(); 298} 299 300void 301CollationBuilder::addReset(int32_t strength, const UnicodeString &str, 302 const char *&parserErrorReason, UErrorCode &errorCode) { 303 if(U_FAILURE(errorCode)) { return; } 304 U_ASSERT(!str.isEmpty()); 305 if(str.charAt(0) == CollationRuleParser::POS_LEAD) { 306 ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); 307 cesLength = 1; 308 if(U_FAILURE(errorCode)) { return; } 309 U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); 310 } else { 311 // normal reset to a character or string 312 UnicodeString nfdString = nfd.normalize(str, errorCode); 313 if(U_FAILURE(errorCode)) { 314 parserErrorReason = "normalizing the reset position"; 315 return; 316 } 317 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 318 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 319 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 320 parserErrorReason = "reset position maps to too many collation elements (more than 31)"; 321 return; 322 } 323 } 324 if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position 325 326 // &[before strength]position 327 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); 328 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 329 if(U_FAILURE(errorCode)) { return; } 330 331 int64_t node = nodes.elementAti(index); 332 // If the index is for a "weaker" tailored node, 333 // then skip backwards over this and further "weaker" nodes. 334 while(strengthFromNode(node) > strength) { 335 index = previousIndexFromNode(node); 336 node = nodes.elementAti(index); 337 } 338 339 // Find or insert a node whose index we will put into a temporary CE. 340 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 341 // Reset to just before this same-strength tailored node. 342 index = previousIndexFromNode(node); 343 } else if(strength == UCOL_PRIMARY) { 344 // root primary node (has no previous index) 345 uint32_t p = weight32FromNode(node); 346 if(p == 0) { 347 errorCode = U_UNSUPPORTED_ERROR; 348 parserErrorReason = "reset primary-before ignorable not possible"; 349 return; 350 } 351 if(p <= rootElements.getFirstPrimary()) { 352 // There is no primary gap between ignorables and the space-first-primary. 353 errorCode = U_UNSUPPORTED_ERROR; 354 parserErrorReason = "reset primary-before first non-ignorable not supported"; 355 return; 356 } 357 if(p == Collation::FIRST_TRAILING_PRIMARY) { 358 // We do not support tailoring to an unassigned-implicit CE. 359 errorCode = U_UNSUPPORTED_ERROR; 360 parserErrorReason = "reset primary-before [first trailing] not supported"; 361 return; 362 } 363 p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); 364 index = findOrInsertNodeForPrimary(p, errorCode); 365 // Go to the last node in this list: 366 // Tailor after the last node between adjacent root nodes. 367 for(;;) { 368 node = nodes.elementAti(index); 369 int32_t nextIndex = nextIndexFromNode(node); 370 if(nextIndex == 0) { break; } 371 index = nextIndex; 372 } 373 } else { 374 // &[before 2] or &[before 3] 375 index = findCommonNode(index, UCOL_SECONDARY); 376 if(strength >= UCOL_TERTIARY) { 377 index = findCommonNode(index, UCOL_TERTIARY); 378 } 379 node = nodes.elementAti(index); 380 if(strengthFromNode(node) == strength) { 381 // Found a same-strength node with an explicit weight. 382 uint32_t weight16 = weight16FromNode(node); 383 if(weight16 == 0) { 384 errorCode = U_UNSUPPORTED_ERROR; 385 if(strength == UCOL_SECONDARY) { 386 parserErrorReason = "reset secondary-before secondary ignorable not possible"; 387 } else { 388 parserErrorReason = "reset tertiary-before completely ignorable not possible"; 389 } 390 return; 391 } 392 U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16); 393 int32_t previousIndex = previousIndexFromNode(node); 394 if(weight16 == Collation::COMMON_WEIGHT16) { 395 // Reset to just before this same-strength common-weight node. 396 index = previousIndex; 397 } else { 398 // A non-common weight is only possible from a root CE. 399 // Find the higher-level weights, which must all be explicit, 400 // and then find the preceding weight for this level. 401 uint32_t previousWeight16 = 0; 402 int32_t previousWeightIndex = -1; 403 int32_t i = index; 404 if(strength == UCOL_SECONDARY) { 405 uint32_t p; 406 do { 407 i = previousIndexFromNode(node); 408 node = nodes.elementAti(i); 409 if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) && 410 previousWeightIndex < 0) { 411 previousWeightIndex = i; 412 previousWeight16 = weight16FromNode(node); 413 } 414 } while(strengthFromNode(node) > UCOL_PRIMARY); 415 U_ASSERT(!isTailoredNode(node)); 416 p = weight32FromNode(node); 417 weight16 = rootElements.getSecondaryBefore(p, weight16); 418 } else { 419 uint32_t p, s; 420 do { 421 i = previousIndexFromNode(node); 422 node = nodes.elementAti(i); 423 if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) && 424 previousWeightIndex < 0) { 425 previousWeightIndex = i; 426 previousWeight16 = weight16FromNode(node); 427 } 428 } while(strengthFromNode(node) > UCOL_SECONDARY); 429 U_ASSERT(!isTailoredNode(node)); 430 if(strengthFromNode(node) == UCOL_SECONDARY) { 431 s = weight16FromNode(node); 432 do { 433 i = previousIndexFromNode(node); 434 node = nodes.elementAti(i); 435 } while(strengthFromNode(node) > UCOL_PRIMARY); 436 U_ASSERT(!isTailoredNode(node)); 437 } else { 438 U_ASSERT(!nodeHasBefore2(node)); 439 s = Collation::COMMON_WEIGHT16; 440 } 441 p = weight32FromNode(node); 442 weight16 = rootElements.getTertiaryBefore(p, s, weight16); 443 U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); 444 } 445 // Find or insert the new explicit weight before the current one. 446 if(previousWeightIndex >= 0 && weight16 == previousWeight16) { 447 // Tailor after the last node between adjacent root nodes. 448 index = previousIndex; 449 } else { 450 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 451 index = insertNodeBetween(previousIndex, index, node, errorCode); 452 } 453 } 454 } else { 455 // Found a stronger node with implied strength-common weight. 456 int64_t hasBefore3 = 0; 457 if(strength == UCOL_SECONDARY) { 458 U_ASSERT(!nodeHasBefore2(node)); 459 // Move the HAS_BEFORE3 flag from the parent node 460 // to the new secondary common node. 461 hasBefore3 = node & HAS_BEFORE3; 462 node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2; 463 } else { 464 U_ASSERT(!nodeHasBefore3(node)); 465 node |= HAS_BEFORE3; 466 } 467 nodes.setElementAt(node, index); 468 int32_t nextIndex = nextIndexFromNode(node); 469 // Insert default nodes with weights 02 and 05, reset to the 02 node. 470 node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); 471 index = insertNodeBetween(index, nextIndex, node, errorCode); 472 node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | 473 nodeFromStrength(strength); 474 insertNodeBetween(index, nextIndex, node, errorCode); 475 } 476 // Strength of the temporary CE = strength of its reset position. 477 // Code above raises an error if the before-strength is stronger. 478 strength = ceStrength(ces[cesLength - 1]); 479 } 480 if(U_FAILURE(errorCode)) { 481 parserErrorReason = "inserting reset position for &[before n]"; 482 return; 483 } 484 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 485} 486 487int64_t 488CollationBuilder::getSpecialResetPosition(const UnicodeString &str, 489 const char *&parserErrorReason, UErrorCode &errorCode) { 490 U_ASSERT(str.length() == 2); 491 int64_t ce; 492 int32_t strength = UCOL_PRIMARY; 493 UBool isBoundary = FALSE; 494 UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; 495 U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); 496 switch(pos) { 497 case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: 498 // Quaternary CEs are not supported. 499 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 500 return 0; 501 case CollationRuleParser::LAST_TERTIARY_IGNORABLE: 502 return 0; 503 case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { 504 // Look for a tailored tertiary node after [0, 0, 0]. 505 int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); 506 if(U_FAILURE(errorCode)) { return 0; } 507 int64_t node = nodes.elementAti(index); 508 if((index = nextIndexFromNode(node)) != 0) { 509 node = nodes.elementAti(index); 510 U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); 511 if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { 512 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); 513 } 514 } 515 return rootElements.getFirstTertiaryCE(); 516 // No need to look for nodeHasAnyBefore() on a tertiary node. 517 } 518 case CollationRuleParser::LAST_SECONDARY_IGNORABLE: 519 ce = rootElements.getLastTertiaryCE(); 520 strength = UCOL_TERTIARY; 521 break; 522 case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { 523 // Look for a tailored secondary node after [0, 0, *]. 524 int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); 525 if(U_FAILURE(errorCode)) { return 0; } 526 int64_t node = nodes.elementAti(index); 527 while((index = nextIndexFromNode(node)) != 0) { 528 node = nodes.elementAti(index); 529 strength = strengthFromNode(node); 530 if(strength < UCOL_SECONDARY) { break; } 531 if(strength == UCOL_SECONDARY) { 532 if(isTailoredNode(node)) { 533 if(nodeHasBefore3(node)) { 534 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 535 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 536 } 537 return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); 538 } else { 539 break; 540 } 541 } 542 } 543 ce = rootElements.getFirstSecondaryCE(); 544 strength = UCOL_SECONDARY; 545 break; 546 } 547 case CollationRuleParser::LAST_PRIMARY_IGNORABLE: 548 ce = rootElements.getLastSecondaryCE(); 549 strength = UCOL_SECONDARY; 550 break; 551 case CollationRuleParser::FIRST_VARIABLE: 552 ce = rootElements.getFirstPrimaryCE(); 553 isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 554 break; 555 case CollationRuleParser::LAST_VARIABLE: 556 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 557 break; 558 case CollationRuleParser::FIRST_REGULAR: 559 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 560 isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 561 break; 562 case CollationRuleParser::LAST_REGULAR: 563 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 564 // for backward compatibility with behavior before the introduction of 565 // script-first-primary CEs in the root collator. 566 ce = rootElements.firstCEWithPrimaryAtLeast( 567 baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); 568 break; 569 case CollationRuleParser::FIRST_IMPLICIT: { 570 uint32_t ce32 = baseData->getCE32(0x4e00); 571 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::OFFSET_TAG)); 572 ce = baseData->getCEFromOffsetCE32(0x4e00, ce32); 573 break; 574 } 575 case CollationRuleParser::LAST_IMPLICIT: 576 // We do not support tailoring to an unassigned-implicit CE. 577 errorCode = U_UNSUPPORTED_ERROR; 578 parserErrorReason = "reset to [last implicit] not supported"; 579 return 0; 580 case CollationRuleParser::FIRST_TRAILING: 581 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 582 isBoundary = TRUE; // trailing first primary (there is no mapping for it) 583 break; 584 case CollationRuleParser::LAST_TRAILING: 585 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 586 parserErrorReason = "LDML forbids tailoring to U+FFFF"; 587 return 0; 588 default: 589 U_ASSERT(FALSE); 590 return 0; 591 } 592 593 int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); 594 if(U_FAILURE(errorCode)) { return 0; } 595 int64_t node = nodes.elementAti(index); 596 if((pos & 1) == 0) { 597 // even pos = [first xyz] 598 if(!nodeHasAnyBefore(node) && isBoundary) { 599 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 600 // It is reachable via its special contraction, but is not normally used. 601 // Find the first character tailored after the boundary CE, 602 // or the first real root CE after it. 603 if((index = nextIndexFromNode(node)) != 0) { 604 // If there is a following node, then it must be tailored 605 // because there are no root CEs with a boundary primary 606 // and non-common secondary/tertiary weights. 607 node = nodes.elementAti(index); 608 U_ASSERT(isTailoredNode(node)); 609 ce = tempCEFromIndexAndStrength(index, strength); 610 } else { 611 U_ASSERT(strength == UCOL_PRIMARY); 612 uint32_t p = (uint32_t)(ce >> 32); 613 int32_t pIndex = rootElements.findPrimary(p); 614 UBool isCompressible = baseData->isCompressiblePrimary(p); 615 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 616 ce = Collation::makeCE(p); 617 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); 618 if(U_FAILURE(errorCode)) { return 0; } 619 node = nodes.elementAti(index); 620 } 621 } 622 if(nodeHasAnyBefore(node)) { 623 // Get the first node that was tailored before this one at a weaker strength. 624 if(nodeHasBefore2(node)) { 625 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 626 node = nodes.elementAti(index); 627 } 628 if(nodeHasBefore3(node)) { 629 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 630 } 631 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 632 ce = tempCEFromIndexAndStrength(index, strength); 633 } 634 } else { 635 // odd pos = [last xyz] 636 // Find the last node that was tailored after the [last xyz] 637 // at a strength no greater than the position's strength. 638 for(;;) { 639 int32_t nextIndex = nextIndexFromNode(node); 640 if(nextIndex == 0) { break; } 641 int64_t nextNode = nodes.elementAti(nextIndex); 642 if(strengthFromNode(nextNode) < strength) { break; } 643 index = nextIndex; 644 node = nextNode; 645 } 646 // Do not make a temporary CE for a root node. 647 // This last node might be the node for the root CE itself, 648 // or a node with a common secondary or tertiary weight. 649 if(isTailoredNode(node)) { 650 ce = tempCEFromIndexAndStrength(index, strength); 651 } 652 } 653 return ce; 654} 655 656void 657CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, 658 const UnicodeString &str, const UnicodeString &extension, 659 const char *&parserErrorReason, UErrorCode &errorCode) { 660 if(U_FAILURE(errorCode)) { return; } 661 UnicodeString nfdPrefix; 662 if(!prefix.isEmpty()) { 663 nfd.normalize(prefix, nfdPrefix, errorCode); 664 if(U_FAILURE(errorCode)) { 665 parserErrorReason = "normalizing the relation prefix"; 666 return; 667 } 668 } 669 UnicodeString nfdString = nfd.normalize(str, errorCode); 670 if(U_FAILURE(errorCode)) { 671 parserErrorReason = "normalizing the relation string"; 672 return; 673 } 674 675 // The runtime code decomposes Hangul syllables on the fly, 676 // with recursive processing but without making the Jamo pieces visible for matching. 677 // It does not work with certain types of contextual mappings. 678 int32_t nfdLength = nfdString.length(); 679 if(nfdLength >= 2) { 680 UChar c = nfdString.charAt(0); 681 if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { 682 // While handling a Hangul syllable, contractions starting with Jamo L or V 683 // would not see the following Jamo of that syllable. 684 errorCode = U_UNSUPPORTED_ERROR; 685 parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; 686 return; 687 } 688 c = nfdString.charAt(nfdLength - 1); 689 if(Hangul::isJamoL(c) || 690 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { 691 // A contraction ending with Jamo L or L+V would require 692 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 693 // or decomposing a following Hangul syllable on the fly, during contraction matching. 694 errorCode = U_UNSUPPORTED_ERROR; 695 parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; 696 return; 697 } 698 // A Hangul syllable completely inside a contraction is ok. 699 } 700 // Note: If there is a prefix, then the parser checked that 701 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 702 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 703 // (While handling a Hangul syllable, prefixes on Jamo V or T 704 // would not see the previous Jamo of that syllable.) 705 706 if(strength != UCOL_IDENTICAL) { 707 // Find the node index after which we insert the new tailored node. 708 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 709 U_ASSERT(cesLength > 0); 710 int64_t ce = ces[cesLength - 1]; 711 if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { 712 // There is no primary gap between ignorables and the space-first-primary. 713 errorCode = U_UNSUPPORTED_ERROR; 714 parserErrorReason = "tailoring primary after ignorables not supported"; 715 return; 716 } 717 if(strength == UCOL_QUATERNARY && ce == 0) { 718 // The CE data structure does not support non-zero quaternary weights 719 // on tertiary ignorables. 720 errorCode = U_UNSUPPORTED_ERROR; 721 parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; 722 return; 723 } 724 // Insert the new tailored node. 725 index = insertTailoredNodeAfter(index, strength, errorCode); 726 if(U_FAILURE(errorCode)) { 727 parserErrorReason = "modifying collation elements"; 728 return; 729 } 730 // Strength of the temporary CE: 731 // The new relation may yield a stronger CE but not a weaker one. 732 int32_t tempStrength = ceStrength(ce); 733 if(strength < tempStrength) { tempStrength = strength; } 734 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 735 } 736 737 setCaseBits(nfdString, parserErrorReason, errorCode); 738 if(U_FAILURE(errorCode)) { return; } 739 740 int32_t cesLengthBeforeExtension = cesLength; 741 if(!extension.isEmpty()) { 742 UnicodeString nfdExtension = nfd.normalize(extension, errorCode); 743 if(U_FAILURE(errorCode)) { 744 parserErrorReason = "normalizing the relation extension"; 745 return; 746 } 747 cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); 748 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 749 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 750 parserErrorReason = 751 "extension string adds too many collation elements (more than 31 total)"; 752 return; 753 } 754 } 755 uint32_t ce32 = Collation::UNASSIGNED_CE32; 756 if((prefix != nfdPrefix || str != nfdString) && 757 !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { 758 // Map from the original input to the CEs. 759 // We do this in case the canonical closure is incomplete, 760 // so that it is possible to explicitly provide the missing mappings. 761 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); 762 } 763 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); 764 if(U_FAILURE(errorCode)) { 765 parserErrorReason = "writing collation elements"; 766 return; 767 } 768 cesLength = cesLengthBeforeExtension; 769} 770 771int32_t 772CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, 773 UErrorCode &errorCode) { 774 if(U_FAILURE(errorCode)) { return 0; } 775 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); 776 777 // Find the last CE that is at least as "strong" as the requested difference. 778 // Note: Stronger is smaller (UCOL_PRIMARY=0). 779 int64_t ce; 780 for(;; --cesLength) { 781 if(cesLength == 0) { 782 ce = ces[0] = 0; 783 cesLength = 1; 784 break; 785 } else { 786 ce = ces[cesLength - 1]; 787 } 788 if(ceStrength(ce) <= strength) { break; } 789 } 790 791 if(isTempCE(ce)) { 792 // No need to findCommonNode() here for lower levels 793 // because insertTailoredNodeAfter() will do that anyway. 794 return indexFromTempCE(ce); 795 } 796 797 // root CE 798 if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { 799 errorCode = U_UNSUPPORTED_ERROR; 800 parserErrorReason = "tailoring relative to an unassigned code point not supported"; 801 return 0; 802 } 803 return findOrInsertNodeForRootCE(ce, strength, errorCode); 804} 805 806int32_t 807CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { 808 if(U_FAILURE(errorCode)) { return 0; } 809 U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); 810 811 // Find or insert the node for each of the root CE's weights, 812 // down to the requested level/strength. 813 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 814 U_ASSERT((ce & 0xc0) == 0); 815 int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode); 816 if(strength >= UCOL_SECONDARY) { 817 uint32_t lower32 = (uint32_t)ce; 818 index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); 819 if(strength >= UCOL_TERTIARY) { 820 index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, 821 UCOL_TERTIARY, errorCode); 822 } 823 } 824 return index; 825} 826 827namespace { 828 829/** 830 * Like Java Collections.binarySearch(List, key, Comparator). 831 * 832 * @return the index>=0 where the item was found, 833 * or the index<0 for inserting the string at ~index in sorted order 834 * (index into rootPrimaryIndexes) 835 */ 836int32_t 837binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, 838 const int64_t *nodes, uint32_t p) { 839 if(length == 0) { return ~0; } 840 int32_t start = 0; 841 int32_t limit = length; 842 for (;;) { 843 int32_t i = (start + limit) / 2; 844 int64_t node = nodes[rootPrimaryIndexes[i]]; 845 uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) 846 if (p == nodePrimary) { 847 return i; 848 } else if (p < nodePrimary) { 849 if (i == start) { 850 return ~start; // insert s before i 851 } 852 limit = i; 853 } else { 854 if (i == start) { 855 return ~(start + 1); // insert s after i 856 } 857 start = i; 858 } 859 } 860} 861 862} // namespace 863 864int32_t 865CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { 866 if(U_FAILURE(errorCode)) { return 0; } 867 868 int32_t rootIndex = binarySearchForRootPrimaryNode( 869 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 870 if(rootIndex >= 0) { 871 return rootPrimaryIndexes.elementAti(rootIndex); 872 } else { 873 // Start a new list of nodes with this primary. 874 int32_t index = nodes.size(); 875 nodes.addElement(nodeFromWeight32(p), errorCode); 876 rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); 877 return index; 878 } 879} 880 881int32_t 882CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { 883 if(U_FAILURE(errorCode)) { return 0; } 884 U_ASSERT(0 <= index && index < nodes.size()); 885 886 U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16); 887 // Only reset-before inserts common weights. 888 if(weight16 == Collation::COMMON_WEIGHT16) { 889 return findCommonNode(index, level); 890 } 891 // Find the root CE's weight for this level. 892 // Postpone insertion if not found: 893 // Insert the new root node before the next stronger node, 894 // or before the next root node with the same strength and a larger weight. 895 int64_t node = nodes.elementAti(index); 896 int32_t nextIndex; 897 while((nextIndex = nextIndexFromNode(node)) != 0) { 898 node = nodes.elementAti(nextIndex); 899 int32_t nextStrength = strengthFromNode(node); 900 if(nextStrength <= level) { 901 // Insert before a stronger node. 902 if(nextStrength < level) { break; } 903 // nextStrength == level 904 if(!isTailoredNode(node)) { 905 uint32_t nextWeight16 = weight16FromNode(node); 906 if(nextWeight16 == weight16) { 907 // Found the node for the root CE up to this level. 908 return nextIndex; 909 } 910 // Insert before a node with a larger same-strength weight. 911 if(nextWeight16 > weight16) { break; } 912 } 913 } 914 // Skip the next node. 915 index = nextIndex; 916 } 917 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 918 return insertNodeBetween(index, nextIndex, node, errorCode); 919} 920 921int32_t 922CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { 923 if(U_FAILURE(errorCode)) { return 0; } 924 U_ASSERT(0 <= index && index < nodes.size()); 925 if(strength >= UCOL_SECONDARY) { 926 index = findCommonNode(index, UCOL_SECONDARY); 927 if(strength >= UCOL_TERTIARY) { 928 index = findCommonNode(index, UCOL_TERTIARY); 929 } 930 } 931 // Postpone insertion: 932 // Insert the new node before the next one with a strength at least as strong. 933 int64_t node = nodes.elementAti(index); 934 int32_t nextIndex; 935 while((nextIndex = nextIndexFromNode(node)) != 0) { 936 node = nodes.elementAti(nextIndex); 937 if(strengthFromNode(node) <= strength) { break; } 938 // Skip the next node which has a weaker (larger) strength than the new one. 939 index = nextIndex; 940 } 941 node = IS_TAILORED | nodeFromStrength(strength); 942 return insertNodeBetween(index, nextIndex, node, errorCode); 943} 944 945int32_t 946CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, 947 UErrorCode &errorCode) { 948 if(U_FAILURE(errorCode)) { return 0; } 949 U_ASSERT(previousIndexFromNode(node) == 0); 950 U_ASSERT(nextIndexFromNode(node) == 0); 951 U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 952 // Append the new node and link it to the existing nodes. 953 int32_t newIndex = nodes.size(); 954 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 955 nodes.addElement(node, errorCode); 956 if(U_FAILURE(errorCode)) { return 0; } 957 // nodes[index].nextIndex = newIndex 958 node = nodes.elementAti(index); 959 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 960 // nodes[nextIndex].previousIndex = newIndex 961 if(nextIndex != 0) { 962 node = nodes.elementAti(nextIndex); 963 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 964 } 965 return newIndex; 966} 967 968int32_t 969CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { 970 U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); 971 int64_t node = nodes.elementAti(index); 972 if(strengthFromNode(node) >= strength) { 973 // The current node is no stronger. 974 return index; 975 } 976 if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 977 // The current node implies the strength-common weight. 978 return index; 979 } 980 index = nextIndexFromNode(node); 981 node = nodes.elementAti(index); 982 U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && 983 weight16FromNode(node) == BEFORE_WEIGHT16); 984 // Skip to the explicit common node. 985 do { 986 index = nextIndexFromNode(node); 987 node = nodes.elementAti(index); 988 U_ASSERT(strengthFromNode(node) >= strength); 989 } while(isTailoredNode(node) || strengthFromNode(node) > strength); 990 U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); 991 return index; 992} 993 994void 995CollationBuilder::setCaseBits(const UnicodeString &nfdString, 996 const char *&parserErrorReason, UErrorCode &errorCode) { 997 if(U_FAILURE(errorCode)) { return; } 998 int32_t numTailoredPrimaries = 0; 999 for(int32_t i = 0; i < cesLength; ++i) { 1000 if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } 1001 } 1002 // We should not be able to get too many case bits because 1003 // cesLength<=31==MAX_EXPANSION_LENGTH. 1004 // 31 pairs of case bits fit into an int64_t without setting its sign bit. 1005 U_ASSERT(numTailoredPrimaries <= 31); 1006 1007 int64_t cases = 0; 1008 if(numTailoredPrimaries > 0) { 1009 const UChar *s = nfdString.getBuffer(); 1010 UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); 1011 int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; 1012 if(U_FAILURE(errorCode)) { 1013 parserErrorReason = "fetching root CEs for tailored string"; 1014 return; 1015 } 1016 U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); 1017 1018 uint32_t lastCase = 0; 1019 int32_t numBasePrimaries = 0; 1020 for(int32_t i = 0; i < baseCEsLength; ++i) { 1021 int64_t ce = baseCEs.getCE(i); 1022 if((ce >> 32) != 0) { 1023 ++numBasePrimaries; 1024 uint32_t c = ((uint32_t)ce >> 14) & 3; 1025 U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 1026 if(numBasePrimaries < numTailoredPrimaries) { 1027 cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); 1028 } else if(numBasePrimaries == numTailoredPrimaries) { 1029 lastCase = c; 1030 } else if(c != lastCase) { 1031 // There are more base primary CEs than tailored primaries. 1032 // Set mixed case if the case bits of the remainder differ. 1033 lastCase = 1; 1034 // Nothing more can change. 1035 break; 1036 } 1037 } 1038 } 1039 if(numBasePrimaries >= numTailoredPrimaries) { 1040 cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); 1041 } 1042 } 1043 1044 for(int32_t i = 0; i < cesLength; ++i) { 1045 int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits 1046 int32_t strength = ceStrength(ce); 1047 if(strength == UCOL_PRIMARY) { 1048 ce |= (cases & 3) << 14; 1049 cases >>= 2; 1050 } else if(strength == UCOL_TERTIARY) { 1051 // Tertiary CEs must have uppercase bits. 1052 // See the LDML spec, and comments in class CollationCompare. 1053 ce |= 0x8000; 1054 } 1055 // Tertiary ignorable CEs must have 0 case bits. 1056 // We set 0 case bits for secondary CEs too 1057 // since currently only U+0345 is cased and maps to a secondary CE, 1058 // and it is lowercase. Other secondaries are uncased. 1059 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 1060 ces[i] = ce; 1061 } 1062} 1063 1064void 1065CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, 1066 UErrorCode &errorCode) { 1067 if(U_FAILURE(errorCode)) { return; } 1068 dataBuilder->suppressContractions(set, errorCode); 1069 if(U_FAILURE(errorCode)) { 1070 parserErrorReason = "application of [suppressContractions [set]] failed"; 1071 } 1072} 1073 1074void 1075CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, 1076 UErrorCode &errorCode) { 1077 if(U_FAILURE(errorCode)) { return; } 1078 optimizeSet.addAll(set); 1079} 1080 1081uint32_t 1082CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1083 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1084 UErrorCode &errorCode) { 1085 // Map from the NFD input to the CEs. 1086 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1087 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1088 addTailComposites(nfdPrefix, nfdString, errorCode); 1089 return ce32; 1090} 1091 1092uint32_t 1093CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1094 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1095 UErrorCode &errorCode) { 1096 if(U_FAILURE(errorCode)) { return ce32; } 1097 1098 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 1099 if(nfdPrefix.isEmpty()) { 1100 CanonicalIterator stringIter(nfdString, errorCode); 1101 if(U_FAILURE(errorCode)) { return ce32; } 1102 UnicodeString prefix; 1103 for(;;) { 1104 UnicodeString str = stringIter.next(); 1105 if(str.isBogus()) { break; } 1106 if(ignoreString(str, errorCode) || str == nfdString) { continue; } 1107 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1108 if(U_FAILURE(errorCode)) { return ce32; } 1109 } 1110 } else { 1111 CanonicalIterator prefixIter(nfdPrefix, errorCode); 1112 CanonicalIterator stringIter(nfdString, errorCode); 1113 if(U_FAILURE(errorCode)) { return ce32; } 1114 for(;;) { 1115 UnicodeString prefix = prefixIter.next(); 1116 if(prefix.isBogus()) { break; } 1117 if(ignorePrefix(prefix, errorCode)) { continue; } 1118 UBool samePrefix = prefix == nfdPrefix; 1119 for(;;) { 1120 UnicodeString str = stringIter.next(); 1121 if(str.isBogus()) { break; } 1122 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } 1123 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1124 if(U_FAILURE(errorCode)) { return ce32; } 1125 } 1126 stringIter.reset(); 1127 } 1128 } 1129 return ce32; 1130} 1131 1132void 1133CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1134 UErrorCode &errorCode) { 1135 if(U_FAILURE(errorCode)) { return; } 1136 1137 // Look for the last starter in the NFD string. 1138 UChar32 lastStarter; 1139 int32_t indexAfterLastStarter = nfdString.length(); 1140 for(;;) { 1141 if(indexAfterLastStarter == 0) { return; } // no starter at all 1142 lastStarter = nfdString.char32At(indexAfterLastStarter - 1); 1143 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 1144 indexAfterLastStarter -= U16_LENGTH(lastStarter); 1145 } 1146 // No closure to Hangul syllables since we decompose them on the fly. 1147 if(Hangul::isJamoL(lastStarter)) { return; } 1148 1149 // Are there any composites whose decomposition starts with the lastStarter? 1150 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 1151 // We might find some more equivalent mappings here if it did. 1152 UnicodeSet composites; 1153 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 1154 1155 UnicodeString decomp; 1156 UnicodeString newNFDString, newString; 1157 int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; 1158 UnicodeSetIterator iter(composites); 1159 while(iter.next()) { 1160 U_ASSERT(!iter.isString()); 1161 UChar32 composite = iter.getCodepoint(); 1162 nfd.getDecomposition(composite, decomp); 1163 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 1164 newNFDString, newString, errorCode)) { 1165 continue; 1166 } 1167 int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); 1168 if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { 1169 // Ignore mappings that we cannot store. 1170 continue; 1171 } 1172 // Note: It is possible that the newCEs do not make use of the mapping 1173 // for which we are adding the tail composites, in which case we might be adding 1174 // unnecessary mappings. 1175 // For example, when we add tail composites for ae^ (^=combining circumflex), 1176 // UCA discontiguous-contraction matching does not find any matches 1177 // for ae_^ (_=any combining diacritic below) *unless* there is also 1178 // a contraction mapping for ae. 1179 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 1180 // while fetching the newCEs for ae_^. 1181 // TODO: Try to detect this effectively. 1182 // (Alternatively, print a warning when prefix contractions are missing.) 1183 1184 // We do not need an explicit mapping for the NFD strings. 1185 // It is fine if the NFD input collates like this via a sequence of mappings. 1186 // It also saves a little bit of space, and may reduce the set of characters with contractions. 1187 uint32_t ce32 = addIfDifferent(nfdPrefix, newString, 1188 newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); 1189 if(ce32 != Collation::UNASSIGNED_CE32) { 1190 // was different, was added 1191 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode); 1192 } 1193 } 1194} 1195 1196UBool 1197CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, 1198 int32_t indexAfterLastStarter, 1199 UChar32 composite, const UnicodeString &decomp, 1200 UnicodeString &newNFDString, UnicodeString &newString, 1201 UErrorCode &errorCode) const { 1202 if(U_FAILURE(errorCode)) { return FALSE; } 1203 U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); 1204 int32_t lastStarterLength = decomp.moveIndex32(0, 1); 1205 if(lastStarterLength == decomp.length()) { 1206 // Singleton decompositions should be found by addWithClosure() 1207 // and the CanonicalIterator, so we can ignore them here. 1208 return FALSE; 1209 } 1210 if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, 1211 decomp, lastStarterLength, 0x7fffffff) == 0) { 1212 // same strings, nothing new to be found here 1213 return FALSE; 1214 } 1215 1216 // Make new FCD strings that combine a composite, or its decomposition, 1217 // into the nfdString's last starter and the combining marks following it. 1218 // Make an NFD version, and a version with the composite. 1219 newNFDString.setTo(nfdString, 0, indexAfterLastStarter); 1220 newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); 1221 1222 // The following is related to discontiguous contraction matching, 1223 // but builds only FCD strings (or else returns FALSE). 1224 int32_t sourceIndex = indexAfterLastStarter; 1225 int32_t decompIndex = lastStarterLength; 1226 // Small optimization: We keep the source character across loop iterations 1227 // because we do not always consume it, 1228 // and then need not fetch it again nor look up its combining class again. 1229 UChar32 sourceChar = U_SENTINEL; 1230 // The cc variables need to be declared before the loop so that at the end 1231 // they are set to the last combining classes seen. 1232 uint8_t sourceCC = 0; 1233 uint8_t decompCC = 0; 1234 for(;;) { 1235 if(sourceChar < 0) { 1236 if(sourceIndex >= nfdString.length()) { break; } 1237 sourceChar = nfdString.char32At(sourceIndex); 1238 sourceCC = nfd.getCombiningClass(sourceChar); 1239 U_ASSERT(sourceCC != 0); 1240 } 1241 // We consume a decomposition character in each iteration. 1242 if(decompIndex >= decomp.length()) { break; } 1243 UChar32 decompChar = decomp.char32At(decompIndex); 1244 decompCC = nfd.getCombiningClass(decompChar); 1245 // Compare the two characters and their combining classes. 1246 if(decompCC == 0) { 1247 // Unable to merge because the source contains a non-zero combining mark 1248 // but the composite's decomposition contains another starter. 1249 // The strings would not be equivalent. 1250 return FALSE; 1251 } else if(sourceCC < decompCC) { 1252 // Composite + sourceChar would not be FCD. 1253 return FALSE; 1254 } else if(decompCC < sourceCC) { 1255 newNFDString.append(decompChar); 1256 decompIndex += U16_LENGTH(decompChar); 1257 } else if(decompChar != sourceChar) { 1258 // Blocked because same combining class. 1259 return FALSE; 1260 } else { // match: decompChar == sourceChar 1261 newNFDString.append(decompChar); 1262 decompIndex += U16_LENGTH(decompChar); 1263 sourceIndex += U16_LENGTH(decompChar); 1264 sourceChar = U_SENTINEL; 1265 } 1266 } 1267 // We are at the end of at least one of the two inputs. 1268 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1269 if(sourceCC < decompCC) { 1270 // Appending the next source character to the composite would not be FCD. 1271 return FALSE; 1272 } 1273 newNFDString.append(nfdString, sourceIndex, 0x7fffffff); 1274 newString.append(nfdString, sourceIndex, 0x7fffffff); 1275 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1276 newNFDString.append(decomp, decompIndex, 0x7fffffff); 1277 } 1278 U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); 1279 U_ASSERT(fcd.isNormalized(newString, errorCode)); 1280 U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent 1281 return TRUE; 1282} 1283 1284UBool 1285CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { 1286 // Do not map non-FCD prefixes. 1287 return !isFCD(s, errorCode); 1288} 1289 1290UBool 1291CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { 1292 // Do not map non-FCD strings. 1293 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1294 return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); 1295} 1296 1297UBool 1298CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { 1299 return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); 1300} 1301 1302void 1303CollationBuilder::closeOverComposites(UErrorCode &errorCode) { 1304 UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final 1305 if(U_FAILURE(errorCode)) { return; } 1306 // Hangul is decomposed on the fly during collation. 1307 composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 1308 UnicodeString prefix; // empty 1309 UnicodeString nfdString; 1310 UnicodeSetIterator iter(composites); 1311 while(iter.next()) { 1312 U_ASSERT(!iter.isString()); 1313 nfd.getDecomposition(iter.getCodepoint(), nfdString); 1314 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 1315 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 1316 // Too many CEs from the decomposition (unusual), ignore this composite. 1317 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1318 // However, this can only really happen in contrived cases. 1319 continue; 1320 } 1321 const UnicodeString &composite(iter.getString()); 1322 addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode); 1323 } 1324} 1325 1326uint32_t 1327CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, 1328 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1329 UErrorCode &errorCode) { 1330 if(U_FAILURE(errorCode)) { return ce32; } 1331 int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; 1332 int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); 1333 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1334 if(ce32 == Collation::UNASSIGNED_CE32) { 1335 ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); 1336 } 1337 dataBuilder->addCE32(prefix, str, ce32, errorCode); 1338 } 1339 return ce32; 1340} 1341 1342UBool 1343CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, 1344 const int64_t ces2[], int32_t ces2Length) { 1345 if(ces1Length != ces2Length) { 1346 return FALSE; 1347 } 1348 U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); 1349 for(int32_t i = 0; i < ces1Length; ++i) { 1350 if(ces1[i] != ces2[i]) { return FALSE; } 1351 } 1352 return TRUE; 1353} 1354 1355#ifdef DEBUG_COLLATION_BUILDER 1356 1357uint32_t 1358alignWeightRight(uint32_t w) { 1359 if(w != 0) { 1360 while((w & 0xff) == 0) { w >>= 8; } 1361 } 1362 return w; 1363} 1364 1365#endif 1366 1367void 1368CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { 1369 if(U_FAILURE(errorCode)) { return; } 1370 1371 CollationWeights primaries, secondaries, tertiaries; 1372 int64_t *nodesArray = nodes.getBuffer(); 1373 1374 for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1375 int32_t i = rootPrimaryIndexes.elementAti(rpi); 1376 int64_t node = nodesArray[i]; 1377 uint32_t p = weight32FromNode(node); 1378 uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; 1379 uint32_t t = s; 1380 uint32_t q = 0; 1381 UBool pIsTailored = FALSE; 1382 UBool sIsTailored = FALSE; 1383 UBool tIsTailored = FALSE; 1384#ifdef DEBUG_COLLATION_BUILDER 1385 printf("\nprimary %lx\n", (long)alignWeightRight(p)); 1386#endif 1387 int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1388 int32_t nextIndex = nextIndexFromNode(node); 1389 while(nextIndex != 0) { 1390 i = nextIndex; 1391 node = nodesArray[i]; 1392 nextIndex = nextIndexFromNode(node); 1393 int32_t strength = strengthFromNode(node); 1394 if(strength == UCOL_QUATERNARY) { 1395 U_ASSERT(isTailoredNode(node)); 1396#ifdef DEBUG_COLLATION_BUILDER 1397 printf(" quat+ "); 1398#endif 1399 if(q == 3) { 1400 errorCode = U_BUFFER_OVERFLOW_ERROR; 1401 errorReason = "quaternary tailoring gap too small"; 1402 return; 1403 } 1404 ++q; 1405 } else { 1406 if(strength == UCOL_TERTIARY) { 1407 if(isTailoredNode(node)) { 1408#ifdef DEBUG_COLLATION_BUILDER 1409 printf(" ter+ "); 1410#endif 1411 if(!tIsTailored) { 1412 // First tailored tertiary node for [p, s]. 1413 int32_t tCount = countTailoredNodes(nodesArray, nextIndex, 1414 UCOL_TERTIARY) + 1; 1415 uint32_t tLimit; 1416 if(t == 0) { 1417 // Gap at the beginning of the tertiary CE range. 1418 t = rootElements.getTertiaryBoundary() - 0x100; 1419 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; 1420 } else if(t == BEFORE_WEIGHT16) { 1421 tLimit = Collation::COMMON_WEIGHT16; 1422 } else if(!pIsTailored && !sIsTailored) { 1423 // p and s are root weights. 1424 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1425 } else { 1426 // [p, s] is tailored. 1427 U_ASSERT(t == Collation::COMMON_WEIGHT16); 1428 tLimit = rootElements.getTertiaryBoundary(); 1429 } 1430 U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); 1431 tertiaries.initForTertiary(); 1432 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1433 errorCode = U_BUFFER_OVERFLOW_ERROR; 1434 errorReason = "tertiary tailoring gap too small"; 1435 return; 1436 } 1437 tIsTailored = TRUE; 1438 } 1439 t = tertiaries.nextWeight(); 1440 U_ASSERT(t != 0xffffffff); 1441 } else { 1442 t = weight16FromNode(node); 1443 tIsTailored = FALSE; 1444#ifdef DEBUG_COLLATION_BUILDER 1445 printf(" ter %lx\n", (long)alignWeightRight(t)); 1446#endif 1447 } 1448 } else { 1449 if(strength == UCOL_SECONDARY) { 1450 if(isTailoredNode(node)) { 1451#ifdef DEBUG_COLLATION_BUILDER 1452 printf(" sec+ "); 1453#endif 1454 if(!sIsTailored) { 1455 // First tailored secondary node for p. 1456 int32_t sCount = countTailoredNodes(nodesArray, nextIndex, 1457 UCOL_SECONDARY) + 1; 1458 uint32_t sLimit; 1459 if(s == 0) { 1460 // Gap at the beginning of the secondary CE range. 1461 s = rootElements.getSecondaryBoundary() - 0x100; 1462 sLimit = rootElements.getFirstSecondaryCE() >> 16; 1463 } else if(s == BEFORE_WEIGHT16) { 1464 sLimit = Collation::COMMON_WEIGHT16; 1465 } else if(!pIsTailored) { 1466 // p is a root primary. 1467 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1468 } else { 1469 // p is a tailored primary. 1470 U_ASSERT(s == Collation::COMMON_WEIGHT16); 1471 sLimit = rootElements.getSecondaryBoundary(); 1472 } 1473 if(s == Collation::COMMON_WEIGHT16) { 1474 // Do not tailor into the getSortKey() range of 1475 // compressed common secondaries. 1476 s = rootElements.getLastCommonSecondary(); 1477 } 1478 secondaries.initForSecondary(); 1479 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1480 errorCode = U_BUFFER_OVERFLOW_ERROR; 1481 errorReason = "secondary tailoring gap too small"; 1482 return; 1483 } 1484 sIsTailored = TRUE; 1485 } 1486 s = secondaries.nextWeight(); 1487 U_ASSERT(s != 0xffffffff); 1488 } else { 1489 s = weight16FromNode(node); 1490 sIsTailored = FALSE; 1491#ifdef DEBUG_COLLATION_BUILDER 1492 printf(" sec %lx\n", (long)alignWeightRight(s)); 1493#endif 1494 } 1495 } else /* UCOL_PRIMARY */ { 1496 U_ASSERT(isTailoredNode(node)); 1497#ifdef DEBUG_COLLATION_BUILDER 1498 printf("pri+ "); 1499#endif 1500 if(!pIsTailored) { 1501 // First tailored primary node in this list. 1502 int32_t pCount = countTailoredNodes(nodesArray, nextIndex, 1503 UCOL_PRIMARY) + 1; 1504 UBool isCompressible = baseData->isCompressiblePrimary(p); 1505 uint32_t pLimit = 1506 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1507 primaries.initForPrimary(isCompressible); 1508 if(!primaries.allocWeights(p, pLimit, pCount)) { 1509 errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? 1510 errorReason = "primary tailoring gap too small"; 1511 return; 1512 } 1513 pIsTailored = TRUE; 1514 } 1515 p = primaries.nextWeight(); 1516 U_ASSERT(p != 0xffffffff); 1517 s = Collation::COMMON_WEIGHT16; 1518 sIsTailored = FALSE; 1519 } 1520 t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; 1521 tIsTailored = FALSE; 1522 } 1523 q = 0; 1524 } 1525 if(isTailoredNode(node)) { 1526 nodesArray[i] = Collation::makeCE(p, s, t, q); 1527#ifdef DEBUG_COLLATION_BUILDER 1528 printf("%016llx\n", (long long)nodesArray[i]); 1529#endif 1530 } 1531 } 1532 } 1533} 1534 1535int32_t 1536CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { 1537 int32_t count = 0; 1538 for(;;) { 1539 if(i == 0) { break; } 1540 int64_t node = nodesArray[i]; 1541 if(strengthFromNode(node) < strength) { break; } 1542 if(strengthFromNode(node) == strength) { 1543 if(isTailoredNode(node)) { 1544 ++count; 1545 } else { 1546 break; 1547 } 1548 } 1549 i = nextIndexFromNode(node); 1550 } 1551 return count; 1552} 1553 1554class CEFinalizer : public CollationDataBuilder::CEModifier { 1555public: 1556 CEFinalizer(const int64_t *ces) : finalCEs(ces) {} 1557 virtual ~CEFinalizer(); 1558 virtual int64_t modifyCE32(uint32_t ce32) const { 1559 U_ASSERT(!Collation::isSpecialCE32(ce32)); 1560 if(CollationBuilder::isTempCE32(ce32)) { 1561 // retain case bits 1562 return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1563 } else { 1564 return Collation::NO_CE; 1565 } 1566 } 1567 virtual int64_t modifyCE(int64_t ce) const { 1568 if(CollationBuilder::isTempCE(ce)) { 1569 // retain case bits 1570 return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000); 1571 } else { 1572 return Collation::NO_CE; 1573 } 1574 } 1575 1576private: 1577 const int64_t *finalCEs; 1578}; 1579 1580CEFinalizer::~CEFinalizer() {} 1581 1582void 1583CollationBuilder::finalizeCEs(UErrorCode &errorCode) { 1584 if(U_FAILURE(errorCode)) { return; } 1585 LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode)); 1586 if(newBuilder.isNull()) { 1587 errorCode = U_MEMORY_ALLOCATION_ERROR; 1588 return; 1589 } 1590 newBuilder->initForTailoring(baseData, errorCode); 1591 CEFinalizer finalizer(nodes.getBuffer()); 1592 newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); 1593 if(U_FAILURE(errorCode)) { return; } 1594 delete dataBuilder; 1595 dataBuilder = newBuilder.orphan(); 1596} 1597 1598int32_t 1599CollationBuilder::ceStrength(int64_t ce) { 1600 return 1601 isTempCE(ce) ? strengthFromTempCE(ce) : 1602 (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : 1603 ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : 1604 ce != 0 ? UCOL_TERTIARY : 1605 UCOL_IDENTICAL; 1606} 1607 1608U_NAMESPACE_END 1609 1610U_NAMESPACE_USE 1611 1612U_CAPI UCollator * U_EXPORT2 1613ucol_openRules(const UChar *rules, int32_t rulesLength, 1614 UColAttributeValue normalizationMode, UCollationStrength strength, 1615 UParseError *parseError, UErrorCode *pErrorCode) { 1616 if(U_FAILURE(*pErrorCode)) { return NULL; } 1617 if(rules == NULL && rulesLength != 0) { 1618 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1619 return NULL; 1620 } 1621 RuleBasedCollator *coll = new RuleBasedCollator(); 1622 if(coll == NULL) { 1623 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 1624 return NULL; 1625 } 1626 UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); 1627 coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode); 1628 if(U_FAILURE(*pErrorCode)) { 1629 delete coll; 1630 return NULL; 1631 } 1632 return coll->toUCollator(); 1633} 1634 1635static const int32_t internalBufferSize = 512; 1636 1637// The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp 1638// because it calls UnicodeSet "builder" code that depends on all Unicode properties, 1639// and the rest of the collation "runtime" code only depends on normalization. 1640// This function is not related to the collation builder, 1641// but it did not seem worth moving it into its own .cpp file, 1642// nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. 1643U_CAPI int32_t U_EXPORT2 1644ucol_getUnsafeSet( const UCollator *coll, 1645 USet *unsafe, 1646 UErrorCode *status) 1647{ 1648 UChar buffer[internalBufferSize]; 1649 int32_t len = 0; 1650 1651 uset_clear(unsafe); 1652 1653 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1654 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1655 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1656 1657 // add chars that fail the fcd check 1658 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1659 1660 // add lead/trail surrogates 1661 // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, 1662 // not when testing code *points*) 1663 uset_addRange(unsafe, 0xd800, 0xdfff); 1664 1665 USet *contractions = uset_open(0,0); 1666 1667 int32_t i = 0, j = 0; 1668 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1669 int32_t contsSize = uset_size(contractions); 1670 UChar32 c = 0; 1671 // Contraction set consists only of strings 1672 // to get unsafe code points, we need to 1673 // break the strings apart and add them to the unsafe set 1674 for(i = 0; i < contsSize; i++) { 1675 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1676 if(len > 0) { 1677 j = 0; 1678 while(j < len) { 1679 U16_NEXT(buffer, j, len, c); 1680 if(j < len) { 1681 uset_add(unsafe, c); 1682 } 1683 } 1684 } 1685 } 1686 1687 uset_close(contractions); 1688 1689 return uset_size(unsafe); 1690} 1691 1692#endif // !UCONFIG_NO_COLLATION 1693