1/* 2****************************************************************************** 3* Copyright (C) 1997-2014, International Business Machines 4* Corporation and others. All Rights Reserved. 5****************************************************************************** 6* file name: nfrule.cpp 7* encoding: US-ASCII 8* tab size: 8 (not used) 9* indentation:4 10* 11* Modification history 12* Date Name Comments 13* 10/11/2001 Doug Ported from ICU4J 14*/ 15 16#include "nfrule.h" 17 18#if U_HAVE_RBNF 19 20#include "unicode/localpointer.h" 21#include "unicode/rbnf.h" 22#include "unicode/tblcoll.h" 23#include "unicode/coleitr.h" 24#include "unicode/uchar.h" 25#include "nfrs.h" 26#include "nfrlist.h" 27#include "nfsubs.h" 28#include "patternprops.h" 29 30U_NAMESPACE_BEGIN 31 32NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) 33 : baseValue((int32_t)0) 34 , radix(0) 35 , exponent(0) 36 , ruleText() 37 , sub1(NULL) 38 , sub2(NULL) 39 , formatter(_rbnf) 40{ 41} 42 43NFRule::~NFRule() 44{ 45 delete sub1; 46 delete sub2; 47} 48 49static const UChar gLeftBracket = 0x005b; 50static const UChar gRightBracket = 0x005d; 51static const UChar gColon = 0x003a; 52static const UChar gZero = 0x0030; 53static const UChar gNine = 0x0039; 54static const UChar gSpace = 0x0020; 55static const UChar gSlash = 0x002f; 56static const UChar gGreaterThan = 0x003e; 57static const UChar gLessThan = 0x003c; 58static const UChar gComma = 0x002c; 59static const UChar gDot = 0x002e; 60static const UChar gTick = 0x0027; 61//static const UChar gMinus = 0x002d; 62static const UChar gSemicolon = 0x003b; 63 64static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ 65static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ 66static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ 67static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ 68 69static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ 70static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ 71static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ 72static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ 73static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ 74static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ 75static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ 76static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ 77static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ 78static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ 79static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ 80static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ 81 82static const UChar * const tokenStrings[] = { 83 gLessLess, gLessPercent, gLessHash, gLessZero, 84 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, 85 gEqualPercent, gEqualHash, gEqualZero, NULL 86}; 87 88void 89NFRule::makeRules(UnicodeString& description, 90 const NFRuleSet *ruleSet, 91 const NFRule *predecessor, 92 const RuleBasedNumberFormat *rbnf, 93 NFRuleList& rules, 94 UErrorCode& status) 95{ 96 // we know we're making at least one rule, so go ahead and 97 // new it up and initialize its basevalue and divisor 98 // (this also strips the rule descriptor, if any, off the 99 // descripton string) 100 NFRule* rule1 = new NFRule(rbnf); 101 /* test for NULL */ 102 if (rule1 == 0) { 103 status = U_MEMORY_ALLOCATION_ERROR; 104 return; 105 } 106 rule1->parseRuleDescriptor(description, status); 107 108 // check the description to see whether there's text enclosed 109 // in brackets 110 int32_t brack1 = description.indexOf(gLeftBracket); 111 int32_t brack2 = description.indexOf(gRightBracket); 112 113 // if the description doesn't contain a matched pair of brackets, 114 // or if it's of a type that doesn't recognize bracketed text, 115 // then leave the description alone, initialize the rule's 116 // rule text and substitutions, and return that rule 117 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 118 || rule1->getType() == kProperFractionRule 119 || rule1->getType() == kNegativeNumberRule) { 120 rule1->ruleText = description; 121 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); 122 rules.add(rule1); 123 } else { 124 // if the description does contain a matched pair of brackets, 125 // then it's really shorthand for two rules (with one exception) 126 NFRule* rule2 = NULL; 127 UnicodeString sbuf; 128 129 // we'll actually only split the rule into two rules if its 130 // base value is an even multiple of its divisor (or it's one 131 // of the special rules) 132 if ((rule1->baseValue > 0 133 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) 134 || rule1->getType() == kImproperFractionRule 135 || rule1->getType() == kMasterRule) { 136 137 // if it passes that test, new up the second rule. If the 138 // rule set both rules will belong to is a fraction rule 139 // set, they both have the same base value; otherwise, 140 // increment the original rule's base value ("rule1" actually 141 // goes SECOND in the rule set's rule list) 142 rule2 = new NFRule(rbnf); 143 /* test for NULL */ 144 if (rule2 == 0) { 145 status = U_MEMORY_ALLOCATION_ERROR; 146 return; 147 } 148 if (rule1->baseValue >= 0) { 149 rule2->baseValue = rule1->baseValue; 150 if (!ruleSet->isFractionRuleSet()) { 151 ++rule1->baseValue; 152 } 153 } 154 155 // if the description began with "x.x" and contains bracketed 156 // text, it describes both the improper fraction rule and 157 // the proper fraction rule 158 else if (rule1->getType() == kImproperFractionRule) { 159 rule2->setType(kProperFractionRule); 160 } 161 162 // if the description began with "x.0" and contains bracketed 163 // text, it describes both the master rule and the 164 // improper fraction rule 165 else if (rule1->getType() == kMasterRule) { 166 rule2->baseValue = rule1->baseValue; 167 rule1->setType(kImproperFractionRule); 168 } 169 170 // both rules have the same radix and exponent (i.e., the 171 // same divisor) 172 rule2->radix = rule1->radix; 173 rule2->exponent = rule1->exponent; 174 175 // rule2's rule text omits the stuff in brackets: initalize 176 // its rule text and substitutions accordingly 177 sbuf.append(description, 0, brack1); 178 if (brack2 + 1 < description.length()) { 179 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 180 } 181 rule2->ruleText.setTo(sbuf); 182 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); 183 } 184 185 // rule1's text includes the text in the brackets but omits 186 // the brackets themselves: initialize _its_ rule text and 187 // substitutions accordingly 188 sbuf.setTo(description, 0, brack1); 189 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); 190 if (brack2 + 1 < description.length()) { 191 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); 192 } 193 rule1->ruleText.setTo(sbuf); 194 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); 195 196 // if we only have one rule, return it; if we have two, return 197 // a two-element array containing them (notice that rule2 goes 198 // BEFORE rule1 in the list: in all cases, rule2 OMITS the 199 // material in the brackets and rule1 INCLUDES the material 200 // in the brackets) 201 if (rule2 != NULL) { 202 rules.add(rule2); 203 } 204 rules.add(rule1); 205 } 206} 207 208/** 209 * This function parses the rule's rule descriptor (i.e., the base 210 * value and/or other tokens that precede the rule's rule text 211 * in the description) and sets the rule's base value, radix, and 212 * exponent according to the descriptor. (If the description doesn't 213 * include a rule descriptor, then this function sets everything to 214 * default values and the rule set sets the rule's real base value). 215 * @param description The rule's description 216 * @return If "description" included a rule descriptor, this is 217 * "description" with the descriptor and any trailing whitespace 218 * stripped off. Otherwise; it's "descriptor" unchangd. 219 */ 220void 221NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) 222{ 223 // the description consists of a rule descriptor and a rule body, 224 // separated by a colon. The rule descriptor is optional. If 225 // it's omitted, just set the base value to 0. 226 int32_t p = description.indexOf(gColon); 227 if (p == -1) { 228 setBaseValue((int32_t)0, status); 229 } else { 230 // copy the descriptor out into its own string and strip it, 231 // along with any trailing whitespace, out of the original 232 // description 233 UnicodeString descriptor; 234 descriptor.setTo(description, 0, p); 235 236 ++p; 237 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { 238 ++p; 239 } 240 description.removeBetween(0, p); 241 242 // check first to see if the rule descriptor matches the token 243 // for one of the special rules. If it does, set the base 244 // value to the correct identfier value 245 if (0 == descriptor.compare(gMinusX, 2)) { 246 setType(kNegativeNumberRule); 247 } 248 else if (0 == descriptor.compare(gXDotX, 3)) { 249 setType(kImproperFractionRule); 250 } 251 else if (0 == descriptor.compare(gZeroDotX, 3)) { 252 setType(kProperFractionRule); 253 } 254 else if (0 == descriptor.compare(gXDotZero, 3)) { 255 setType(kMasterRule); 256 } 257 258 // if the rule descriptor begins with a digit, it's a descriptor 259 // for a normal rule 260 // since we don't have Long.parseLong, and this isn't much work anyway, 261 // just build up the value as we encounter the digits. 262 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { 263 int64_t val = 0; 264 p = 0; 265 UChar c = gSpace; 266 267 // begin parsing the descriptor: copy digits 268 // into "tempValue", skip periods, commas, and spaces, 269 // stop on a slash or > sign (or at the end of the string), 270 // and throw an exception on any other character 271 int64_t ll_10 = 10; 272 while (p < descriptor.length()) { 273 c = descriptor.charAt(p); 274 if (c >= gZero && c <= gNine) { 275 val = val * ll_10 + (int32_t)(c - gZero); 276 } 277 else if (c == gSlash || c == gGreaterThan) { 278 break; 279 } 280 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 281 } 282 else { 283 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 284 status = U_PARSE_ERROR; 285 return; 286 } 287 ++p; 288 } 289 290 // we have the base value, so set it 291 setBaseValue(val, status); 292 293 // if we stopped the previous loop on a slash, we're 294 // now parsing the rule's radix. Again, accumulate digits 295 // in tempValue, skip punctuation, stop on a > mark, and 296 // throw an exception on anything else 297 if (c == gSlash) { 298 val = 0; 299 ++p; 300 int64_t ll_10 = 10; 301 while (p < descriptor.length()) { 302 c = descriptor.charAt(p); 303 if (c >= gZero && c <= gNine) { 304 val = val * ll_10 + (int32_t)(c - gZero); 305 } 306 else if (c == gGreaterThan) { 307 break; 308 } 309 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { 310 } 311 else { 312 // throw new IllegalArgumentException("Illegal character is rule descriptor"); 313 status = U_PARSE_ERROR; 314 return; 315 } 316 ++p; 317 } 318 319 // tempValue now contain's the rule's radix. Set it 320 // accordingly, and recalculate the rule's exponent 321 radix = (int32_t)val; 322 if (radix == 0) { 323 // throw new IllegalArgumentException("Rule can't have radix of 0"); 324 status = U_PARSE_ERROR; 325 } 326 327 exponent = expectedExponent(); 328 } 329 330 // if we stopped the previous loop on a > sign, then continue 331 // for as long as we still see > signs. For each one, 332 // decrement the exponent (unless the exponent is already 0). 333 // If we see another character before reaching the end of 334 // the descriptor, that's also a syntax error. 335 if (c == gGreaterThan) { 336 while (p < descriptor.length()) { 337 c = descriptor.charAt(p); 338 if (c == gGreaterThan && exponent > 0) { 339 --exponent; 340 } else { 341 // throw new IllegalArgumentException("Illegal character in rule descriptor"); 342 status = U_PARSE_ERROR; 343 return; 344 } 345 ++p; 346 } 347 } 348 } 349 } 350 351 // finally, if the rule body begins with an apostrophe, strip it off 352 // (this is generally used to put whitespace at the beginning of 353 // a rule's rule text) 354 if (description.length() > 0 && description.charAt(0) == gTick) { 355 description.removeBetween(0, 1); 356 } 357 358 // return the description with all the stuff we've just waded through 359 // stripped off the front. It now contains just the rule body. 360 // return description; 361} 362 363/** 364* Searches the rule's rule text for the substitution tokens, 365* creates the substitutions, and removes the substitution tokens 366* from the rule's rule text. 367* @param owner The rule set containing this rule 368* @param predecessor The rule preseding this one in "owners" rule list 369* @param ownersOwner The RuleBasedFormat that owns this rule 370*/ 371void 372NFRule::extractSubstitutions(const NFRuleSet* ruleSet, 373 const NFRule* predecessor, 374 const RuleBasedNumberFormat* rbnf, 375 UErrorCode& status) 376{ 377 if (U_SUCCESS(status)) { 378 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); 379 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); 380 } 381} 382 383/** 384* Searches the rule's rule text for the first substitution token, 385* creates a substitution based on it, and removes the token from 386* the rule's rule text. 387* @param owner The rule set containing this rule 388* @param predecessor The rule preceding this one in the rule set's 389* rule list 390* @param ownersOwner The RuleBasedNumberFormat that owns this rule 391* @return The newly-created substitution. This is never null; if 392* the rule text doesn't contain any substitution tokens, this will 393* be a NullSubstitution. 394*/ 395NFSubstitution * 396NFRule::extractSubstitution(const NFRuleSet* ruleSet, 397 const NFRule* predecessor, 398 const RuleBasedNumberFormat* rbnf, 399 UErrorCode& status) 400{ 401 NFSubstitution* result = NULL; 402 403 // search the rule's rule text for the first two characters of 404 // a substitution token 405 int32_t subStart = indexOfAny(tokenStrings); 406 int32_t subEnd = subStart; 407 408 // if we didn't find one, create a null substitution positioned 409 // at the end of the rule text 410 if (subStart == -1) { 411 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 412 ruleSet, rbnf, UnicodeString(), status); 413 } 414 415 // special-case the ">>>" token, since searching for the > at the 416 // end will actually find the > in the middle 417 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { 418 subEnd = subStart + 2; 419 420 // otherwise the substitution token ends with the same character 421 // it began with 422 } else { 423 UChar c = ruleText.charAt(subStart); 424 subEnd = ruleText.indexOf(c, subStart + 1); 425 // special case for '<%foo<<' 426 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { 427 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle 428 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack 429 // to get around this. Having the duplicate at the front would cause problems with 430 // rules like "<<%" to format, say, percents... 431 ++subEnd; 432 } 433 } 434 435 // if we don't find the end of the token (i.e., if we're on a single, 436 // unmatched token character), create a null substitution positioned 437 // at the end of the rule 438 if (subEnd == -1) { 439 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, 440 ruleSet, rbnf, UnicodeString(), status); 441 } 442 443 // if we get here, we have a real substitution token (or at least 444 // some text bounded by substitution token characters). Use 445 // makeSubstitution() to create the right kind of substitution 446 UnicodeString subToken; 447 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); 448 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, 449 rbnf, subToken, status); 450 451 // remove the substitution from the rule text 452 ruleText.removeBetween(subStart, subEnd+1); 453 454 return result; 455} 456 457/** 458 * Sets the rule's base value, and causes the radix and exponent 459 * to be recalculated. This is used during construction when we 460 * don't know the rule's base value until after it's been 461 * constructed. It should be used at any other time. 462 * @param The new base value for the rule. 463 */ 464void 465NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) 466{ 467 // set the base value 468 baseValue = newBaseValue; 469 470 // if this isn't a special rule, recalculate the radix and exponent 471 // (the radix always defaults to 10; if it's supposed to be something 472 // else, it's cleaned up by the caller and the exponent is 473 // recalculated again-- the only function that does this is 474 // NFRule.parseRuleDescriptor() ) 475 if (baseValue >= 1) { 476 radix = 10; 477 exponent = expectedExponent(); 478 479 // this function gets called on a fully-constructed rule whose 480 // description didn't specify a base value. This means it 481 // has substitutions, and some substitutions hold on to copies 482 // of the rule's divisor. Fix their copies of the divisor. 483 if (sub1 != NULL) { 484 sub1->setDivisor(radix, exponent, status); 485 } 486 if (sub2 != NULL) { 487 sub2->setDivisor(radix, exponent, status); 488 } 489 490 // if this is a special rule, its radix and exponent are basically 491 // ignored. Set them to "safe" default values 492 } else { 493 radix = 10; 494 exponent = 0; 495 } 496} 497 498/** 499* This calculates the rule's exponent based on its radix and base 500* value. This will be the highest power the radix can be raised to 501* and still produce a result less than or equal to the base value. 502*/ 503int16_t 504NFRule::expectedExponent() const 505{ 506 // since the log of 0, or the log base 0 of something, causes an 507 // error, declare the exponent in these cases to be 0 (we also 508 // deal with the special-rule identifiers here) 509 if (radix == 0 || baseValue < 1) { 510 return 0; 511 } 512 513 // we get rounding error in some cases-- for example, log 1000 / log 10 514 // gives us 1.9999999996 instead of 2. The extra logic here is to take 515 // that into account 516 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); 517 int64_t temp = util64_pow(radix, tempResult + 1); 518 if (temp <= baseValue) { 519 tempResult += 1; 520 } 521 return tempResult; 522} 523 524/** 525 * Searches the rule's rule text for any of the specified strings. 526 * @param strings An array of strings to search the rule's rule 527 * text for 528 * @return The index of the first match in the rule's rule text 529 * (i.e., the first substring in the rule's rule text that matches 530 * _any_ of the strings in "strings"). If none of the strings in 531 * "strings" is found in the rule's rule text, returns -1. 532 */ 533int32_t 534NFRule::indexOfAny(const UChar* const strings[]) const 535{ 536 int result = -1; 537 for (int i = 0; strings[i]; i++) { 538 int32_t pos = ruleText.indexOf(*strings[i]); 539 if (pos != -1 && (result == -1 || pos < result)) { 540 result = pos; 541 } 542 } 543 return result; 544} 545 546//----------------------------------------------------------------------- 547// boilerplate 548//----------------------------------------------------------------------- 549 550/** 551* Tests two rules for equality. 552* @param that The rule to compare this one against 553* @return True is the two rules are functionally equivalent 554*/ 555UBool 556NFRule::operator==(const NFRule& rhs) const 557{ 558 return baseValue == rhs.baseValue 559 && radix == rhs.radix 560 && exponent == rhs.exponent 561 && ruleText == rhs.ruleText 562 && *sub1 == *rhs.sub1 563 && *sub2 == *rhs.sub2; 564} 565 566/** 567* Returns a textual representation of the rule. This won't 568* necessarily be the same as the description that this rule 569* was created with, but it will produce the same result. 570* @return A textual description of the rule 571*/ 572static void util_append64(UnicodeString& result, int64_t n) 573{ 574 UChar buffer[256]; 575 int32_t len = util64_tou(n, buffer, sizeof(buffer)); 576 UnicodeString temp(buffer, len); 577 result.append(temp); 578} 579 580void 581NFRule::_appendRuleText(UnicodeString& result) const 582{ 583 switch (getType()) { 584 case kNegativeNumberRule: result.append(gMinusX, 2); break; 585 case kImproperFractionRule: result.append(gXDotX, 3); break; 586 case kProperFractionRule: result.append(gZeroDotX, 3); break; 587 case kMasterRule: result.append(gXDotZero, 3); break; 588 default: 589 // for a normal rule, write out its base value, and if the radix is 590 // something other than 10, write out the radix (with the preceding 591 // slash, of course). Then calculate the expected exponent and if 592 // if isn't the same as the actual exponent, write an appropriate 593 // number of > signs. Finally, terminate the whole thing with 594 // a colon. 595 util_append64(result, baseValue); 596 if (radix != 10) { 597 result.append(gSlash); 598 util_append64(result, radix); 599 } 600 int numCarets = expectedExponent() - exponent; 601 for (int i = 0; i < numCarets; i++) { 602 result.append(gGreaterThan); 603 } 604 break; 605 } 606 result.append(gColon); 607 result.append(gSpace); 608 609 // if the rule text begins with a space, write an apostrophe 610 // (whitespace after the rule descriptor is ignored; the 611 // apostrophe is used to make the whitespace significant) 612 if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) { 613 result.append(gTick); 614 } 615 616 // now, write the rule's rule text, inserting appropriate 617 // substitution tokens in the appropriate places 618 UnicodeString ruleTextCopy; 619 ruleTextCopy.setTo(ruleText); 620 621 UnicodeString temp; 622 sub2->toString(temp); 623 ruleTextCopy.insert(sub2->getPos(), temp); 624 sub1->toString(temp); 625 ruleTextCopy.insert(sub1->getPos(), temp); 626 627 result.append(ruleTextCopy); 628 629 // and finally, top the whole thing off with a semicolon and 630 // return the result 631 result.append(gSemicolon); 632} 633 634//----------------------------------------------------------------------- 635// formatting 636//----------------------------------------------------------------------- 637 638/** 639* Formats the number, and inserts the resulting text into 640* toInsertInto. 641* @param number The number being formatted 642* @param toInsertInto The string where the resultant text should 643* be inserted 644* @param pos The position in toInsertInto where the resultant text 645* should be inserted 646*/ 647void 648NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const 649{ 650 // first, insert the rule's rule text into toInsertInto at the 651 // specified position, then insert the results of the substitutions 652 // into the right places in toInsertInto (notice we do the 653 // substitutions in reverse order so that the offsets don't get 654 // messed up) 655 toInsertInto.insert(pos, ruleText); 656 sub2->doSubstitution(number, toInsertInto, pos); 657 sub1->doSubstitution(number, toInsertInto, pos); 658} 659 660/** 661* Formats the number, and inserts the resulting text into 662* toInsertInto. 663* @param number The number being formatted 664* @param toInsertInto The string where the resultant text should 665* be inserted 666* @param pos The position in toInsertInto where the resultant text 667* should be inserted 668*/ 669void 670NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const 671{ 672 // first, insert the rule's rule text into toInsertInto at the 673 // specified position, then insert the results of the substitutions 674 // into the right places in toInsertInto 675 // [again, we have two copies of this routine that do the same thing 676 // so that we don't sacrifice precision in a long by casting it 677 // to a double] 678 toInsertInto.insert(pos, ruleText); 679 sub2->doSubstitution(number, toInsertInto, pos); 680 sub1->doSubstitution(number, toInsertInto, pos); 681} 682 683/** 684* Used by the owning rule set to determine whether to invoke the 685* rollback rule (i.e., whether this rule or the one that precedes 686* it in the rule set's list should be used to format the number) 687* @param The number being formatted 688* @return True if the rule set should use the rule that precedes 689* this one in its list; false if it should use this rule 690*/ 691UBool 692NFRule::shouldRollBack(double number) const 693{ 694 // we roll back if the rule contains a modulus substitution, 695 // the number being formatted is an even multiple of the rule's 696 // divisor, and the rule's base value is NOT an even multiple 697 // of its divisor 698 // In other words, if the original description had 699 // 100: << hundred[ >>]; 700 // that expands into 701 // 100: << hundred; 702 // 101: << hundred >>; 703 // internally. But when we're formatting 200, if we use the rule 704 // at 101, which would normally apply, we get "two hundred zero". 705 // To prevent this, we roll back and use the rule at 100 instead. 706 // This is the logic that makes this happen: the rule at 101 has 707 // a modulus substitution, its base value isn't an even multiple 708 // of 100, and the value we're trying to format _is_ an even 709 // multiple of 100. This is called the "rollback rule." 710 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { 711 int64_t re = util64_pow(radix, exponent); 712 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; 713 } 714 return FALSE; 715} 716 717//----------------------------------------------------------------------- 718// parsing 719//----------------------------------------------------------------------- 720 721/** 722* Attempts to parse the string with this rule. 723* @param text The string being parsed 724* @param parsePosition On entry, the value is ignored and assumed to 725* be 0. On exit, this has been updated with the position of the first 726* character not consumed by matching the text against this rule 727* (if this rule doesn't match the text at all, the parse position 728* if left unchanged (presumably at 0) and the function returns 729* new Long(0)). 730* @param isFractionRule True if this rule is contained within a 731* fraction rule set. This is only used if the rule has no 732* substitutions. 733* @return If this rule matched the text, this is the rule's base value 734* combined appropriately with the results of parsing the substitutions. 735* If nothing matched, this is new Long(0) and the parse position is 736* left unchanged. The result will be an instance of Long if the 737* result is an integer and Double otherwise. The result is never null. 738*/ 739#ifdef RBNF_DEBUG 740#include <stdio.h> 741 742static void dumpUS(FILE* f, const UnicodeString& us) { 743 int len = us.length(); 744 char* buf = (char *)uprv_malloc((len+1)*sizeof(char)); //new char[len+1]; 745 if (buf != NULL) { 746 us.extract(0, len, buf); 747 buf[len] = 0; 748 fprintf(f, "%s", buf); 749 uprv_free(buf); //delete[] buf; 750 } 751} 752#endif 753 754UBool 755NFRule::doParse(const UnicodeString& text, 756 ParsePosition& parsePosition, 757 UBool isFractionRule, 758 double upperBound, 759 Formattable& resVal, 760 UBool isDecimFmtParseable) const 761{ 762 // internally we operate on a copy of the string being parsed 763 // (because we're going to change it) and use our own ParsePosition 764 ParsePosition pp; 765 UnicodeString workText(text); 766 767 // check to see whether the text before the first substitution 768 // matches the text at the beginning of the string being 769 // parsed. If it does, strip that off the front of workText; 770 // otherwise, dump out with a mismatch 771 UnicodeString prefix; 772 prefix.setTo(ruleText, 0, sub1->getPos()); 773 774#ifdef RBNF_DEBUG 775 fprintf(stderr, "doParse %x ", this); 776 { 777 UnicodeString rt; 778 _appendRuleText(rt); 779 dumpUS(stderr, rt); 780 } 781 782 fprintf(stderr, " text: '", this); 783 dumpUS(stderr, text); 784 fprintf(stderr, "' prefix: '"); 785 dumpUS(stderr, prefix); 786#endif 787 stripPrefix(workText, prefix, pp); 788 int32_t prefixLength = text.length() - workText.length(); 789 790#ifdef RBNF_DEBUG 791 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); 792#endif 793 794 if (pp.getIndex() == 0 && sub1->getPos() != 0) { 795 // commented out because ParsePosition doesn't have error index in 1.1.x 796 // restored for ICU4C port 797 parsePosition.setErrorIndex(pp.getErrorIndex()); 798 resVal.setLong(0); 799 return TRUE; 800 } 801 802 // Detect when this rule's main job is to parse a decimal format and we're not 803 // supposed to. 804 if (!isDecimFmtParseable) { 805 // The following tries to detect a rule like "x.x: =#,##0.#=;" 806 if ( sub1->isDecimalFormatSubstitutionOnly() && sub2->isRuleSetSubstitutionOnly() ) { 807 parsePosition.setErrorIndex(pp.getErrorIndex()); 808 resVal.setLong(0); 809 return TRUE; 810 } 811 } 812 813 // this is the fun part. The basic guts of the rule-matching 814 // logic is matchToDelimiter(), which is called twice. The first 815 // time it searches the input string for the rule text BETWEEN 816 // the substitutions and tries to match the intervening text 817 // in the input string with the first substitution. If that 818 // succeeds, it then calls it again, this time to look for the 819 // rule text after the second substitution and to match the 820 // intervening input text against the second substitution. 821 // 822 // For example, say we have a rule that looks like this: 823 // first << middle >> last; 824 // and input text that looks like this: 825 // first one middle two last 826 // First we use stripPrefix() to match "first " in both places and 827 // strip it off the front, leaving 828 // one middle two last 829 // Then we use matchToDelimiter() to match " middle " and try to 830 // match "one" against a substitution. If it's successful, we now 831 // have 832 // two last 833 // We use matchToDelimiter() a second time to match " last" and 834 // try to match "two" against a substitution. If "two" matches 835 // the substitution, we have a successful parse. 836 // 837 // Since it's possible in many cases to find multiple instances 838 // of each of these pieces of rule text in the input string, 839 // we need to try all the possible combinations of these 840 // locations. This prevents us from prematurely declaring a mismatch, 841 // and makes sure we match as much input text as we can. 842 int highWaterMark = 0; 843 double result = 0; 844 int start = 0; 845 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); 846 847 UnicodeString temp; 848 do { 849 // our partial parse result starts out as this rule's base 850 // value. If it finds a successful match, matchToDelimiter() 851 // will compose this in some way with what it gets back from 852 // the substitution, giving us a new partial parse result 853 pp.setIndex(0); 854 855 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); 856 double partialResult = matchToDelimiter(workText, start, tempBaseValue, 857 temp, pp, sub1, 858 upperBound); 859 860 // if we got a successful match (or were trying to match a 861 // null substitution), pp is now pointing at the first unmatched 862 // character. Take note of that, and try matchToDelimiter() 863 // on the input text again 864 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { 865 start = pp.getIndex(); 866 867 UnicodeString workText2; 868 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); 869 ParsePosition pp2; 870 871 // the second matchToDelimiter() will compose our previous 872 // partial result with whatever it gets back from its 873 // substitution if there's a successful match, giving us 874 // a real result 875 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); 876 partialResult = matchToDelimiter(workText2, 0, partialResult, 877 temp, pp2, sub2, 878 upperBound); 879 880 // if we got a successful match on this second 881 // matchToDelimiter() call, update the high-water mark 882 // and result (if necessary) 883 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { 884 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { 885 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); 886 result = partialResult; 887 } 888 } 889 // commented out because ParsePosition doesn't have error index in 1.1.x 890 // restored for ICU4C port 891 else { 892 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); 893 if (temp> parsePosition.getErrorIndex()) { 894 parsePosition.setErrorIndex(temp); 895 } 896 } 897 } 898 // commented out because ParsePosition doesn't have error index in 1.1.x 899 // restored for ICU4C port 900 else { 901 int32_t temp = sub1->getPos() + pp.getErrorIndex(); 902 if (temp > parsePosition.getErrorIndex()) { 903 parsePosition.setErrorIndex(temp); 904 } 905 } 906 // keep trying to match things until the outer matchToDelimiter() 907 // call fails to make a match (each time, it picks up where it 908 // left off the previous time) 909 } while (sub1->getPos() != sub2->getPos() 910 && pp.getIndex() > 0 911 && pp.getIndex() < workText.length() 912 && pp.getIndex() != start); 913 914 // update the caller's ParsePosition with our high-water mark 915 // (i.e., it now points at the first character this function 916 // didn't match-- the ParsePosition is therefore unchanged if 917 // we didn't match anything) 918 parsePosition.setIndex(highWaterMark); 919 // commented out because ParsePosition doesn't have error index in 1.1.x 920 // restored for ICU4C port 921 if (highWaterMark > 0) { 922 parsePosition.setErrorIndex(0); 923 } 924 925 // this is a hack for one unusual condition: Normally, whether this 926 // rule belong to a fraction rule set or not is handled by its 927 // substitutions. But if that rule HAS NO substitutions, then 928 // we have to account for it here. By definition, if the matching 929 // rule in a fraction rule set has no substitutions, its numerator 930 // is 1, and so the result is the reciprocal of its base value. 931 if (isFractionRule && 932 highWaterMark > 0 && 933 sub1->isNullSubstitution()) { 934 result = 1 / result; 935 } 936 937 resVal.setDouble(result); 938 return TRUE; // ??? do we need to worry if it is a long or a double? 939} 940 941/** 942* This function is used by parse() to match the text being parsed 943* against a possible prefix string. This function 944* matches characters from the beginning of the string being parsed 945* to characters from the prospective prefix. If they match, pp is 946* updated to the first character not matched, and the result is 947* the unparsed part of the string. If they don't match, the whole 948* string is returned, and pp is left unchanged. 949* @param text The string being parsed 950* @param prefix The text to match against 951* @param pp On entry, ignored and assumed to be 0. On exit, points 952* to the first unmatched character (assuming the whole prefix matched), 953* or is unchanged (if the whole prefix didn't match). 954* @return If things match, this is the unparsed part of "text"; 955* if they didn't match, this is "text". 956*/ 957void 958NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const 959{ 960 // if the prefix text is empty, dump out without doing anything 961 if (prefix.length() != 0) { 962 UErrorCode status = U_ZERO_ERROR; 963 // use prefixLength() to match the beginning of 964 // "text" against "prefix". This function returns the 965 // number of characters from "text" that matched (or 0 if 966 // we didn't match the whole prefix) 967 int32_t pfl = prefixLength(text, prefix, status); 968 if (U_FAILURE(status)) { // Memory allocation error. 969 return; 970 } 971 if (pfl != 0) { 972 // if we got a successful match, update the parse position 973 // and strip the prefix off of "text" 974 pp.setIndex(pp.getIndex() + pfl); 975 text.remove(0, pfl); 976 } 977 } 978} 979 980/** 981* Used by parse() to match a substitution and any following text. 982* "text" is searched for instances of "delimiter". For each instance 983* of delimiter, the intervening text is tested to see whether it 984* matches the substitution. The longest match wins. 985* @param text The string being parsed 986* @param startPos The position in "text" where we should start looking 987* for "delimiter". 988* @param baseValue A partial parse result (often the rule's base value), 989* which is combined with the result from matching the substitution 990* @param delimiter The string to search "text" for. 991* @param pp Ignored and presumed to be 0 on entry. If there's a match, 992* on exit this will point to the first unmatched character. 993* @param sub If we find "delimiter" in "text", this substitution is used 994* to match the text between the beginning of the string and the 995* position of "delimiter." (If "delimiter" is the empty string, then 996* this function just matches against this substitution and updates 997* everything accordingly.) 998* @param upperBound When matching the substitution, it will only 999* consider rules with base values lower than this value. 1000* @return If there's a match, this is the result of composing 1001* baseValue with the result of matching the substitution. Otherwise, 1002* this is new Long(0). It's never null. If the result is an integer, 1003* this will be an instance of Long; otherwise, it's an instance of 1004* Double. 1005* 1006* !!! note {dlf} in point of fact, in the java code the caller always converts 1007* the result to a double, so we might as well return one. 1008*/ 1009double 1010NFRule::matchToDelimiter(const UnicodeString& text, 1011 int32_t startPos, 1012 double _baseValue, 1013 const UnicodeString& delimiter, 1014 ParsePosition& pp, 1015 const NFSubstitution* sub, 1016 double upperBound) const 1017{ 1018 UErrorCode status = U_ZERO_ERROR; 1019 // if "delimiter" contains real (i.e., non-ignorable) text, search 1020 // it for "delimiter" beginning at "start". If that succeeds, then 1021 // use "sub"'s doParse() method to match the text before the 1022 // instance of "delimiter" we just found. 1023 if (!allIgnorable(delimiter, status)) { 1024 if (U_FAILURE(status)) { //Memory allocation error. 1025 return 0; 1026 } 1027 ParsePosition tempPP; 1028 Formattable result; 1029 1030 // use findText() to search for "delimiter". It returns a two- 1031 // element array: element 0 is the position of the match, and 1032 // element 1 is the number of characters that matched 1033 // "delimiter". 1034 int32_t dLen; 1035 int32_t dPos = findText(text, delimiter, startPos, &dLen); 1036 1037 // if findText() succeeded, isolate the text preceding the 1038 // match, and use "sub" to match that text 1039 while (dPos >= 0) { 1040 UnicodeString subText; 1041 subText.setTo(text, 0, dPos); 1042 if (subText.length() > 0) { 1043 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, 1044#if UCONFIG_NO_COLLATION 1045 FALSE, 1046#else 1047 formatter->isLenient(), 1048#endif 1049 result); 1050 1051 // if the substitution could match all the text up to 1052 // where we found "delimiter", then this function has 1053 // a successful match. Bump the caller's parse position 1054 // to point to the first character after the text 1055 // that matches "delimiter", and return the result 1056 // we got from parsing the substitution. 1057 if (success && tempPP.getIndex() == dPos) { 1058 pp.setIndex(dPos + dLen); 1059 return result.getDouble(); 1060 } 1061 // commented out because ParsePosition doesn't have error index in 1.1.x 1062 // restored for ICU4C port 1063 else { 1064 if (tempPP.getErrorIndex() > 0) { 1065 pp.setErrorIndex(tempPP.getErrorIndex()); 1066 } else { 1067 pp.setErrorIndex(tempPP.getIndex()); 1068 } 1069 } 1070 } 1071 1072 // if we didn't match the substitution, search for another 1073 // copy of "delimiter" in "text" and repeat the loop if 1074 // we find it 1075 tempPP.setIndex(0); 1076 dPos = findText(text, delimiter, dPos + dLen, &dLen); 1077 } 1078 // if we make it here, this was an unsuccessful match, and we 1079 // leave pp unchanged and return 0 1080 pp.setIndex(0); 1081 return 0; 1082 1083 // if "delimiter" is empty, or consists only of ignorable characters 1084 // (i.e., is semantically empty), thwe we obviously can't search 1085 // for "delimiter". Instead, just use "sub" to parse as much of 1086 // "text" as possible. 1087 } else { 1088 ParsePosition tempPP; 1089 Formattable result; 1090 1091 // try to match the whole string against the substitution 1092 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, 1093#if UCONFIG_NO_COLLATION 1094 FALSE, 1095#else 1096 formatter->isLenient(), 1097#endif 1098 result); 1099 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { 1100 // if there's a successful match (or it's a null 1101 // substitution), update pp to point to the first 1102 // character we didn't match, and pass the result from 1103 // sub.doParse() on through to the caller 1104 pp.setIndex(tempPP.getIndex()); 1105 return result.getDouble(); 1106 } 1107 // commented out because ParsePosition doesn't have error index in 1.1.x 1108 // restored for ICU4C port 1109 else { 1110 pp.setErrorIndex(tempPP.getErrorIndex()); 1111 } 1112 1113 // and if we get to here, then nothing matched, so we return 1114 // 0 and leave pp alone 1115 return 0; 1116 } 1117} 1118 1119/** 1120* Used by stripPrefix() to match characters. If lenient parse mode 1121* is off, this just calls startsWith(). If lenient parse mode is on, 1122* this function uses CollationElementIterators to match characters in 1123* the strings (only primary-order differences are significant in 1124* determining whether there's a match). 1125* @param str The string being tested 1126* @param prefix The text we're hoping to see at the beginning 1127* of "str" 1128* @return If "prefix" is found at the beginning of "str", this 1129* is the number of characters in "str" that were matched (this 1130* isn't necessarily the same as the length of "prefix" when matching 1131* text with a collator). If there's no match, this is 0. 1132*/ 1133int32_t 1134NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const 1135{ 1136 // if we're looking for an empty prefix, it obviously matches 1137 // zero characters. Just go ahead and return 0. 1138 if (prefix.length() == 0) { 1139 return 0; 1140 } 1141 1142#if !UCONFIG_NO_COLLATION 1143 // go through all this grief if we're in lenient-parse mode 1144 if (formatter->isLenient()) { 1145 // get the formatter's collator and use it to create two 1146 // collation element iterators, one over the target string 1147 // and another over the prefix (right now, we'll throw an 1148 // exception if the collator we get back from the formatter 1149 // isn't a RuleBasedCollator, because RuleBasedCollator defines 1150 // the CollationElementIterator protocol. Hopefully, this 1151 // will change someday.) 1152 const RuleBasedCollator* collator = formatter->getCollator(); 1153 if (collator == NULL) { 1154 status = U_MEMORY_ALLOCATION_ERROR; 1155 return 0; 1156 } 1157 LocalPointer<CollationElementIterator> strIter(collator->createCollationElementIterator(str)); 1158 LocalPointer<CollationElementIterator> prefixIter(collator->createCollationElementIterator(prefix)); 1159 // Check for memory allocation error. 1160 if (strIter.isNull() || prefixIter.isNull()) { 1161 status = U_MEMORY_ALLOCATION_ERROR; 1162 return 0; 1163 } 1164 1165 UErrorCode err = U_ZERO_ERROR; 1166 1167 // The original code was problematic. Consider this match: 1168 // prefix = "fifty-" 1169 // string = " fifty-7" 1170 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 1171 // in the string. Unfortunately, we were getting a match, and then computing where 1172 // the match terminated by rematching the string. The rematch code was using as an 1173 // initial guess the substring of string between 0 and prefix.length. Because of 1174 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving 1175 // the position before the hyphen in the string. Recursing down, we then parsed the 1176 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). 1177 // This was not pretty, especially since the string "fifty-7" parsed just fine. 1178 // 1179 // We have newer APIs now, so we can use calls on the iterator to determine what we 1180 // matched up to. If we terminate because we hit the last element in the string, 1181 // our match terminates at this length. If we terminate because we hit the last element 1182 // in the target, our match terminates at one before the element iterator position. 1183 1184 // match collation elements between the strings 1185 int32_t oStr = strIter->next(err); 1186 int32_t oPrefix = prefixIter->next(err); 1187 1188 while (oPrefix != CollationElementIterator::NULLORDER) { 1189 // skip over ignorable characters in the target string 1190 while (CollationElementIterator::primaryOrder(oStr) == 0 1191 && oStr != CollationElementIterator::NULLORDER) { 1192 oStr = strIter->next(err); 1193 } 1194 1195 // skip over ignorable characters in the prefix 1196 while (CollationElementIterator::primaryOrder(oPrefix) == 0 1197 && oPrefix != CollationElementIterator::NULLORDER) { 1198 oPrefix = prefixIter->next(err); 1199 } 1200 1201 // dlf: move this above following test, if we consume the 1202 // entire target, aren't we ok even if the source was also 1203 // entirely consumed? 1204 1205 // if skipping over ignorables brought to the end of 1206 // the prefix, we DID match: drop out of the loop 1207 if (oPrefix == CollationElementIterator::NULLORDER) { 1208 break; 1209 } 1210 1211 // if skipping over ignorables brought us to the end 1212 // of the target string, we didn't match and return 0 1213 if (oStr == CollationElementIterator::NULLORDER) { 1214 return 0; 1215 } 1216 1217 // match collation elements from the two strings 1218 // (considering only primary differences). If we 1219 // get a mismatch, dump out and return 0 1220 if (CollationElementIterator::primaryOrder(oStr) 1221 != CollationElementIterator::primaryOrder(oPrefix)) { 1222 return 0; 1223 1224 // otherwise, advance to the next character in each string 1225 // and loop (we drop out of the loop when we exhaust 1226 // collation elements in the prefix) 1227 } else { 1228 oStr = strIter->next(err); 1229 oPrefix = prefixIter->next(err); 1230 } 1231 } 1232 1233 int32_t result = strIter->getOffset(); 1234 if (oStr != CollationElementIterator::NULLORDER) { 1235 --result; // back over character that we don't want to consume; 1236 } 1237 1238#ifdef RBNF_DEBUG 1239 fprintf(stderr, "prefix length: %d\n", result); 1240#endif 1241 return result; 1242#if 0 1243 //---------------------------------------------------------------- 1244 // JDK 1.2-specific API call 1245 // return strIter.getOffset(); 1246 //---------------------------------------------------------------- 1247 // JDK 1.1 HACK (take out for 1.2-specific code) 1248 1249 // if we make it to here, we have a successful match. Now we 1250 // have to find out HOW MANY characters from the target string 1251 // matched the prefix (there isn't necessarily a one-to-one 1252 // mapping between collation elements and characters). 1253 // In JDK 1.2, there's a simple getOffset() call we can use. 1254 // In JDK 1.1, on the other hand, we have to go through some 1255 // ugly contortions. First, use the collator to compare the 1256 // same number of characters from the prefix and target string. 1257 // If they're equal, we're done. 1258 collator->setStrength(Collator::PRIMARY); 1259 if (str.length() >= prefix.length()) { 1260 UnicodeString temp; 1261 temp.setTo(str, 0, prefix.length()); 1262 if (collator->equals(temp, prefix)) { 1263#ifdef RBNF_DEBUG 1264 fprintf(stderr, "returning: %d\n", prefix.length()); 1265#endif 1266 return prefix.length(); 1267 } 1268 } 1269 1270 // if they're not equal, then we have to compare successively 1271 // larger and larger substrings of the target string until we 1272 // get to one that matches the prefix. At that point, we know 1273 // how many characters matched the prefix, and we can return. 1274 int32_t p = 1; 1275 while (p <= str.length()) { 1276 UnicodeString temp; 1277 temp.setTo(str, 0, p); 1278 if (collator->equals(temp, prefix)) { 1279 return p; 1280 } else { 1281 ++p; 1282 } 1283 } 1284 1285 // SHOULD NEVER GET HERE!!! 1286 return 0; 1287 //---------------------------------------------------------------- 1288#endif 1289 1290 // If lenient parsing is turned off, forget all that crap above. 1291 // Just use String.startsWith() and be done with it. 1292 } else 1293#endif 1294 { 1295 if (str.startsWith(prefix)) { 1296 return prefix.length(); 1297 } else { 1298 return 0; 1299 } 1300 } 1301} 1302 1303/** 1304* Searches a string for another string. If lenient parsing is off, 1305* this just calls indexOf(). If lenient parsing is on, this function 1306* uses CollationElementIterator to match characters, and only 1307* primary-order differences are significant in determining whether 1308* there's a match. 1309* @param str The string to search 1310* @param key The string to search "str" for 1311* @param startingAt The index into "str" where the search is to 1312* begin 1313* @return A two-element array of ints. Element 0 is the position 1314* of the match, or -1 if there was no match. Element 1 is the 1315* number of characters in "str" that matched (which isn't necessarily 1316* the same as the length of "key") 1317*/ 1318int32_t 1319NFRule::findText(const UnicodeString& str, 1320 const UnicodeString& key, 1321 int32_t startingAt, 1322 int32_t* length) const 1323{ 1324#if !UCONFIG_NO_COLLATION 1325 // if lenient parsing is turned off, this is easy: just call 1326 // String.indexOf() and we're done 1327 if (!formatter->isLenient()) { 1328 *length = key.length(); 1329 return str.indexOf(key, startingAt); 1330 1331 // but if lenient parsing is turned ON, we've got some work 1332 // ahead of us 1333 } else 1334#endif 1335 { 1336 //---------------------------------------------------------------- 1337 // JDK 1.1 HACK (take out of 1.2-specific code) 1338 1339 // in JDK 1.2, CollationElementIterator provides us with an 1340 // API to map between character offsets and collation elements 1341 // and we can do this by marching through the string comparing 1342 // collation elements. We can't do that in JDK 1.1. Insted, 1343 // we have to go through this horrible slow mess: 1344 int32_t p = startingAt; 1345 int32_t keyLen = 0; 1346 1347 // basically just isolate smaller and smaller substrings of 1348 // the target string (each running to the end of the string, 1349 // and with the first one running from startingAt to the end) 1350 // and then use prefixLength() to see if the search key is at 1351 // the beginning of each substring. This is excruciatingly 1352 // slow, but it will locate the key and tell use how long the 1353 // matching text was. 1354 UnicodeString temp; 1355 UErrorCode status = U_ZERO_ERROR; 1356 while (p < str.length() && keyLen == 0) { 1357 temp.setTo(str, p, str.length() - p); 1358 keyLen = prefixLength(temp, key, status); 1359 if (U_FAILURE(status)) { 1360 break; 1361 } 1362 if (keyLen != 0) { 1363 *length = keyLen; 1364 return p; 1365 } 1366 ++p; 1367 } 1368 // if we make it to here, we didn't find it. Return -1 for the 1369 // location. The length should be ignored, but set it to 0, 1370 // which should be "safe" 1371 *length = 0; 1372 return -1; 1373 1374 //---------------------------------------------------------------- 1375 // JDK 1.2 version of this routine 1376 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); 1377 // 1378 //CollationElementIterator strIter = collator.getCollationElementIterator(str); 1379 //CollationElementIterator keyIter = collator.getCollationElementIterator(key); 1380 // 1381 //int keyStart = -1; 1382 // 1383 //str.setOffset(startingAt); 1384 // 1385 //int oStr = strIter.next(); 1386 //int oKey = keyIter.next(); 1387 //while (oKey != CollationElementIterator.NULLORDER) { 1388 // while (oStr != CollationElementIterator.NULLORDER && 1389 // CollationElementIterator.primaryOrder(oStr) == 0) 1390 // oStr = strIter.next(); 1391 // 1392 // while (oKey != CollationElementIterator.NULLORDER && 1393 // CollationElementIterator.primaryOrder(oKey) == 0) 1394 // oKey = keyIter.next(); 1395 // 1396 // if (oStr == CollationElementIterator.NULLORDER) { 1397 // return new int[] { -1, 0 }; 1398 // } 1399 // 1400 // if (oKey == CollationElementIterator.NULLORDER) { 1401 // break; 1402 // } 1403 // 1404 // if (CollationElementIterator.primaryOrder(oStr) == 1405 // CollationElementIterator.primaryOrder(oKey)) { 1406 // keyStart = strIter.getOffset(); 1407 // oStr = strIter.next(); 1408 // oKey = keyIter.next(); 1409 // } else { 1410 // if (keyStart != -1) { 1411 // keyStart = -1; 1412 // keyIter.reset(); 1413 // } else { 1414 // oStr = strIter.next(); 1415 // } 1416 // } 1417 //} 1418 // 1419 //if (oKey == CollationElementIterator.NULLORDER) { 1420 // return new int[] { keyStart, strIter.getOffset() - keyStart }; 1421 //} else { 1422 // return new int[] { -1, 0 }; 1423 //} 1424 } 1425} 1426 1427/** 1428* Checks to see whether a string consists entirely of ignorable 1429* characters. 1430* @param str The string to test. 1431* @return true if the string is empty of consists entirely of 1432* characters that the number formatter's collator says are 1433* ignorable at the primary-order level. false otherwise. 1434*/ 1435UBool 1436NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const 1437{ 1438 // if the string is empty, we can just return true 1439 if (str.length() == 0) { 1440 return TRUE; 1441 } 1442 1443#if !UCONFIG_NO_COLLATION 1444 // if lenient parsing is turned on, walk through the string with 1445 // a collation element iterator and make sure each collation 1446 // element is 0 (ignorable) at the primary level 1447 if (formatter->isLenient()) { 1448 const RuleBasedCollator* collator = formatter->getCollator(); 1449 if (collator == NULL) { 1450 status = U_MEMORY_ALLOCATION_ERROR; 1451 return FALSE; 1452 } 1453 LocalPointer<CollationElementIterator> iter(collator->createCollationElementIterator(str)); 1454 1455 // Memory allocation error check. 1456 if (iter.isNull()) { 1457 status = U_MEMORY_ALLOCATION_ERROR; 1458 return FALSE; 1459 } 1460 1461 UErrorCode err = U_ZERO_ERROR; 1462 int32_t o = iter->next(err); 1463 while (o != CollationElementIterator::NULLORDER 1464 && CollationElementIterator::primaryOrder(o) == 0) { 1465 o = iter->next(err); 1466 } 1467 1468 return o == CollationElementIterator::NULLORDER; 1469 } 1470#endif 1471 1472 // if lenient parsing is turned off, there is no such thing as 1473 // an ignorable character: return true only if the string is empty 1474 return FALSE; 1475} 1476 1477U_NAMESPACE_END 1478 1479/* U_HAVE_RBNF */ 1480#endif 1481