intl/icu/source/i18n/nfrule.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 * Copyright (C) 1997-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 ******************************************************************************
michael@0 6 * file name: nfrule.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * Modification history
michael@0 12 * Date Name Comments
michael@0 13 * 10/11/2001 Doug Ported from ICU4J
michael@0 14 */
michael@0 15
michael@0 16 #include "nfrule.h"
michael@0 17
michael@0 18 #if U_HAVE_RBNF
michael@0 19
michael@0 20 #include "unicode/rbnf.h"
michael@0 21 #include "unicode/tblcoll.h"
michael@0 22 #include "unicode/coleitr.h"
michael@0 23 #include "unicode/uchar.h"
michael@0 24 #include "nfrs.h"
michael@0 25 #include "nfrlist.h"
michael@0 26 #include "nfsubs.h"
michael@0 27 #include "patternprops.h"
michael@0 28
michael@0 29 U_NAMESPACE_BEGIN
michael@0 30
michael@0 31 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf)
michael@0 32 : baseValue((int32_t)0)
michael@0 33 , radix(0)
michael@0 34 , exponent(0)
michael@0 35 , ruleText()
michael@0 36 , sub1(NULL)
michael@0 37 , sub2(NULL)
michael@0 38 , formatter(_rbnf)
michael@0 39 {
michael@0 40 }
michael@0 41
michael@0 42 NFRule::~NFRule()
michael@0 43 {
michael@0 44 delete sub1;
michael@0 45 delete sub2;
michael@0 46 }
michael@0 47
michael@0 48 static const UChar gLeftBracket = 0x005b;
michael@0 49 static const UChar gRightBracket = 0x005d;
michael@0 50 static const UChar gColon = 0x003a;
michael@0 51 static const UChar gZero = 0x0030;
michael@0 52 static const UChar gNine = 0x0039;
michael@0 53 static const UChar gSpace = 0x0020;
michael@0 54 static const UChar gSlash = 0x002f;
michael@0 55 static const UChar gGreaterThan = 0x003e;
michael@0 56 static const UChar gLessThan = 0x003c;
michael@0 57 static const UChar gComma = 0x002c;
michael@0 58 static const UChar gDot = 0x002e;
michael@0 59 static const UChar gTick = 0x0027;
michael@0 60 //static const UChar gMinus = 0x002d;
michael@0 61 static const UChar gSemicolon = 0x003b;
michael@0 62
michael@0 63 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */
michael@0 64 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */
michael@0 65 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */
michael@0 66 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */
michael@0 67
michael@0 68 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */
michael@0 69 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */
michael@0 70 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */
michael@0 71 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */
michael@0 72 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */
michael@0 73 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */
michael@0 74 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */
michael@0 75 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */
michael@0 76 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */
michael@0 77 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */
michael@0 78 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */
michael@0 79 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */
michael@0 80
michael@0 81 static const UChar * const tokenStrings[] = {
michael@0 82 gLessLess, gLessPercent, gLessHash, gLessZero,
michael@0 83 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero,
michael@0 84 gEqualPercent, gEqualHash, gEqualZero, NULL
michael@0 85 };
michael@0 86
michael@0 87 void
michael@0 88 NFRule::makeRules(UnicodeString& description,
michael@0 89 const NFRuleSet *ruleSet,
michael@0 90 const NFRule *predecessor,
michael@0 91 const RuleBasedNumberFormat *rbnf,
michael@0 92 NFRuleList& rules,
michael@0 93 UErrorCode& status)
michael@0 94 {
michael@0 95 // we know we're making at least one rule, so go ahead and
michael@0 96 // new it up and initialize its basevalue and divisor
michael@0 97 // (this also strips the rule descriptor, if any, off the
michael@0 98 // descripton string)
michael@0 99 NFRule* rule1 = new NFRule(rbnf);
michael@0 100 /* test for NULL */
michael@0 101 if (rule1 == 0) {
michael@0 102 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 103 return;
michael@0 104 }
michael@0 105 rule1->parseRuleDescriptor(description, status);
michael@0 106
michael@0 107 // check the description to see whether there's text enclosed
michael@0 108 // in brackets
michael@0 109 int32_t brack1 = description.indexOf(gLeftBracket);
michael@0 110 int32_t brack2 = description.indexOf(gRightBracket);
michael@0 111
michael@0 112 // if the description doesn't contain a matched pair of brackets,
michael@0 113 // or if it's of a type that doesn't recognize bracketed text,
michael@0 114 // then leave the description alone, initialize the rule's
michael@0 115 // rule text and substitutions, and return that rule
michael@0 116 if (brack1 == -1 || brack2 == -1 || brack1 > brack2
michael@0 117 || rule1->getType() == kProperFractionRule
michael@0 118 || rule1->getType() == kNegativeNumberRule) {
michael@0 119 rule1->ruleText = description;
michael@0 120 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status);
michael@0 121 rules.add(rule1);
michael@0 122 } else {
michael@0 123 // if the description does contain a matched pair of brackets,
michael@0 124 // then it's really shorthand for two rules (with one exception)
michael@0 125 NFRule* rule2 = NULL;
michael@0 126 UnicodeString sbuf;
michael@0 127
michael@0 128 // we'll actually only split the rule into two rules if its
michael@0 129 // base value is an even multiple of its divisor (or it's one
michael@0 130 // of the special rules)
michael@0 131 if ((rule1->baseValue > 0
michael@0 132 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0)
michael@0 133 || rule1->getType() == kImproperFractionRule
michael@0 134 || rule1->getType() == kMasterRule) {
michael@0 135
michael@0 136 // if it passes that test, new up the second rule. If the
michael@0 137 // rule set both rules will belong to is a fraction rule
michael@0 138 // set, they both have the same base value; otherwise,
michael@0 139 // increment the original rule's base value ("rule1" actually
michael@0 140 // goes SECOND in the rule set's rule list)
michael@0 141 rule2 = new NFRule(rbnf);
michael@0 142 /* test for NULL */
michael@0 143 if (rule2 == 0) {
michael@0 144 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 145 return;
michael@0 146 }
michael@0 147 if (rule1->baseValue >= 0) {
michael@0 148 rule2->baseValue = rule1->baseValue;
michael@0 149 if (!ruleSet->isFractionRuleSet()) {
michael@0 150 ++rule1->baseValue;
michael@0 151 }
michael@0 152 }
michael@0 153
michael@0 154 // if the description began with "x.x" and contains bracketed
michael@0 155 // text, it describes both the improper fraction rule and
michael@0 156 // the proper fraction rule
michael@0 157 else if (rule1->getType() == kImproperFractionRule) {
michael@0 158 rule2->setType(kProperFractionRule);
michael@0 159 }
michael@0 160
michael@0 161 // if the description began with "x.0" and contains bracketed
michael@0 162 // text, it describes both the master rule and the
michael@0 163 // improper fraction rule
michael@0 164 else if (rule1->getType() == kMasterRule) {
michael@0 165 rule2->baseValue = rule1->baseValue;
michael@0 166 rule1->setType(kImproperFractionRule);
michael@0 167 }
michael@0 168
michael@0 169 // both rules have the same radix and exponent (i.e., the
michael@0 170 // same divisor)
michael@0 171 rule2->radix = rule1->radix;
michael@0 172 rule2->exponent = rule1->exponent;
michael@0 173
michael@0 174 // rule2's rule text omits the stuff in brackets: initalize
michael@0 175 // its rule text and substitutions accordingly
michael@0 176 sbuf.append(description, 0, brack1);
michael@0 177 if (brack2 + 1 < description.length()) {
michael@0 178 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
michael@0 179 }
michael@0 180 rule2->ruleText.setTo(sbuf);
michael@0 181 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status);
michael@0 182 }
michael@0 183
michael@0 184 // rule1's text includes the text in the brackets but omits
michael@0 185 // the brackets themselves: initialize _its_ rule text and
michael@0 186 // substitutions accordingly
michael@0 187 sbuf.setTo(description, 0, brack1);
michael@0 188 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
michael@0 189 if (brack2 + 1 < description.length()) {
michael@0 190 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
michael@0 191 }
michael@0 192 rule1->ruleText.setTo(sbuf);
michael@0 193 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status);
michael@0 194
michael@0 195 // if we only have one rule, return it; if we have two, return
michael@0 196 // a two-element array containing them (notice that rule2 goes
michael@0 197 // BEFORE rule1 in the list: in all cases, rule2 OMITS the
michael@0 198 // material in the brackets and rule1 INCLUDES the material
michael@0 199 // in the brackets)
michael@0 200 if (rule2 != NULL) {
michael@0 201 rules.add(rule2);
michael@0 202 }
michael@0 203 rules.add(rule1);
michael@0 204 }
michael@0 205 }
michael@0 206
michael@0 207 /**
michael@0 208 * This function parses the rule's rule descriptor (i.e., the base
michael@0 209 * value and/or other tokens that precede the rule's rule text
michael@0 210 * in the description) and sets the rule's base value, radix, and
michael@0 211 * exponent according to the descriptor. (If the description doesn't
michael@0 212 * include a rule descriptor, then this function sets everything to
michael@0 213 * default values and the rule set sets the rule's real base value).
michael@0 214 * @param description The rule's description
michael@0 215 * @return If "description" included a rule descriptor, this is
michael@0 216 * "description" with the descriptor and any trailing whitespace
michael@0 217 * stripped off. Otherwise; it's "descriptor" unchangd.
michael@0 218 */
michael@0 219 void
michael@0 220 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
michael@0 221 {
michael@0 222 // the description consists of a rule descriptor and a rule body,
michael@0 223 // separated by a colon. The rule descriptor is optional. If
michael@0 224 // it's omitted, just set the base value to 0.
michael@0 225 int32_t p = description.indexOf(gColon);
michael@0 226 if (p == -1) {
michael@0 227 setBaseValue((int32_t)0, status);
michael@0 228 } else {
michael@0 229 // copy the descriptor out into its own string and strip it,
michael@0 230 // along with any trailing whitespace, out of the original
michael@0 231 // description
michael@0 232 UnicodeString descriptor;
michael@0 233 descriptor.setTo(description, 0, p);
michael@0 234
michael@0 235 ++p;
michael@0 236 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) {
michael@0 237 ++p;
michael@0 238 }
michael@0 239 description.removeBetween(0, p);
michael@0 240
michael@0 241 // check first to see if the rule descriptor matches the token
michael@0 242 // for one of the special rules. If it does, set the base
michael@0 243 // value to the correct identfier value
michael@0 244 if (0 == descriptor.compare(gMinusX, 2)) {
michael@0 245 setType(kNegativeNumberRule);
michael@0 246 }
michael@0 247 else if (0 == descriptor.compare(gXDotX, 3)) {
michael@0 248 setType(kImproperFractionRule);
michael@0 249 }
michael@0 250 else if (0 == descriptor.compare(gZeroDotX, 3)) {
michael@0 251 setType(kProperFractionRule);
michael@0 252 }
michael@0 253 else if (0 == descriptor.compare(gXDotZero, 3)) {
michael@0 254 setType(kMasterRule);
michael@0 255 }
michael@0 256
michael@0 257 // if the rule descriptor begins with a digit, it's a descriptor
michael@0 258 // for a normal rule
michael@0 259 // since we don't have Long.parseLong, and this isn't much work anyway,
michael@0 260 // just build up the value as we encounter the digits.
michael@0 261 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) {
michael@0 262 int64_t val = 0;
michael@0 263 p = 0;
michael@0 264 UChar c = gSpace;
michael@0 265
michael@0 266 // begin parsing the descriptor: copy digits
michael@0 267 // into "tempValue", skip periods, commas, and spaces,
michael@0 268 // stop on a slash or > sign (or at the end of the string),
michael@0 269 // and throw an exception on any other character
michael@0 270 int64_t ll_10 = 10;
michael@0 271 while (p < descriptor.length()) {
michael@0 272 c = descriptor.charAt(p);
michael@0 273 if (c >= gZero && c <= gNine) {
michael@0 274 val = val * ll_10 + (int32_t)(c - gZero);
michael@0 275 }
michael@0 276 else if (c == gSlash || c == gGreaterThan) {
michael@0 277 break;
michael@0 278 }
michael@0 279 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
michael@0 280 }
michael@0 281 else {
michael@0 282 // throw new IllegalArgumentException("Illegal character in rule descriptor");
michael@0 283 status = U_PARSE_ERROR;
michael@0 284 return;
michael@0 285 }
michael@0 286 ++p;
michael@0 287 }
michael@0 288
michael@0 289 // we have the base value, so set it
michael@0 290 setBaseValue(val, status);
michael@0 291
michael@0 292 // if we stopped the previous loop on a slash, we're
michael@0 293 // now parsing the rule's radix. Again, accumulate digits
michael@0 294 // in tempValue, skip punctuation, stop on a > mark, and
michael@0 295 // throw an exception on anything else
michael@0 296 if (c == gSlash) {
michael@0 297 val = 0;
michael@0 298 ++p;
michael@0 299 int64_t ll_10 = 10;
michael@0 300 while (p < descriptor.length()) {
michael@0 301 c = descriptor.charAt(p);
michael@0 302 if (c >= gZero && c <= gNine) {
michael@0 303 val = val * ll_10 + (int32_t)(c - gZero);
michael@0 304 }
michael@0 305 else if (c == gGreaterThan) {
michael@0 306 break;
michael@0 307 }
michael@0 308 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) {
michael@0 309 }
michael@0 310 else {
michael@0 311 // throw new IllegalArgumentException("Illegal character is rule descriptor");
michael@0 312 status = U_PARSE_ERROR;
michael@0 313 return;
michael@0 314 }
michael@0 315 ++p;
michael@0 316 }
michael@0 317
michael@0 318 // tempValue now contain's the rule's radix. Set it
michael@0 319 // accordingly, and recalculate the rule's exponent
michael@0 320 radix = (int32_t)val;
michael@0 321 if (radix == 0) {
michael@0 322 // throw new IllegalArgumentException("Rule can't have radix of 0");
michael@0 323 status = U_PARSE_ERROR;
michael@0 324 }
michael@0 325
michael@0 326 exponent = expectedExponent();
michael@0 327 }
michael@0 328
michael@0 329 // if we stopped the previous loop on a > sign, then continue
michael@0 330 // for as long as we still see > signs. For each one,
michael@0 331 // decrement the exponent (unless the exponent is already 0).
michael@0 332 // If we see another character before reaching the end of
michael@0 333 // the descriptor, that's also a syntax error.
michael@0 334 if (c == gGreaterThan) {
michael@0 335 while (p < descriptor.length()) {
michael@0 336 c = descriptor.charAt(p);
michael@0 337 if (c == gGreaterThan && exponent > 0) {
michael@0 338 --exponent;
michael@0 339 } else {
michael@0 340 // throw new IllegalArgumentException("Illegal character in rule descriptor");
michael@0 341 status = U_PARSE_ERROR;
michael@0 342 return;
michael@0 343 }
michael@0 344 ++p;
michael@0 345 }
michael@0 346 }
michael@0 347 }
michael@0 348 }
michael@0 349
michael@0 350 // finally, if the rule body begins with an apostrophe, strip it off
michael@0 351 // (this is generally used to put whitespace at the beginning of
michael@0 352 // a rule's rule text)
michael@0 353 if (description.length() > 0 && description.charAt(0) == gTick) {
michael@0 354 description.removeBetween(0, 1);
michael@0 355 }
michael@0 356
michael@0 357 // return the description with all the stuff we've just waded through
michael@0 358 // stripped off the front. It now contains just the rule body.
michael@0 359 // return description;
michael@0 360 }
michael@0 361
michael@0 362 /**
michael@0 363 * Searches the rule's rule text for the substitution tokens,
michael@0 364 * creates the substitutions, and removes the substitution tokens
michael@0 365 * from the rule's rule text.
michael@0 366 * @param owner The rule set containing this rule
michael@0 367 * @param predecessor The rule preseding this one in "owners" rule list
michael@0 368 * @param ownersOwner The RuleBasedFormat that owns this rule
michael@0 369 */
michael@0 370 void
michael@0 371 NFRule::extractSubstitutions(const NFRuleSet* ruleSet,
michael@0 372 const NFRule* predecessor,
michael@0 373 const RuleBasedNumberFormat* rbnf,
michael@0 374 UErrorCode& status)
michael@0 375 {
michael@0 376 if (U_SUCCESS(status)) {
michael@0 377 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status);
michael@0 378 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status);
michael@0 379 }
michael@0 380 }
michael@0 381
michael@0 382 /**
michael@0 383 * Searches the rule's rule text for the first substitution token,
michael@0 384 * creates a substitution based on it, and removes the token from
michael@0 385 * the rule's rule text.
michael@0 386 * @param owner The rule set containing this rule
michael@0 387 * @param predecessor The rule preceding this one in the rule set's
michael@0 388 * rule list
michael@0 389 * @param ownersOwner The RuleBasedNumberFormat that owns this rule
michael@0 390 * @return The newly-created substitution. This is never null; if
michael@0 391 * the rule text doesn't contain any substitution tokens, this will
michael@0 392 * be a NullSubstitution.
michael@0 393 */
michael@0 394 NFSubstitution *
michael@0 395 NFRule::extractSubstitution(const NFRuleSet* ruleSet,
michael@0 396 const NFRule* predecessor,
michael@0 397 const RuleBasedNumberFormat* rbnf,
michael@0 398 UErrorCode& status)
michael@0 399 {
michael@0 400 NFSubstitution* result = NULL;
michael@0 401
michael@0 402 // search the rule's rule text for the first two characters of
michael@0 403 // a substitution token
michael@0 404 int32_t subStart = indexOfAny(tokenStrings);
michael@0 405 int32_t subEnd = subStart;
michael@0 406
michael@0 407 // if we didn't find one, create a null substitution positioned
michael@0 408 // at the end of the rule text
michael@0 409 if (subStart == -1) {
michael@0 410 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor,
michael@0 411 ruleSet, rbnf, UnicodeString(), status);
michael@0 412 }
michael@0 413
michael@0 414 // special-case the ">>>" token, since searching for the > at the
michael@0 415 // end will actually find the > in the middle
michael@0 416 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) {
michael@0 417 subEnd = subStart + 2;
michael@0 418
michael@0 419 // otherwise the substitution token ends with the same character
michael@0 420 // it began with
michael@0 421 } else {
michael@0 422 UChar c = ruleText.charAt(subStart);
michael@0 423 subEnd = ruleText.indexOf(c, subStart + 1);
michael@0 424 // special case for '<%foo<<'
michael@0 425 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) {
michael@0 426 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle
michael@0 427 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack
michael@0 428 // to get around this. Having the duplicate at the front would cause problems with
michael@0 429 // rules like "<<%" to format, say, percents...
michael@0 430 ++subEnd;
michael@0 431 }
michael@0 432 }
michael@0 433
michael@0 434 // if we don't find the end of the token (i.e., if we're on a single,
michael@0 435 // unmatched token character), create a null substitution positioned
michael@0 436 // at the end of the rule
michael@0 437 if (subEnd == -1) {
michael@0 438 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor,
michael@0 439 ruleSet, rbnf, UnicodeString(), status);
michael@0 440 }
michael@0 441
michael@0 442 // if we get here, we have a real substitution token (or at least
michael@0 443 // some text bounded by substitution token characters). Use
michael@0 444 // makeSubstitution() to create the right kind of substitution
michael@0 445 UnicodeString subToken;
michael@0 446 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart);
michael@0 447 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet,
michael@0 448 rbnf, subToken, status);
michael@0 449
michael@0 450 // remove the substitution from the rule text
michael@0 451 ruleText.removeBetween(subStart, subEnd+1);
michael@0 452
michael@0 453 return result;
michael@0 454 }
michael@0 455
michael@0 456 /**
michael@0 457 * Sets the rule's base value, and causes the radix and exponent
michael@0 458 * to be recalculated. This is used during construction when we
michael@0 459 * don't know the rule's base value until after it's been
michael@0 460 * constructed. It should be used at any other time.
michael@0 461 * @param The new base value for the rule.
michael@0 462 */
michael@0 463 void
michael@0 464 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status)
michael@0 465 {
michael@0 466 // set the base value
michael@0 467 baseValue = newBaseValue;
michael@0 468
michael@0 469 // if this isn't a special rule, recalculate the radix and exponent
michael@0 470 // (the radix always defaults to 10; if it's supposed to be something
michael@0 471 // else, it's cleaned up by the caller and the exponent is
michael@0 472 // recalculated again-- the only function that does this is
michael@0 473 // NFRule.parseRuleDescriptor() )
michael@0 474 if (baseValue >= 1) {
michael@0 475 radix = 10;
michael@0 476 exponent = expectedExponent();
michael@0 477
michael@0 478 // this function gets called on a fully-constructed rule whose
michael@0 479 // description didn't specify a base value. This means it
michael@0 480 // has substitutions, and some substitutions hold on to copies
michael@0 481 // of the rule's divisor. Fix their copies of the divisor.
michael@0 482 if (sub1 != NULL) {
michael@0 483 sub1->setDivisor(radix, exponent, status);
michael@0 484 }
michael@0 485 if (sub2 != NULL) {
michael@0 486 sub2->setDivisor(radix, exponent, status);
michael@0 487 }
michael@0 488
michael@0 489 // if this is a special rule, its radix and exponent are basically
michael@0 490 // ignored. Set them to "safe" default values
michael@0 491 } else {
michael@0 492 radix = 10;
michael@0 493 exponent = 0;
michael@0 494 }
michael@0 495 }
michael@0 496
michael@0 497 /**
michael@0 498 * This calculates the rule's exponent based on its radix and base
michael@0 499 * value. This will be the highest power the radix can be raised to
michael@0 500 * and still produce a result less than or equal to the base value.
michael@0 501 */
michael@0 502 int16_t
michael@0 503 NFRule::expectedExponent() const
michael@0 504 {
michael@0 505 // since the log of 0, or the log base 0 of something, causes an
michael@0 506 // error, declare the exponent in these cases to be 0 (we also
michael@0 507 // deal with the special-rule identifiers here)
michael@0 508 if (radix == 0 || baseValue < 1) {
michael@0 509 return 0;
michael@0 510 }
michael@0 511
michael@0 512 // we get rounding error in some cases-- for example, log 1000 / log 10
michael@0 513 // gives us 1.9999999996 instead of 2. The extra logic here is to take
michael@0 514 // that into account
michael@0 515 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix));
michael@0 516 int64_t temp = util64_pow(radix, tempResult + 1);
michael@0 517 if (temp <= baseValue) {
michael@0 518 tempResult += 1;
michael@0 519 }
michael@0 520 return tempResult;
michael@0 521 }
michael@0 522
michael@0 523 /**
michael@0 524 * Searches the rule's rule text for any of the specified strings.
michael@0 525 * @param strings An array of strings to search the rule's rule
michael@0 526 * text for
michael@0 527 * @return The index of the first match in the rule's rule text
michael@0 528 * (i.e., the first substring in the rule's rule text that matches
michael@0 529 * _any_ of the strings in "strings"). If none of the strings in
michael@0 530 * "strings" is found in the rule's rule text, returns -1.
michael@0 531 */
michael@0 532 int32_t
michael@0 533 NFRule::indexOfAny(const UChar* const strings[]) const
michael@0 534 {
michael@0 535 int result = -1;
michael@0 536 for (int i = 0; strings[i]; i++) {
michael@0 537 int32_t pos = ruleText.indexOf(*strings[i]);
michael@0 538 if (pos != -1 && (result == -1 || pos < result)) {
michael@0 539 result = pos;
michael@0 540 }
michael@0 541 }
michael@0 542 return result;
michael@0 543 }
michael@0 544
michael@0 545 //-----------------------------------------------------------------------
michael@0 546 // boilerplate
michael@0 547 //-----------------------------------------------------------------------
michael@0 548
michael@0 549 /**
michael@0 550 * Tests two rules for equality.
michael@0 551 * @param that The rule to compare this one against
michael@0 552 * @return True is the two rules are functionally equivalent
michael@0 553 */
michael@0 554 UBool
michael@0 555 NFRule::operator==(const NFRule& rhs) const
michael@0 556 {
michael@0 557 return baseValue == rhs.baseValue
michael@0 558 && radix == rhs.radix
michael@0 559 && exponent == rhs.exponent
michael@0 560 && ruleText == rhs.ruleText
michael@0 561 && *sub1 == *rhs.sub1
michael@0 562 && *sub2 == *rhs.sub2;
michael@0 563 }
michael@0 564
michael@0 565 /**
michael@0 566 * Returns a textual representation of the rule. This won't
michael@0 567 * necessarily be the same as the description that this rule
michael@0 568 * was created with, but it will produce the same result.
michael@0 569 * @return A textual description of the rule
michael@0 570 */
michael@0 571 static void util_append64(UnicodeString& result, int64_t n)
michael@0 572 {
michael@0 573 UChar buffer[256];
michael@0 574 int32_t len = util64_tou(n, buffer, sizeof(buffer));
michael@0 575 UnicodeString temp(buffer, len);
michael@0 576 result.append(temp);
michael@0 577 }
michael@0 578
michael@0 579 void
michael@0 580 NFRule::_appendRuleText(UnicodeString& result) const
michael@0 581 {
michael@0 582 switch (getType()) {
michael@0 583 case kNegativeNumberRule: result.append(gMinusX, 2); break;
michael@0 584 case kImproperFractionRule: result.append(gXDotX, 3); break;
michael@0 585 case kProperFractionRule: result.append(gZeroDotX, 3); break;
michael@0 586 case kMasterRule: result.append(gXDotZero, 3); break;
michael@0 587 default:
michael@0 588 // for a normal rule, write out its base value, and if the radix is
michael@0 589 // something other than 10, write out the radix (with the preceding
michael@0 590 // slash, of course). Then calculate the expected exponent and if
michael@0 591 // if isn't the same as the actual exponent, write an appropriate
michael@0 592 // number of > signs. Finally, terminate the whole thing with
michael@0 593 // a colon.
michael@0 594 util_append64(result, baseValue);
michael@0 595 if (radix != 10) {
michael@0 596 result.append(gSlash);
michael@0 597 util_append64(result, radix);
michael@0 598 }
michael@0 599 int numCarets = expectedExponent() - exponent;
michael@0 600 for (int i = 0; i < numCarets; i++) {
michael@0 601 result.append(gGreaterThan);
michael@0 602 }
michael@0 603 break;
michael@0 604 }
michael@0 605 result.append(gColon);
michael@0 606 result.append(gSpace);
michael@0 607
michael@0 608 // if the rule text begins with a space, write an apostrophe
michael@0 609 // (whitespace after the rule descriptor is ignored; the
michael@0 610 // apostrophe is used to make the whitespace significant)
michael@0 611 if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) {
michael@0 612 result.append(gTick);
michael@0 613 }
michael@0 614
michael@0 615 // now, write the rule's rule text, inserting appropriate
michael@0 616 // substitution tokens in the appropriate places
michael@0 617 UnicodeString ruleTextCopy;
michael@0 618 ruleTextCopy.setTo(ruleText);
michael@0 619
michael@0 620 UnicodeString temp;
michael@0 621 sub2->toString(temp);
michael@0 622 ruleTextCopy.insert(sub2->getPos(), temp);
michael@0 623 sub1->toString(temp);
michael@0 624 ruleTextCopy.insert(sub1->getPos(), temp);
michael@0 625
michael@0 626 result.append(ruleTextCopy);
michael@0 627
michael@0 628 // and finally, top the whole thing off with a semicolon and
michael@0 629 // return the result
michael@0 630 result.append(gSemicolon);
michael@0 631 }
michael@0 632
michael@0 633 //-----------------------------------------------------------------------
michael@0 634 // formatting
michael@0 635 //-----------------------------------------------------------------------
michael@0 636
michael@0 637 /**
michael@0 638 * Formats the number, and inserts the resulting text into
michael@0 639 * toInsertInto.
michael@0 640 * @param number The number being formatted
michael@0 641 * @param toInsertInto The string where the resultant text should
michael@0 642 * be inserted
michael@0 643 * @param pos The position in toInsertInto where the resultant text
michael@0 644 * should be inserted
michael@0 645 */
michael@0 646 void
michael@0 647 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const
michael@0 648 {
michael@0 649 // first, insert the rule's rule text into toInsertInto at the
michael@0 650 // specified position, then insert the results of the substitutions
michael@0 651 // into the right places in toInsertInto (notice we do the
michael@0 652 // substitutions in reverse order so that the offsets don't get
michael@0 653 // messed up)
michael@0 654 toInsertInto.insert(pos, ruleText);
michael@0 655 sub2->doSubstitution(number, toInsertInto, pos);
michael@0 656 sub1->doSubstitution(number, toInsertInto, pos);
michael@0 657 }
michael@0 658
michael@0 659 /**
michael@0 660 * Formats the number, and inserts the resulting text into
michael@0 661 * toInsertInto.
michael@0 662 * @param number The number being formatted
michael@0 663 * @param toInsertInto The string where the resultant text should
michael@0 664 * be inserted
michael@0 665 * @param pos The position in toInsertInto where the resultant text
michael@0 666 * should be inserted
michael@0 667 */
michael@0 668 void
michael@0 669 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const
michael@0 670 {
michael@0 671 // first, insert the rule's rule text into toInsertInto at the
michael@0 672 // specified position, then insert the results of the substitutions
michael@0 673 // into the right places in toInsertInto
michael@0 674 // [again, we have two copies of this routine that do the same thing
michael@0 675 // so that we don't sacrifice precision in a long by casting it
michael@0 676 // to a double]
michael@0 677 toInsertInto.insert(pos, ruleText);
michael@0 678 sub2->doSubstitution(number, toInsertInto, pos);
michael@0 679 sub1->doSubstitution(number, toInsertInto, pos);
michael@0 680 }
michael@0 681
michael@0 682 /**
michael@0 683 * Used by the owning rule set to determine whether to invoke the
michael@0 684 * rollback rule (i.e., whether this rule or the one that precedes
michael@0 685 * it in the rule set's list should be used to format the number)
michael@0 686 * @param The number being formatted
michael@0 687 * @return True if the rule set should use the rule that precedes
michael@0 688 * this one in its list; false if it should use this rule
michael@0 689 */
michael@0 690 UBool
michael@0 691 NFRule::shouldRollBack(double number) const
michael@0 692 {
michael@0 693 // we roll back if the rule contains a modulus substitution,
michael@0 694 // the number being formatted is an even multiple of the rule's
michael@0 695 // divisor, and the rule's base value is NOT an even multiple
michael@0 696 // of its divisor
michael@0 697 // In other words, if the original description had
michael@0 698 // 100: << hundred[ >>];
michael@0 699 // that expands into
michael@0 700 // 100: << hundred;
michael@0 701 // 101: << hundred >>;
michael@0 702 // internally. But when we're formatting 200, if we use the rule
michael@0 703 // at 101, which would normally apply, we get "two hundred zero".
michael@0 704 // To prevent this, we roll back and use the rule at 100 instead.
michael@0 705 // This is the logic that makes this happen: the rule at 101 has
michael@0 706 // a modulus substitution, its base value isn't an even multiple
michael@0 707 // of 100, and the value we're trying to format _is_ an even
michael@0 708 // multiple of 100. This is called the "rollback rule."
michael@0 709 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) {
michael@0 710 int64_t re = util64_pow(radix, exponent);
michael@0 711 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0;
michael@0 712 }
michael@0 713 return FALSE;
michael@0 714 }
michael@0 715
michael@0 716 //-----------------------------------------------------------------------
michael@0 717 // parsing
michael@0 718 //-----------------------------------------------------------------------
michael@0 719
michael@0 720 /**
michael@0 721 * Attempts to parse the string with this rule.
michael@0 722 * @param text The string being parsed
michael@0 723 * @param parsePosition On entry, the value is ignored and assumed to
michael@0 724 * be 0. On exit, this has been updated with the position of the first
michael@0 725 * character not consumed by matching the text against this rule
michael@0 726 * (if this rule doesn't match the text at all, the parse position
michael@0 727 * if left unchanged (presumably at 0) and the function returns
michael@0 728 * new Long(0)).
michael@0 729 * @param isFractionRule True if this rule is contained within a
michael@0 730 * fraction rule set. This is only used if the rule has no
michael@0 731 * substitutions.
michael@0 732 * @return If this rule matched the text, this is the rule's base value
michael@0 733 * combined appropriately with the results of parsing the substitutions.
michael@0 734 * If nothing matched, this is new Long(0) and the parse position is
michael@0 735 * left unchanged. The result will be an instance of Long if the
michael@0 736 * result is an integer and Double otherwise. The result is never null.
michael@0 737 */
#ifdef RBNF_DEBUG
#include <stdio.h>

/**
 * Debug-only helper: writes a UnicodeString to a stdio stream after
 * converting it to the platform's default codepage.
 *
 * The converted text can be longer than us.length() (a single UTF-16
 * code unit may need several bytes), so the required size is obtained
 * with a preflight call instead of assuming one byte per code unit.
 *
 * @param f  The stream to write to
 * @param us The string to dump
 */
static void dumpUS(FILE* f, const UnicodeString& us) {
    const int32_t len = us.length();
    const uint32_t noCapacity = 0;

    // Preflight: with a zero-capacity target nothing is written and the
    // number of bytes the full conversion needs is returned.
    const int32_t needed = us.extract(0, len, (char*)NULL, noCapacity);
    if (needed < 0) {
        return;
    }

    char* buf = (char *)uprv_malloc((needed + 1) * sizeof(char));
    if (buf != NULL) {
        // Bounded extraction: never writes more than the buffer holds.
        us.extract(0, len, buf, (uint32_t)(needed + 1));
        buf[needed] = 0;
        fprintf(f, "%s", buf);
        uprv_free(buf);
    }
}
#endif
michael@0 752
michael@0 753 UBool
michael@0 754 NFRule::doParse(const UnicodeString& text,
michael@0 755 ParsePosition& parsePosition,
michael@0 756 UBool isFractionRule,
michael@0 757 double upperBound,
michael@0 758 Formattable& resVal) const
michael@0 759 {
michael@0 760 // internally we operate on a copy of the string being parsed
michael@0 761 // (because we're going to change it) and use our own ParsePosition
michael@0 762 ParsePosition pp;
michael@0 763 UnicodeString workText(text);
michael@0 764
michael@0 765 // check to see whether the text before the first substitution
michael@0 766 // matches the text at the beginning of the string being
michael@0 767 // parsed. If it does, strip that off the front of workText;
michael@0 768 // otherwise, dump out with a mismatch
michael@0 769 UnicodeString prefix;
michael@0 770 prefix.setTo(ruleText, 0, sub1->getPos());
michael@0 771
michael@0 772 #ifdef RBNF_DEBUG
michael@0 773 fprintf(stderr, "doParse %x ", this);
michael@0 774 {
michael@0 775 UnicodeString rt;
michael@0 776 _appendRuleText(rt);
michael@0 777 dumpUS(stderr, rt);
michael@0 778 }
michael@0 779
michael@0 780 fprintf(stderr, " text: '", this);
michael@0 781 dumpUS(stderr, text);
michael@0 782 fprintf(stderr, "' prefix: '");
michael@0 783 dumpUS(stderr, prefix);
michael@0 784 #endif
michael@0 785 stripPrefix(workText, prefix, pp);
michael@0 786 int32_t prefixLength = text.length() - workText.length();
michael@0 787
michael@0 788 #ifdef RBNF_DEBUG
michael@0 789 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos());
michael@0 790 #endif
michael@0 791
michael@0 792 if (pp.getIndex() == 0 && sub1->getPos() != 0) {
michael@0 793 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 794 // restored for ICU4C port
michael@0 795 parsePosition.setErrorIndex(pp.getErrorIndex());
michael@0 796 resVal.setLong(0);
michael@0 797 return TRUE;
michael@0 798 }
michael@0 799
michael@0 800 // this is the fun part. The basic guts of the rule-matching
michael@0 801 // logic is matchToDelimiter(), which is called twice. The first
michael@0 802 // time it searches the input string for the rule text BETWEEN
michael@0 803 // the substitutions and tries to match the intervening text
michael@0 804 // in the input string with the first substitution. If that
michael@0 805 // succeeds, it then calls it again, this time to look for the
michael@0 806 // rule text after the second substitution and to match the
michael@0 807 // intervening input text against the second substitution.
michael@0 808 //
michael@0 809 // For example, say we have a rule that looks like this:
michael@0 810 // first << middle >> last;
michael@0 811 // and input text that looks like this:
michael@0 812 // first one middle two last
michael@0 813 // First we use stripPrefix() to match "first " in both places and
michael@0 814 // strip it off the front, leaving
michael@0 815 // one middle two last
michael@0 816 // Then we use matchToDelimiter() to match " middle " and try to
michael@0 817 // match "one" against a substitution. If it's successful, we now
michael@0 818 // have
michael@0 819 // two last
michael@0 820 // We use matchToDelimiter() a second time to match " last" and
michael@0 821 // try to match "two" against a substitution. If "two" matches
michael@0 822 // the substitution, we have a successful parse.
michael@0 823 //
michael@0 824 // Since it's possible in many cases to find multiple instances
michael@0 825 // of each of these pieces of rule text in the input string,
michael@0 826 // we need to try all the possible combinations of these
michael@0 827 // locations. This prevents us from prematurely declaring a mismatch,
michael@0 828 // and makes sure we match as much input text as we can.
michael@0 829 int highWaterMark = 0;
michael@0 830 double result = 0;
michael@0 831 int start = 0;
michael@0 832 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue);
michael@0 833
michael@0 834 UnicodeString temp;
michael@0 835 do {
michael@0 836 // our partial parse result starts out as this rule's base
michael@0 837 // value. If it finds a successful match, matchToDelimiter()
michael@0 838 // will compose this in some way with what it gets back from
michael@0 839 // the substitution, giving us a new partial parse result
michael@0 840 pp.setIndex(0);
michael@0 841
michael@0 842 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos());
michael@0 843 double partialResult = matchToDelimiter(workText, start, tempBaseValue,
michael@0 844 temp, pp, sub1,
michael@0 845 upperBound);
michael@0 846
michael@0 847 // if we got a successful match (or were trying to match a
michael@0 848 // null substitution), pp is now pointing at the first unmatched
michael@0 849 // character. Take note of that, and try matchToDelimiter()
michael@0 850 // on the input text again
michael@0 851 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) {
michael@0 852 start = pp.getIndex();
michael@0 853
michael@0 854 UnicodeString workText2;
michael@0 855 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex());
michael@0 856 ParsePosition pp2;
michael@0 857
michael@0 858 // the second matchToDelimiter() will compose our previous
michael@0 859 // partial result with whatever it gets back from its
michael@0 860 // substitution if there's a successful match, giving us
michael@0 861 // a real result
michael@0 862 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos());
michael@0 863 partialResult = matchToDelimiter(workText2, 0, partialResult,
michael@0 864 temp, pp2, sub2,
michael@0 865 upperBound);
michael@0 866
michael@0 867 // if we got a successful match on this second
michael@0 868 // matchToDelimiter() call, update the high-water mark
michael@0 869 // and result (if necessary)
michael@0 870 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) {
michael@0 871 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) {
michael@0 872 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex();
michael@0 873 result = partialResult;
michael@0 874 }
michael@0 875 }
michael@0 876 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 877 // restored for ICU4C port
michael@0 878 else {
michael@0 879 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex();
michael@0 880 if (temp> parsePosition.getErrorIndex()) {
michael@0 881 parsePosition.setErrorIndex(temp);
michael@0 882 }
michael@0 883 }
michael@0 884 }
michael@0 885 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 886 // restored for ICU4C port
michael@0 887 else {
michael@0 888 int32_t temp = sub1->getPos() + pp.getErrorIndex();
michael@0 889 if (temp > parsePosition.getErrorIndex()) {
michael@0 890 parsePosition.setErrorIndex(temp);
michael@0 891 }
michael@0 892 }
michael@0 893 // keep trying to match things until the outer matchToDelimiter()
michael@0 894 // call fails to make a match (each time, it picks up where it
michael@0 895 // left off the previous time)
michael@0 896 } while (sub1->getPos() != sub2->getPos()
michael@0 897 && pp.getIndex() > 0
michael@0 898 && pp.getIndex() < workText.length()
michael@0 899 && pp.getIndex() != start);
michael@0 900
michael@0 901 // update the caller's ParsePosition with our high-water mark
michael@0 902 // (i.e., it now points at the first character this function
michael@0 903 // didn't match-- the ParsePosition is therefore unchanged if
michael@0 904 // we didn't match anything)
michael@0 905 parsePosition.setIndex(highWaterMark);
michael@0 906 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 907 // restored for ICU4C port
michael@0 908 if (highWaterMark > 0) {
michael@0 909 parsePosition.setErrorIndex(0);
michael@0 910 }
michael@0 911
michael@0 912 // this is a hack for one unusual condition: Normally, whether this
michael@0 913 // rule belong to a fraction rule set or not is handled by its
michael@0 914 // substitutions. But if that rule HAS NO substitutions, then
michael@0 915 // we have to account for it here. By definition, if the matching
michael@0 916 // rule in a fraction rule set has no substitutions, its numerator
michael@0 917 // is 1, and so the result is the reciprocal of its base value.
michael@0 918 if (isFractionRule &&
michael@0 919 highWaterMark > 0 &&
michael@0 920 sub1->isNullSubstitution()) {
michael@0 921 result = 1 / result;
michael@0 922 }
michael@0 923
michael@0 924 resVal.setDouble(result);
michael@0 925 return TRUE; // ??? do we need to worry if it is a long or a double?
michael@0 926 }
michael@0 927
michael@0 928 /**
michael@0 929 * This function is used by parse() to match the text being parsed
michael@0 930 * against a possible prefix string. This function
michael@0 931 * matches characters from the beginning of the string being parsed
michael@0 932 * to characters from the prospective prefix. If they match, pp is
michael@0 933 * updated to the first character not matched, and the result is
michael@0 934 * the unparsed part of the string. If they don't match, the whole
michael@0 935 * string is returned, and pp is left unchanged.
michael@0 936 * @param text The string being parsed
michael@0 937 * @param prefix The text to match against
michael@0 938 * @param pp On entry, ignored and assumed to be 0. On exit, points
michael@0 939 * to the first unmatched character (assuming the whole prefix matched),
michael@0 940 * or is unchanged (if the whole prefix didn't match).
michael@0 941 * @return If things match, this is the unparsed part of "text";
michael@0 942 * if they didn't match, this is "text".
michael@0 943 */
michael@0 944 void
michael@0 945 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const
michael@0 946 {
michael@0 947 // if the prefix text is empty, dump out without doing anything
michael@0 948 if (prefix.length() != 0) {
michael@0 949 UErrorCode status = U_ZERO_ERROR;
michael@0 950 // use prefixLength() to match the beginning of
michael@0 951 // "text" against "prefix". This function returns the
michael@0 952 // number of characters from "text" that matched (or 0 if
michael@0 953 // we didn't match the whole prefix)
michael@0 954 int32_t pfl = prefixLength(text, prefix, status);
michael@0 955 if (U_FAILURE(status)) { // Memory allocation error.
michael@0 956 return;
michael@0 957 }
michael@0 958 if (pfl != 0) {
michael@0 959 // if we got a successful match, update the parse position
michael@0 960 // and strip the prefix off of "text"
michael@0 961 pp.setIndex(pp.getIndex() + pfl);
michael@0 962 text.remove(0, pfl);
michael@0 963 }
michael@0 964 }
michael@0 965 }
michael@0 966
michael@0 967 /**
michael@0 968 * Used by parse() to match a substitution and any following text.
michael@0 969 * "text" is searched for instances of "delimiter". For each instance
michael@0 970 * of delimiter, the intervening text is tested to see whether it
michael@0 971 * matches the substitution. The longest match wins.
michael@0 972 * @param text The string being parsed
michael@0 973 * @param startPos The position in "text" where we should start looking
michael@0 974 * for "delimiter".
michael@0 975 * @param baseValue A partial parse result (often the rule's base value),
michael@0 976 * which is combined with the result from matching the substitution
michael@0 977 * @param delimiter The string to search "text" for.
michael@0 978 * @param pp Ignored and presumed to be 0 on entry. If there's a match,
michael@0 979 * on exit this will point to the first unmatched character.
michael@0 980 * @param sub If we find "delimiter" in "text", this substitution is used
michael@0 981 * to match the text between the beginning of the string and the
michael@0 982 * position of "delimiter." (If "delimiter" is the empty string, then
michael@0 983 * this function just matches against this substitution and updates
michael@0 984 * everything accordingly.)
michael@0 985 * @param upperBound When matching the substitution, it will only
michael@0 986 * consider rules with base values lower than this value.
michael@0 987 * @return If there's a match, this is the result of composing
michael@0 988 * baseValue with the result of matching the substitution. Otherwise,
michael@0 989 * this is new Long(0). It's never null. If the result is an integer,
michael@0 990 * this will be an instance of Long; otherwise, it's an instance of
michael@0 991 * Double.
michael@0 992 *
michael@0 993 * !!! note {dlf} in point of fact, in the java code the caller always converts
michael@0 994 * the result to a double, so we might as well return one.
michael@0 995 */
michael@0 996 double
michael@0 997 NFRule::matchToDelimiter(const UnicodeString& text,
michael@0 998 int32_t startPos,
michael@0 999 double _baseValue,
michael@0 1000 const UnicodeString& delimiter,
michael@0 1001 ParsePosition& pp,
michael@0 1002 const NFSubstitution* sub,
michael@0 1003 double upperBound) const
michael@0 1004 {
michael@0 1005 UErrorCode status = U_ZERO_ERROR;
michael@0 1006 // if "delimiter" contains real (i.e., non-ignorable) text, search
michael@0 1007 // it for "delimiter" beginning at "start". If that succeeds, then
michael@0 1008 // use "sub"'s doParse() method to match the text before the
michael@0 1009 // instance of "delimiter" we just found.
michael@0 1010 if (!allIgnorable(delimiter, status)) {
michael@0 1011 if (U_FAILURE(status)) { //Memory allocation error.
michael@0 1012 return 0;
michael@0 1013 }
michael@0 1014 ParsePosition tempPP;
michael@0 1015 Formattable result;
michael@0 1016
michael@0 1017 // use findText() to search for "delimiter". It returns a two-
michael@0 1018 // element array: element 0 is the position of the match, and
michael@0 1019 // element 1 is the number of characters that matched
michael@0 1020 // "delimiter".
michael@0 1021 int32_t dLen;
michael@0 1022 int32_t dPos = findText(text, delimiter, startPos, &dLen);
michael@0 1023
michael@0 1024 // if findText() succeeded, isolate the text preceding the
michael@0 1025 // match, and use "sub" to match that text
michael@0 1026 while (dPos >= 0) {
michael@0 1027 UnicodeString subText;
michael@0 1028 subText.setTo(text, 0, dPos);
michael@0 1029 if (subText.length() > 0) {
michael@0 1030 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound,
michael@0 1031 #if UCONFIG_NO_COLLATION
michael@0 1032 FALSE,
michael@0 1033 #else
michael@0 1034 formatter->isLenient(),
michael@0 1035 #endif
michael@0 1036 result);
michael@0 1037
michael@0 1038 // if the substitution could match all the text up to
michael@0 1039 // where we found "delimiter", then this function has
michael@0 1040 // a successful match. Bump the caller's parse position
michael@0 1041 // to point to the first character after the text
michael@0 1042 // that matches "delimiter", and return the result
michael@0 1043 // we got from parsing the substitution.
michael@0 1044 if (success && tempPP.getIndex() == dPos) {
michael@0 1045 pp.setIndex(dPos + dLen);
michael@0 1046 return result.getDouble();
michael@0 1047 }
michael@0 1048 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 1049 // restored for ICU4C port
michael@0 1050 else {
michael@0 1051 if (tempPP.getErrorIndex() > 0) {
michael@0 1052 pp.setErrorIndex(tempPP.getErrorIndex());
michael@0 1053 } else {
michael@0 1054 pp.setErrorIndex(tempPP.getIndex());
michael@0 1055 }
michael@0 1056 }
michael@0 1057 }
michael@0 1058
michael@0 1059 // if we didn't match the substitution, search for another
michael@0 1060 // copy of "delimiter" in "text" and repeat the loop if
michael@0 1061 // we find it
michael@0 1062 tempPP.setIndex(0);
michael@0 1063 dPos = findText(text, delimiter, dPos + dLen, &dLen);
michael@0 1064 }
michael@0 1065 // if we make it here, this was an unsuccessful match, and we
michael@0 1066 // leave pp unchanged and return 0
michael@0 1067 pp.setIndex(0);
michael@0 1068 return 0;
michael@0 1069
michael@0 1070 // if "delimiter" is empty, or consists only of ignorable characters
michael@0 1071 // (i.e., is semantically empty), thwe we obviously can't search
michael@0 1072 // for "delimiter". Instead, just use "sub" to parse as much of
michael@0 1073 // "text" as possible.
michael@0 1074 } else {
michael@0 1075 ParsePosition tempPP;
michael@0 1076 Formattable result;
michael@0 1077
michael@0 1078 // try to match the whole string against the substitution
michael@0 1079 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound,
michael@0 1080 #if UCONFIG_NO_COLLATION
michael@0 1081 FALSE,
michael@0 1082 #else
michael@0 1083 formatter->isLenient(),
michael@0 1084 #endif
michael@0 1085 result);
michael@0 1086 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) {
michael@0 1087 // if there's a successful match (or it's a null
michael@0 1088 // substitution), update pp to point to the first
michael@0 1089 // character we didn't match, and pass the result from
michael@0 1090 // sub.doParse() on through to the caller
michael@0 1091 pp.setIndex(tempPP.getIndex());
michael@0 1092 return result.getDouble();
michael@0 1093 }
michael@0 1094 // commented out because ParsePosition doesn't have error index in 1.1.x
michael@0 1095 // restored for ICU4C port
michael@0 1096 else {
michael@0 1097 pp.setErrorIndex(tempPP.getErrorIndex());
michael@0 1098 }
michael@0 1099
michael@0 1100 // and if we get to here, then nothing matched, so we return
michael@0 1101 // 0 and leave pp alone
michael@0 1102 return 0;
michael@0 1103 }
michael@0 1104 }
michael@0 1105
michael@0 1106 /**
michael@0 1107 * Used by stripPrefix() to match characters. If lenient parse mode
michael@0 1108 * is off, this just calls startsWith(). If lenient parse mode is on,
michael@0 1109 * this function uses CollationElementIterators to match characters in
michael@0 1110 * the strings (only primary-order differences are significant in
michael@0 1111 * determining whether there's a match).
michael@0 1112 * @param str The string being tested
michael@0 1113 * @param prefix The text we're hoping to see at the beginning
michael@0 1114 * of "str"
michael@0 1115 * @return If "prefix" is found at the beginning of "str", this
michael@0 1116 * is the number of characters in "str" that were matched (this
michael@0 1117 * isn't necessarily the same as the length of "prefix" when matching
michael@0 1118 * text with a collator). If there's no match, this is 0.
michael@0 1119 */
michael@0 1120 int32_t
michael@0 1121 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const
michael@0 1122 {
michael@0 1123 // if we're looking for an empty prefix, it obviously matches
michael@0 1124 // zero characters. Just go ahead and return 0.
michael@0 1125 if (prefix.length() == 0) {
michael@0 1126 return 0;
michael@0 1127 }
michael@0 1128
michael@0 1129 #if !UCONFIG_NO_COLLATION
michael@0 1130 // go through all this grief if we're in lenient-parse mode
michael@0 1131 if (formatter->isLenient()) {
michael@0 1132 // get the formatter's collator and use it to create two
michael@0 1133 // collation element iterators, one over the target string
michael@0 1134 // and another over the prefix (right now, we'll throw an
michael@0 1135 // exception if the collator we get back from the formatter
michael@0 1136 // isn't a RuleBasedCollator, because RuleBasedCollator defines
michael@0 1137 // the CollationElementIterator protocol. Hopefully, this
michael@0 1138 // will change someday.)
michael@0 1139 RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator();
michael@0 1140 CollationElementIterator* strIter = collator->createCollationElementIterator(str);
michael@0 1141 CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix);
michael@0 1142 // Check for memory allocation error.
michael@0 1143 if (collator == NULL || strIter == NULL || prefixIter == NULL) {
michael@0 1144 delete collator;
michael@0 1145 delete strIter;
michael@0 1146 delete prefixIter;
michael@0 1147 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1148 return 0;
michael@0 1149 }
michael@0 1150
michael@0 1151 UErrorCode err = U_ZERO_ERROR;
michael@0 1152
michael@0 1153 // The original code was problematic. Consider this match:
michael@0 1154 // prefix = "fifty-"
michael@0 1155 // string = " fifty-7"
michael@0 1156 // The intent is to match string up to the '7', by matching 'fifty-' at position 1
michael@0 1157 // in the string. Unfortunately, we were getting a match, and then computing where
michael@0 1158 // the match terminated by rematching the string. The rematch code was using as an
michael@0 1159 // initial guess the substring of string between 0 and prefix.length. Because of
michael@0 1160 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving
michael@0 1161 // the position before the hyphen in the string. Recursing down, we then parsed the
michael@0 1162 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7).
michael@0 1163 // This was not pretty, especially since the string "fifty-7" parsed just fine.
michael@0 1164 //
michael@0 1165 // We have newer APIs now, so we can use calls on the iterator to determine what we
michael@0 1166 // matched up to. If we terminate because we hit the last element in the string,
michael@0 1167 // our match terminates at this length. If we terminate because we hit the last element
michael@0 1168 // in the target, our match terminates at one before the element iterator position.
michael@0 1169
michael@0 1170 // match collation elements between the strings
michael@0 1171 int32_t oStr = strIter->next(err);
michael@0 1172 int32_t oPrefix = prefixIter->next(err);
michael@0 1173
michael@0 1174 while (oPrefix != CollationElementIterator::NULLORDER) {
michael@0 1175 // skip over ignorable characters in the target string
michael@0 1176 while (CollationElementIterator::primaryOrder(oStr) == 0
michael@0 1177 && oStr != CollationElementIterator::NULLORDER) {
michael@0 1178 oStr = strIter->next(err);
michael@0 1179 }
michael@0 1180
michael@0 1181 // skip over ignorable characters in the prefix
michael@0 1182 while (CollationElementIterator::primaryOrder(oPrefix) == 0
michael@0 1183 && oPrefix != CollationElementIterator::NULLORDER) {
michael@0 1184 oPrefix = prefixIter->next(err);
michael@0 1185 }
michael@0 1186
michael@0 1187 // dlf: move this above following test, if we consume the
michael@0 1188 // entire target, aren't we ok even if the source was also
michael@0 1189 // entirely consumed?
michael@0 1190
michael@0 1191 // if skipping over ignorables brought to the end of
michael@0 1192 // the prefix, we DID match: drop out of the loop
michael@0 1193 if (oPrefix == CollationElementIterator::NULLORDER) {
michael@0 1194 break;
michael@0 1195 }
michael@0 1196
michael@0 1197 // if skipping over ignorables brought us to the end
michael@0 1198 // of the target string, we didn't match and return 0
michael@0 1199 if (oStr == CollationElementIterator::NULLORDER) {
michael@0 1200 delete prefixIter;
michael@0 1201 delete strIter;
michael@0 1202 return 0;
michael@0 1203 }
michael@0 1204
michael@0 1205 // match collation elements from the two strings
michael@0 1206 // (considering only primary differences). If we
michael@0 1207 // get a mismatch, dump out and return 0
michael@0 1208 if (CollationElementIterator::primaryOrder(oStr)
michael@0 1209 != CollationElementIterator::primaryOrder(oPrefix)) {
michael@0 1210 delete prefixIter;
michael@0 1211 delete strIter;
michael@0 1212 return 0;
michael@0 1213
michael@0 1214 // otherwise, advance to the next character in each string
michael@0 1215 // and loop (we drop out of the loop when we exhaust
michael@0 1216 // collation elements in the prefix)
michael@0 1217 } else {
michael@0 1218 oStr = strIter->next(err);
michael@0 1219 oPrefix = prefixIter->next(err);
michael@0 1220 }
michael@0 1221 }
michael@0 1222
michael@0 1223 int32_t result = strIter->getOffset();
michael@0 1224 if (oStr != CollationElementIterator::NULLORDER) {
michael@0 1225 --result; // back over character that we don't want to consume;
michael@0 1226 }
michael@0 1227
michael@0 1228 #ifdef RBNF_DEBUG
michael@0 1229 fprintf(stderr, "prefix length: %d\n", result);
michael@0 1230 #endif
michael@0 1231 delete prefixIter;
michael@0 1232 delete strIter;
michael@0 1233
michael@0 1234 return result;
michael@0 1235 #if 0
michael@0 1236 //----------------------------------------------------------------
michael@0 1237 // JDK 1.2-specific API call
michael@0 1238 // return strIter.getOffset();
michael@0 1239 //----------------------------------------------------------------
michael@0 1240 // JDK 1.1 HACK (take out for 1.2-specific code)
michael@0 1241
michael@0 1242 // if we make it to here, we have a successful match. Now we
michael@0 1243 // have to find out HOW MANY characters from the target string
michael@0 1244 // matched the prefix (there isn't necessarily a one-to-one
michael@0 1245 // mapping between collation elements and characters).
michael@0 1246 // In JDK 1.2, there's a simple getOffset() call we can use.
michael@0 1247 // In JDK 1.1, on the other hand, we have to go through some
michael@0 1248 // ugly contortions. First, use the collator to compare the
michael@0 1249 // same number of characters from the prefix and target string.
michael@0 1250 // If they're equal, we're done.
michael@0 1251 collator->setStrength(Collator::PRIMARY);
michael@0 1252 if (str.length() >= prefix.length()) {
michael@0 1253 UnicodeString temp;
michael@0 1254 temp.setTo(str, 0, prefix.length());
michael@0 1255 if (collator->equals(temp, prefix)) {
michael@0 1256 #ifdef RBNF_DEBUG
michael@0 1257 fprintf(stderr, "returning: %d\n", prefix.length());
michael@0 1258 #endif
michael@0 1259 return prefix.length();
michael@0 1260 }
michael@0 1261 }
michael@0 1262
michael@0 1263 // if they're not equal, then we have to compare successively
michael@0 1264 // larger and larger substrings of the target string until we
michael@0 1265 // get to one that matches the prefix. At that point, we know
michael@0 1266 // how many characters matched the prefix, and we can return.
michael@0 1267 int32_t p = 1;
michael@0 1268 while (p <= str.length()) {
michael@0 1269 UnicodeString temp;
michael@0 1270 temp.setTo(str, 0, p);
michael@0 1271 if (collator->equals(temp, prefix)) {
michael@0 1272 return p;
michael@0 1273 } else {
michael@0 1274 ++p;
michael@0 1275 }
michael@0 1276 }
michael@0 1277
michael@0 1278 // SHOULD NEVER GET HERE!!!
michael@0 1279 return 0;
michael@0 1280 //----------------------------------------------------------------
michael@0 1281 #endif
michael@0 1282
michael@0 1283 // If lenient parsing is turned off, forget all that crap above.
michael@0 1284 // Just use String.startsWith() and be done with it.
michael@0 1285 } else
michael@0 1286 #endif
michael@0 1287 {
michael@0 1288 if (str.startsWith(prefix)) {
michael@0 1289 return prefix.length();
michael@0 1290 } else {
michael@0 1291 return 0;
michael@0 1292 }
michael@0 1293 }
michael@0 1294 }
michael@0 1295
michael@0 1296 /**
michael@0 1297 * Searches a string for another string. If lenient parsing is off,
michael@0 1298 * this just calls indexOf(). If lenient parsing is on, this function
michael@0 1299 * uses CollationElementIterator to match characters, and only
michael@0 1300 * primary-order differences are significant in determining whether
michael@0 1301 * there's a match.
michael@0 1302 * @param str The string to search
michael@0 1303 * @param key The string to search "str" for
michael@0 1304 * @param startingAt The index into "str" where the search is to
michael@0 1305 * begin
michael@0 1306 * @return A two-element array of ints. Element 0 is the position
michael@0 1307 * of the match, or -1 if there was no match. Element 1 is the
michael@0 1308 * number of characters in "str" that matched (which isn't necessarily
michael@0 1309 * the same as the length of "key")
michael@0 1310 */
michael@0 1311 int32_t
michael@0 1312 NFRule::findText(const UnicodeString& str,
michael@0 1313 const UnicodeString& key,
michael@0 1314 int32_t startingAt,
michael@0 1315 int32_t* length) const
michael@0 1316 {
michael@0 1317 #if !UCONFIG_NO_COLLATION
michael@0 1318 // if lenient parsing is turned off, this is easy: just call
michael@0 1319 // String.indexOf() and we're done
michael@0 1320 if (!formatter->isLenient()) {
michael@0 1321 *length = key.length();
michael@0 1322 return str.indexOf(key, startingAt);
michael@0 1323
michael@0 1324 // but if lenient parsing is turned ON, we've got some work
michael@0 1325 // ahead of us
michael@0 1326 } else
michael@0 1327 #endif
michael@0 1328 {
michael@0 1329 //----------------------------------------------------------------
michael@0 1330 // JDK 1.1 HACK (take out of 1.2-specific code)
michael@0 1331
michael@0 1332 // in JDK 1.2, CollationElementIterator provides us with an
michael@0 1333 // API to map between character offsets and collation elements
michael@0 1334 // and we can do this by marching through the string comparing
michael@0 1335 // collation elements. We can't do that in JDK 1.1. Insted,
michael@0 1336 // we have to go through this horrible slow mess:
michael@0 1337 int32_t p = startingAt;
michael@0 1338 int32_t keyLen = 0;
michael@0 1339
michael@0 1340 // basically just isolate smaller and smaller substrings of
michael@0 1341 // the target string (each running to the end of the string,
michael@0 1342 // and with the first one running from startingAt to the end)
michael@0 1343 // and then use prefixLength() to see if the search key is at
michael@0 1344 // the beginning of each substring. This is excruciatingly
michael@0 1345 // slow, but it will locate the key and tell use how long the
michael@0 1346 // matching text was.
michael@0 1347 UnicodeString temp;
michael@0 1348 UErrorCode status = U_ZERO_ERROR;
michael@0 1349 while (p < str.length() && keyLen == 0) {
michael@0 1350 temp.setTo(str, p, str.length() - p);
michael@0 1351 keyLen = prefixLength(temp, key, status);
michael@0 1352 if (U_FAILURE(status)) {
michael@0 1353 break;
michael@0 1354 }
michael@0 1355 if (keyLen != 0) {
michael@0 1356 *length = keyLen;
michael@0 1357 return p;
michael@0 1358 }
michael@0 1359 ++p;
michael@0 1360 }
michael@0 1361 // if we make it to here, we didn't find it. Return -1 for the
michael@0 1362 // location. The length should be ignored, but set it to 0,
michael@0 1363 // which should be "safe"
michael@0 1364 *length = 0;
michael@0 1365 return -1;
michael@0 1366
michael@0 1367 //----------------------------------------------------------------
michael@0 1368 // JDK 1.2 version of this routine
michael@0 1369 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator();
michael@0 1370 //
michael@0 1371 //CollationElementIterator strIter = collator.getCollationElementIterator(str);
michael@0 1372 //CollationElementIterator keyIter = collator.getCollationElementIterator(key);
michael@0 1373 //
michael@0 1374 //int keyStart = -1;
michael@0 1375 //
michael@0 1376 //str.setOffset(startingAt);
michael@0 1377 //
michael@0 1378 //int oStr = strIter.next();
michael@0 1379 //int oKey = keyIter.next();
michael@0 1380 //while (oKey != CollationElementIterator.NULLORDER) {
michael@0 1381 // while (oStr != CollationElementIterator.NULLORDER &&
michael@0 1382 // CollationElementIterator.primaryOrder(oStr) == 0)
michael@0 1383 // oStr = strIter.next();
michael@0 1384 //
michael@0 1385 // while (oKey != CollationElementIterator.NULLORDER &&
michael@0 1386 // CollationElementIterator.primaryOrder(oKey) == 0)
michael@0 1387 // oKey = keyIter.next();
michael@0 1388 //
michael@0 1389 // if (oStr == CollationElementIterator.NULLORDER) {
michael@0 1390 // return new int[] { -1, 0 };
michael@0 1391 // }
michael@0 1392 //
michael@0 1393 // if (oKey == CollationElementIterator.NULLORDER) {
michael@0 1394 // break;
michael@0 1395 // }
michael@0 1396 //
michael@0 1397 // if (CollationElementIterator.primaryOrder(oStr) ==
michael@0 1398 // CollationElementIterator.primaryOrder(oKey)) {
michael@0 1399 // keyStart = strIter.getOffset();
michael@0 1400 // oStr = strIter.next();
michael@0 1401 // oKey = keyIter.next();
michael@0 1402 // } else {
michael@0 1403 // if (keyStart != -1) {
michael@0 1404 // keyStart = -1;
michael@0 1405 // keyIter.reset();
michael@0 1406 // } else {
michael@0 1407 // oStr = strIter.next();
michael@0 1408 // }
michael@0 1409 // }
michael@0 1410 //}
michael@0 1411 //
michael@0 1412 //if (oKey == CollationElementIterator.NULLORDER) {
michael@0 1413 // return new int[] { keyStart, strIter.getOffset() - keyStart };
michael@0 1414 //} else {
michael@0 1415 // return new int[] { -1, 0 };
michael@0 1416 //}
michael@0 1417 }
michael@0 1418 }
michael@0 1419
michael@0 1420 /**
michael@0 1421 * Checks to see whether a string consists entirely of ignorable
michael@0 1422 * characters.
michael@0 1423 * @param str The string to test.
michael@0 1424 * @return true if the string is empty of consists entirely of
michael@0 1425 * characters that the number formatter's collator says are
michael@0 1426 * ignorable at the primary-order level. false otherwise.
michael@0 1427 */
michael@0 1428 UBool
michael@0 1429 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const
michael@0 1430 {
michael@0 1431 // if the string is empty, we can just return true
michael@0 1432 if (str.length() == 0) {
michael@0 1433 return TRUE;
michael@0 1434 }
michael@0 1435
michael@0 1436 #if !UCONFIG_NO_COLLATION
michael@0 1437 // if lenient parsing is turned on, walk through the string with
michael@0 1438 // a collation element iterator and make sure each collation
michael@0 1439 // element is 0 (ignorable) at the primary level
michael@0 1440 if (formatter->isLenient()) {
michael@0 1441 RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator());
michael@0 1442 CollationElementIterator* iter = collator->createCollationElementIterator(str);
michael@0 1443
michael@0 1444 // Memory allocation error check.
michael@0 1445 if (collator == NULL || iter == NULL) {
michael@0 1446 delete collator;
michael@0 1447 delete iter;
michael@0 1448 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1449 return FALSE;
michael@0 1450 }
michael@0 1451
michael@0 1452 UErrorCode err = U_ZERO_ERROR;
michael@0 1453 int32_t o = iter->next(err);
michael@0 1454 while (o != CollationElementIterator::NULLORDER
michael@0 1455 && CollationElementIterator::primaryOrder(o) == 0) {
michael@0 1456 o = iter->next(err);
michael@0 1457 }
michael@0 1458
michael@0 1459 delete iter;
michael@0 1460 return o == CollationElementIterator::NULLORDER;
michael@0 1461 }
michael@0 1462 #endif
michael@0 1463
michael@0 1464 // if lenient parsing is turned off, there is no such thing as
michael@0 1465 // an ignorable character: return true only if the string is empty
michael@0 1466 return FALSE;
michael@0 1467 }
michael@0 1468
michael@0 1469 U_NAMESPACE_END
michael@0 1470
michael@0 1471 /* U_HAVE_RBNF */
michael@0 1472 #endif
michael@0 1473
michael@0 1474

mercurial