Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * Copyright (C) 1997-2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ****************************************************************************** |
michael@0 | 6 | * file name: nfrule.cpp |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * Modification history |
michael@0 | 12 | * Date Name Comments |
michael@0 | 13 | * 10/11/2001 Doug Ported from ICU4J |
michael@0 | 14 | */ |
michael@0 | 15 | |
michael@0 | 16 | #include "nfrule.h" |
michael@0 | 17 | |
michael@0 | 18 | #if U_HAVE_RBNF |
michael@0 | 19 | |
michael@0 | 20 | #include "unicode/rbnf.h" |
michael@0 | 21 | #include "unicode/tblcoll.h" |
michael@0 | 22 | #include "unicode/coleitr.h" |
michael@0 | 23 | #include "unicode/uchar.h" |
michael@0 | 24 | #include "nfrs.h" |
michael@0 | 25 | #include "nfrlist.h" |
michael@0 | 26 | #include "nfsubs.h" |
michael@0 | 27 | #include "patternprops.h" |
michael@0 | 28 | |
michael@0 | 29 | U_NAMESPACE_BEGIN |
michael@0 | 30 | |
michael@0 | 31 | NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) |
michael@0 | 32 | : baseValue((int32_t)0) |
michael@0 | 33 | , radix(0) |
michael@0 | 34 | , exponent(0) |
michael@0 | 35 | , ruleText() |
michael@0 | 36 | , sub1(NULL) |
michael@0 | 37 | , sub2(NULL) |
michael@0 | 38 | , formatter(_rbnf) |
michael@0 | 39 | { |
michael@0 | 40 | } |
michael@0 | 41 | |
michael@0 | 42 | NFRule::~NFRule() |
michael@0 | 43 | { |
michael@0 | 44 | delete sub1; |
michael@0 | 45 | delete sub2; |
michael@0 | 46 | } |
michael@0 | 47 | |
michael@0 | 48 | static const UChar gLeftBracket = 0x005b; |
michael@0 | 49 | static const UChar gRightBracket = 0x005d; |
michael@0 | 50 | static const UChar gColon = 0x003a; |
michael@0 | 51 | static const UChar gZero = 0x0030; |
michael@0 | 52 | static const UChar gNine = 0x0039; |
michael@0 | 53 | static const UChar gSpace = 0x0020; |
michael@0 | 54 | static const UChar gSlash = 0x002f; |
michael@0 | 55 | static const UChar gGreaterThan = 0x003e; |
michael@0 | 56 | static const UChar gLessThan = 0x003c; |
michael@0 | 57 | static const UChar gComma = 0x002c; |
michael@0 | 58 | static const UChar gDot = 0x002e; |
michael@0 | 59 | static const UChar gTick = 0x0027; |
michael@0 | 60 | //static const UChar gMinus = 0x002d; |
michael@0 | 61 | static const UChar gSemicolon = 0x003b; |
michael@0 | 62 | |
michael@0 | 63 | static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ |
michael@0 | 64 | static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ |
michael@0 | 65 | static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ |
michael@0 | 66 | static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ |
michael@0 | 67 | |
michael@0 | 68 | static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ |
michael@0 | 69 | static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ |
michael@0 | 70 | static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ |
michael@0 | 71 | static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ |
michael@0 | 72 | static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ |
michael@0 | 73 | static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ |
michael@0 | 74 | static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ |
michael@0 | 75 | static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ |
michael@0 | 76 | static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ |
michael@0 | 77 | static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ |
michael@0 | 78 | static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ |
michael@0 | 79 | static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ |
michael@0 | 80 | |
michael@0 | 81 | static const UChar * const tokenStrings[] = { |
michael@0 | 82 | gLessLess, gLessPercent, gLessHash, gLessZero, |
michael@0 | 83 | gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, |
michael@0 | 84 | gEqualPercent, gEqualHash, gEqualZero, NULL |
michael@0 | 85 | }; |
michael@0 | 86 | |
michael@0 | 87 | void |
michael@0 | 88 | NFRule::makeRules(UnicodeString& description, |
michael@0 | 89 | const NFRuleSet *ruleSet, |
michael@0 | 90 | const NFRule *predecessor, |
michael@0 | 91 | const RuleBasedNumberFormat *rbnf, |
michael@0 | 92 | NFRuleList& rules, |
michael@0 | 93 | UErrorCode& status) |
michael@0 | 94 | { |
michael@0 | 95 | // we know we're making at least one rule, so go ahead and |
michael@0 | 96 | // new it up and initialize its basevalue and divisor |
michael@0 | 97 | // (this also strips the rule descriptor, if any, off the |
michael@0 | 98 | // descripton string) |
michael@0 | 99 | NFRule* rule1 = new NFRule(rbnf); |
michael@0 | 100 | /* test for NULL */ |
michael@0 | 101 | if (rule1 == 0) { |
michael@0 | 102 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 103 | return; |
michael@0 | 104 | } |
michael@0 | 105 | rule1->parseRuleDescriptor(description, status); |
michael@0 | 106 | |
michael@0 | 107 | // check the description to see whether there's text enclosed |
michael@0 | 108 | // in brackets |
michael@0 | 109 | int32_t brack1 = description.indexOf(gLeftBracket); |
michael@0 | 110 | int32_t brack2 = description.indexOf(gRightBracket); |
michael@0 | 111 | |
michael@0 | 112 | // if the description doesn't contain a matched pair of brackets, |
michael@0 | 113 | // or if it's of a type that doesn't recognize bracketed text, |
michael@0 | 114 | // then leave the description alone, initialize the rule's |
michael@0 | 115 | // rule text and substitutions, and return that rule |
michael@0 | 116 | if (brack1 == -1 || brack2 == -1 || brack1 > brack2 |
michael@0 | 117 | || rule1->getType() == kProperFractionRule |
michael@0 | 118 | || rule1->getType() == kNegativeNumberRule) { |
michael@0 | 119 | rule1->ruleText = description; |
michael@0 | 120 | rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
michael@0 | 121 | rules.add(rule1); |
michael@0 | 122 | } else { |
michael@0 | 123 | // if the description does contain a matched pair of brackets, |
michael@0 | 124 | // then it's really shorthand for two rules (with one exception) |
michael@0 | 125 | NFRule* rule2 = NULL; |
michael@0 | 126 | UnicodeString sbuf; |
michael@0 | 127 | |
michael@0 | 128 | // we'll actually only split the rule into two rules if its |
michael@0 | 129 | // base value is an even multiple of its divisor (or it's one |
michael@0 | 130 | // of the special rules) |
michael@0 | 131 | if ((rule1->baseValue > 0 |
michael@0 | 132 | && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) |
michael@0 | 133 | || rule1->getType() == kImproperFractionRule |
michael@0 | 134 | || rule1->getType() == kMasterRule) { |
michael@0 | 135 | |
michael@0 | 136 | // if it passes that test, new up the second rule. If the |
michael@0 | 137 | // rule set both rules will belong to is a fraction rule |
michael@0 | 138 | // set, they both have the same base value; otherwise, |
michael@0 | 139 | // increment the original rule's base value ("rule1" actually |
michael@0 | 140 | // goes SECOND in the rule set's rule list) |
michael@0 | 141 | rule2 = new NFRule(rbnf); |
michael@0 | 142 | /* test for NULL */ |
michael@0 | 143 | if (rule2 == 0) { |
michael@0 | 144 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 145 | return; |
michael@0 | 146 | } |
michael@0 | 147 | if (rule1->baseValue >= 0) { |
michael@0 | 148 | rule2->baseValue = rule1->baseValue; |
michael@0 | 149 | if (!ruleSet->isFractionRuleSet()) { |
michael@0 | 150 | ++rule1->baseValue; |
michael@0 | 151 | } |
michael@0 | 152 | } |
michael@0 | 153 | |
michael@0 | 154 | // if the description began with "x.x" and contains bracketed |
michael@0 | 155 | // text, it describes both the improper fraction rule and |
michael@0 | 156 | // the proper fraction rule |
michael@0 | 157 | else if (rule1->getType() == kImproperFractionRule) { |
michael@0 | 158 | rule2->setType(kProperFractionRule); |
michael@0 | 159 | } |
michael@0 | 160 | |
michael@0 | 161 | // if the description began with "x.0" and contains bracketed |
michael@0 | 162 | // text, it describes both the master rule and the |
michael@0 | 163 | // improper fraction rule |
michael@0 | 164 | else if (rule1->getType() == kMasterRule) { |
michael@0 | 165 | rule2->baseValue = rule1->baseValue; |
michael@0 | 166 | rule1->setType(kImproperFractionRule); |
michael@0 | 167 | } |
michael@0 | 168 | |
michael@0 | 169 | // both rules have the same radix and exponent (i.e., the |
michael@0 | 170 | // same divisor) |
michael@0 | 171 | rule2->radix = rule1->radix; |
michael@0 | 172 | rule2->exponent = rule1->exponent; |
michael@0 | 173 | |
michael@0 | 174 | // rule2's rule text omits the stuff in brackets: initalize |
michael@0 | 175 | // its rule text and substitutions accordingly |
michael@0 | 176 | sbuf.append(description, 0, brack1); |
michael@0 | 177 | if (brack2 + 1 < description.length()) { |
michael@0 | 178 | sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
michael@0 | 179 | } |
michael@0 | 180 | rule2->ruleText.setTo(sbuf); |
michael@0 | 181 | rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
michael@0 | 182 | } |
michael@0 | 183 | |
michael@0 | 184 | // rule1's text includes the text in the brackets but omits |
michael@0 | 185 | // the brackets themselves: initialize _its_ rule text and |
michael@0 | 186 | // substitutions accordingly |
michael@0 | 187 | sbuf.setTo(description, 0, brack1); |
michael@0 | 188 | sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); |
michael@0 | 189 | if (brack2 + 1 < description.length()) { |
michael@0 | 190 | sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
michael@0 | 191 | } |
michael@0 | 192 | rule1->ruleText.setTo(sbuf); |
michael@0 | 193 | rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
michael@0 | 194 | |
michael@0 | 195 | // if we only have one rule, return it; if we have two, return |
michael@0 | 196 | // a two-element array containing them (notice that rule2 goes |
michael@0 | 197 | // BEFORE rule1 in the list: in all cases, rule2 OMITS the |
michael@0 | 198 | // material in the brackets and rule1 INCLUDES the material |
michael@0 | 199 | // in the brackets) |
michael@0 | 200 | if (rule2 != NULL) { |
michael@0 | 201 | rules.add(rule2); |
michael@0 | 202 | } |
michael@0 | 203 | rules.add(rule1); |
michael@0 | 204 | } |
michael@0 | 205 | } |
michael@0 | 206 | |
michael@0 | 207 | /** |
michael@0 | 208 | * This function parses the rule's rule descriptor (i.e., the base |
michael@0 | 209 | * value and/or other tokens that precede the rule's rule text |
michael@0 | 210 | * in the description) and sets the rule's base value, radix, and |
michael@0 | 211 | * exponent according to the descriptor. (If the description doesn't |
michael@0 | 212 | * include a rule descriptor, then this function sets everything to |
michael@0 | 213 | * default values and the rule set sets the rule's real base value). |
michael@0 | 214 | * @param description The rule's description |
michael@0 | 215 | * @return If "description" included a rule descriptor, this is |
michael@0 | 216 | * "description" with the descriptor and any trailing whitespace |
michael@0 | 217 | * stripped off. Otherwise; it's "descriptor" unchangd. |
michael@0 | 218 | */ |
michael@0 | 219 | void |
michael@0 | 220 | NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) |
michael@0 | 221 | { |
michael@0 | 222 | // the description consists of a rule descriptor and a rule body, |
michael@0 | 223 | // separated by a colon. The rule descriptor is optional. If |
michael@0 | 224 | // it's omitted, just set the base value to 0. |
michael@0 | 225 | int32_t p = description.indexOf(gColon); |
michael@0 | 226 | if (p == -1) { |
michael@0 | 227 | setBaseValue((int32_t)0, status); |
michael@0 | 228 | } else { |
michael@0 | 229 | // copy the descriptor out into its own string and strip it, |
michael@0 | 230 | // along with any trailing whitespace, out of the original |
michael@0 | 231 | // description |
michael@0 | 232 | UnicodeString descriptor; |
michael@0 | 233 | descriptor.setTo(description, 0, p); |
michael@0 | 234 | |
michael@0 | 235 | ++p; |
michael@0 | 236 | while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { |
michael@0 | 237 | ++p; |
michael@0 | 238 | } |
michael@0 | 239 | description.removeBetween(0, p); |
michael@0 | 240 | |
michael@0 | 241 | // check first to see if the rule descriptor matches the token |
michael@0 | 242 | // for one of the special rules. If it does, set the base |
michael@0 | 243 | // value to the correct identfier value |
michael@0 | 244 | if (0 == descriptor.compare(gMinusX, 2)) { |
michael@0 | 245 | setType(kNegativeNumberRule); |
michael@0 | 246 | } |
michael@0 | 247 | else if (0 == descriptor.compare(gXDotX, 3)) { |
michael@0 | 248 | setType(kImproperFractionRule); |
michael@0 | 249 | } |
michael@0 | 250 | else if (0 == descriptor.compare(gZeroDotX, 3)) { |
michael@0 | 251 | setType(kProperFractionRule); |
michael@0 | 252 | } |
michael@0 | 253 | else if (0 == descriptor.compare(gXDotZero, 3)) { |
michael@0 | 254 | setType(kMasterRule); |
michael@0 | 255 | } |
michael@0 | 256 | |
michael@0 | 257 | // if the rule descriptor begins with a digit, it's a descriptor |
michael@0 | 258 | // for a normal rule |
michael@0 | 259 | // since we don't have Long.parseLong, and this isn't much work anyway, |
michael@0 | 260 | // just build up the value as we encounter the digits. |
michael@0 | 261 | else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { |
michael@0 | 262 | int64_t val = 0; |
michael@0 | 263 | p = 0; |
michael@0 | 264 | UChar c = gSpace; |
michael@0 | 265 | |
michael@0 | 266 | // begin parsing the descriptor: copy digits |
michael@0 | 267 | // into "tempValue", skip periods, commas, and spaces, |
michael@0 | 268 | // stop on a slash or > sign (or at the end of the string), |
michael@0 | 269 | // and throw an exception on any other character |
michael@0 | 270 | int64_t ll_10 = 10; |
michael@0 | 271 | while (p < descriptor.length()) { |
michael@0 | 272 | c = descriptor.charAt(p); |
michael@0 | 273 | if (c >= gZero && c <= gNine) { |
michael@0 | 274 | val = val * ll_10 + (int32_t)(c - gZero); |
michael@0 | 275 | } |
michael@0 | 276 | else if (c == gSlash || c == gGreaterThan) { |
michael@0 | 277 | break; |
michael@0 | 278 | } |
michael@0 | 279 | else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
michael@0 | 280 | } |
michael@0 | 281 | else { |
michael@0 | 282 | // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
michael@0 | 283 | status = U_PARSE_ERROR; |
michael@0 | 284 | return; |
michael@0 | 285 | } |
michael@0 | 286 | ++p; |
michael@0 | 287 | } |
michael@0 | 288 | |
michael@0 | 289 | // we have the base value, so set it |
michael@0 | 290 | setBaseValue(val, status); |
michael@0 | 291 | |
michael@0 | 292 | // if we stopped the previous loop on a slash, we're |
michael@0 | 293 | // now parsing the rule's radix. Again, accumulate digits |
michael@0 | 294 | // in tempValue, skip punctuation, stop on a > mark, and |
michael@0 | 295 | // throw an exception on anything else |
michael@0 | 296 | if (c == gSlash) { |
michael@0 | 297 | val = 0; |
michael@0 | 298 | ++p; |
michael@0 | 299 | int64_t ll_10 = 10; |
michael@0 | 300 | while (p < descriptor.length()) { |
michael@0 | 301 | c = descriptor.charAt(p); |
michael@0 | 302 | if (c >= gZero && c <= gNine) { |
michael@0 | 303 | val = val * ll_10 + (int32_t)(c - gZero); |
michael@0 | 304 | } |
michael@0 | 305 | else if (c == gGreaterThan) { |
michael@0 | 306 | break; |
michael@0 | 307 | } |
michael@0 | 308 | else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
michael@0 | 309 | } |
michael@0 | 310 | else { |
michael@0 | 311 | // throw new IllegalArgumentException("Illegal character is rule descriptor"); |
michael@0 | 312 | status = U_PARSE_ERROR; |
michael@0 | 313 | return; |
michael@0 | 314 | } |
michael@0 | 315 | ++p; |
michael@0 | 316 | } |
michael@0 | 317 | |
michael@0 | 318 | // tempValue now contain's the rule's radix. Set it |
michael@0 | 319 | // accordingly, and recalculate the rule's exponent |
michael@0 | 320 | radix = (int32_t)val; |
michael@0 | 321 | if (radix == 0) { |
michael@0 | 322 | // throw new IllegalArgumentException("Rule can't have radix of 0"); |
michael@0 | 323 | status = U_PARSE_ERROR; |
michael@0 | 324 | } |
michael@0 | 325 | |
michael@0 | 326 | exponent = expectedExponent(); |
michael@0 | 327 | } |
michael@0 | 328 | |
michael@0 | 329 | // if we stopped the previous loop on a > sign, then continue |
michael@0 | 330 | // for as long as we still see > signs. For each one, |
michael@0 | 331 | // decrement the exponent (unless the exponent is already 0). |
michael@0 | 332 | // If we see another character before reaching the end of |
michael@0 | 333 | // the descriptor, that's also a syntax error. |
michael@0 | 334 | if (c == gGreaterThan) { |
michael@0 | 335 | while (p < descriptor.length()) { |
michael@0 | 336 | c = descriptor.charAt(p); |
michael@0 | 337 | if (c == gGreaterThan && exponent > 0) { |
michael@0 | 338 | --exponent; |
michael@0 | 339 | } else { |
michael@0 | 340 | // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
michael@0 | 341 | status = U_PARSE_ERROR; |
michael@0 | 342 | return; |
michael@0 | 343 | } |
michael@0 | 344 | ++p; |
michael@0 | 345 | } |
michael@0 | 346 | } |
michael@0 | 347 | } |
michael@0 | 348 | } |
michael@0 | 349 | |
michael@0 | 350 | // finally, if the rule body begins with an apostrophe, strip it off |
michael@0 | 351 | // (this is generally used to put whitespace at the beginning of |
michael@0 | 352 | // a rule's rule text) |
michael@0 | 353 | if (description.length() > 0 && description.charAt(0) == gTick) { |
michael@0 | 354 | description.removeBetween(0, 1); |
michael@0 | 355 | } |
michael@0 | 356 | |
michael@0 | 357 | // return the description with all the stuff we've just waded through |
michael@0 | 358 | // stripped off the front. It now contains just the rule body. |
michael@0 | 359 | // return description; |
michael@0 | 360 | } |
michael@0 | 361 | |
michael@0 | 362 | /** |
michael@0 | 363 | * Searches the rule's rule text for the substitution tokens, |
michael@0 | 364 | * creates the substitutions, and removes the substitution tokens |
michael@0 | 365 | * from the rule's rule text. |
michael@0 | 366 | * @param owner The rule set containing this rule |
michael@0 | 367 | * @param predecessor The rule preseding this one in "owners" rule list |
michael@0 | 368 | * @param ownersOwner The RuleBasedFormat that owns this rule |
michael@0 | 369 | */ |
michael@0 | 370 | void |
michael@0 | 371 | NFRule::extractSubstitutions(const NFRuleSet* ruleSet, |
michael@0 | 372 | const NFRule* predecessor, |
michael@0 | 373 | const RuleBasedNumberFormat* rbnf, |
michael@0 | 374 | UErrorCode& status) |
michael@0 | 375 | { |
michael@0 | 376 | if (U_SUCCESS(status)) { |
michael@0 | 377 | sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
michael@0 | 378 | sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
michael@0 | 379 | } |
michael@0 | 380 | } |
michael@0 | 381 | |
michael@0 | 382 | /** |
michael@0 | 383 | * Searches the rule's rule text for the first substitution token, |
michael@0 | 384 | * creates a substitution based on it, and removes the token from |
michael@0 | 385 | * the rule's rule text. |
michael@0 | 386 | * @param owner The rule set containing this rule |
michael@0 | 387 | * @param predecessor The rule preceding this one in the rule set's |
michael@0 | 388 | * rule list |
michael@0 | 389 | * @param ownersOwner The RuleBasedNumberFormat that owns this rule |
michael@0 | 390 | * @return The newly-created substitution. This is never null; if |
michael@0 | 391 | * the rule text doesn't contain any substitution tokens, this will |
michael@0 | 392 | * be a NullSubstitution. |
michael@0 | 393 | */ |
michael@0 | 394 | NFSubstitution * |
michael@0 | 395 | NFRule::extractSubstitution(const NFRuleSet* ruleSet, |
michael@0 | 396 | const NFRule* predecessor, |
michael@0 | 397 | const RuleBasedNumberFormat* rbnf, |
michael@0 | 398 | UErrorCode& status) |
michael@0 | 399 | { |
michael@0 | 400 | NFSubstitution* result = NULL; |
michael@0 | 401 | |
michael@0 | 402 | // search the rule's rule text for the first two characters of |
michael@0 | 403 | // a substitution token |
michael@0 | 404 | int32_t subStart = indexOfAny(tokenStrings); |
michael@0 | 405 | int32_t subEnd = subStart; |
michael@0 | 406 | |
michael@0 | 407 | // if we didn't find one, create a null substitution positioned |
michael@0 | 408 | // at the end of the rule text |
michael@0 | 409 | if (subStart == -1) { |
michael@0 | 410 | return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
michael@0 | 411 | ruleSet, rbnf, UnicodeString(), status); |
michael@0 | 412 | } |
michael@0 | 413 | |
michael@0 | 414 | // special-case the ">>>" token, since searching for the > at the |
michael@0 | 415 | // end will actually find the > in the middle |
michael@0 | 416 | if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { |
michael@0 | 417 | subEnd = subStart + 2; |
michael@0 | 418 | |
michael@0 | 419 | // otherwise the substitution token ends with the same character |
michael@0 | 420 | // it began with |
michael@0 | 421 | } else { |
michael@0 | 422 | UChar c = ruleText.charAt(subStart); |
michael@0 | 423 | subEnd = ruleText.indexOf(c, subStart + 1); |
michael@0 | 424 | // special case for '<%foo<<' |
michael@0 | 425 | if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { |
michael@0 | 426 | // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle |
michael@0 | 427 | // occurs because of the juxtaposition of two different rules. The check for '<' is a hack |
michael@0 | 428 | // to get around this. Having the duplicate at the front would cause problems with |
michael@0 | 429 | // rules like "<<%" to format, say, percents... |
michael@0 | 430 | ++subEnd; |
michael@0 | 431 | } |
michael@0 | 432 | } |
michael@0 | 433 | |
michael@0 | 434 | // if we don't find the end of the token (i.e., if we're on a single, |
michael@0 | 435 | // unmatched token character), create a null substitution positioned |
michael@0 | 436 | // at the end of the rule |
michael@0 | 437 | if (subEnd == -1) { |
michael@0 | 438 | return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
michael@0 | 439 | ruleSet, rbnf, UnicodeString(), status); |
michael@0 | 440 | } |
michael@0 | 441 | |
michael@0 | 442 | // if we get here, we have a real substitution token (or at least |
michael@0 | 443 | // some text bounded by substitution token characters). Use |
michael@0 | 444 | // makeSubstitution() to create the right kind of substitution |
michael@0 | 445 | UnicodeString subToken; |
michael@0 | 446 | subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); |
michael@0 | 447 | result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, |
michael@0 | 448 | rbnf, subToken, status); |
michael@0 | 449 | |
michael@0 | 450 | // remove the substitution from the rule text |
michael@0 | 451 | ruleText.removeBetween(subStart, subEnd+1); |
michael@0 | 452 | |
michael@0 | 453 | return result; |
michael@0 | 454 | } |
michael@0 | 455 | |
michael@0 | 456 | /** |
michael@0 | 457 | * Sets the rule's base value, and causes the radix and exponent |
michael@0 | 458 | * to be recalculated. This is used during construction when we |
michael@0 | 459 | * don't know the rule's base value until after it's been |
michael@0 | 460 | * constructed. It should be used at any other time. |
michael@0 | 461 | * @param The new base value for the rule. |
michael@0 | 462 | */ |
michael@0 | 463 | void |
michael@0 | 464 | NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) |
michael@0 | 465 | { |
michael@0 | 466 | // set the base value |
michael@0 | 467 | baseValue = newBaseValue; |
michael@0 | 468 | |
michael@0 | 469 | // if this isn't a special rule, recalculate the radix and exponent |
michael@0 | 470 | // (the radix always defaults to 10; if it's supposed to be something |
michael@0 | 471 | // else, it's cleaned up by the caller and the exponent is |
michael@0 | 472 | // recalculated again-- the only function that does this is |
michael@0 | 473 | // NFRule.parseRuleDescriptor() ) |
michael@0 | 474 | if (baseValue >= 1) { |
michael@0 | 475 | radix = 10; |
michael@0 | 476 | exponent = expectedExponent(); |
michael@0 | 477 | |
michael@0 | 478 | // this function gets called on a fully-constructed rule whose |
michael@0 | 479 | // description didn't specify a base value. This means it |
michael@0 | 480 | // has substitutions, and some substitutions hold on to copies |
michael@0 | 481 | // of the rule's divisor. Fix their copies of the divisor. |
michael@0 | 482 | if (sub1 != NULL) { |
michael@0 | 483 | sub1->setDivisor(radix, exponent, status); |
michael@0 | 484 | } |
michael@0 | 485 | if (sub2 != NULL) { |
michael@0 | 486 | sub2->setDivisor(radix, exponent, status); |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | // if this is a special rule, its radix and exponent are basically |
michael@0 | 490 | // ignored. Set them to "safe" default values |
michael@0 | 491 | } else { |
michael@0 | 492 | radix = 10; |
michael@0 | 493 | exponent = 0; |
michael@0 | 494 | } |
michael@0 | 495 | } |
michael@0 | 496 | |
michael@0 | 497 | /** |
michael@0 | 498 | * This calculates the rule's exponent based on its radix and base |
michael@0 | 499 | * value. This will be the highest power the radix can be raised to |
michael@0 | 500 | * and still produce a result less than or equal to the base value. |
michael@0 | 501 | */ |
michael@0 | 502 | int16_t |
michael@0 | 503 | NFRule::expectedExponent() const |
michael@0 | 504 | { |
michael@0 | 505 | // since the log of 0, or the log base 0 of something, causes an |
michael@0 | 506 | // error, declare the exponent in these cases to be 0 (we also |
michael@0 | 507 | // deal with the special-rule identifiers here) |
michael@0 | 508 | if (radix == 0 || baseValue < 1) { |
michael@0 | 509 | return 0; |
michael@0 | 510 | } |
michael@0 | 511 | |
michael@0 | 512 | // we get rounding error in some cases-- for example, log 1000 / log 10 |
michael@0 | 513 | // gives us 1.9999999996 instead of 2. The extra logic here is to take |
michael@0 | 514 | // that into account |
michael@0 | 515 | int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); |
michael@0 | 516 | int64_t temp = util64_pow(radix, tempResult + 1); |
michael@0 | 517 | if (temp <= baseValue) { |
michael@0 | 518 | tempResult += 1; |
michael@0 | 519 | } |
michael@0 | 520 | return tempResult; |
michael@0 | 521 | } |
michael@0 | 522 | |
michael@0 | 523 | /** |
michael@0 | 524 | * Searches the rule's rule text for any of the specified strings. |
michael@0 | 525 | * @param strings An array of strings to search the rule's rule |
michael@0 | 526 | * text for |
michael@0 | 527 | * @return The index of the first match in the rule's rule text |
michael@0 | 528 | * (i.e., the first substring in the rule's rule text that matches |
michael@0 | 529 | * _any_ of the strings in "strings"). If none of the strings in |
michael@0 | 530 | * "strings" is found in the rule's rule text, returns -1. |
michael@0 | 531 | */ |
michael@0 | 532 | int32_t |
michael@0 | 533 | NFRule::indexOfAny(const UChar* const strings[]) const |
michael@0 | 534 | { |
michael@0 | 535 | int result = -1; |
michael@0 | 536 | for (int i = 0; strings[i]; i++) { |
michael@0 | 537 | int32_t pos = ruleText.indexOf(*strings[i]); |
michael@0 | 538 | if (pos != -1 && (result == -1 || pos < result)) { |
michael@0 | 539 | result = pos; |
michael@0 | 540 | } |
michael@0 | 541 | } |
michael@0 | 542 | return result; |
michael@0 | 543 | } |
michael@0 | 544 | |
michael@0 | 545 | //----------------------------------------------------------------------- |
michael@0 | 546 | // boilerplate |
michael@0 | 547 | //----------------------------------------------------------------------- |
michael@0 | 548 | |
michael@0 | 549 | /** |
michael@0 | 550 | * Tests two rules for equality. |
michael@0 | 551 | * @param that The rule to compare this one against |
michael@0 | 552 | * @return True is the two rules are functionally equivalent |
michael@0 | 553 | */ |
michael@0 | 554 | UBool |
michael@0 | 555 | NFRule::operator==(const NFRule& rhs) const |
michael@0 | 556 | { |
michael@0 | 557 | return baseValue == rhs.baseValue |
michael@0 | 558 | && radix == rhs.radix |
michael@0 | 559 | && exponent == rhs.exponent |
michael@0 | 560 | && ruleText == rhs.ruleText |
michael@0 | 561 | && *sub1 == *rhs.sub1 |
michael@0 | 562 | && *sub2 == *rhs.sub2; |
michael@0 | 563 | } |
michael@0 | 564 | |
michael@0 | 565 | /** |
michael@0 | 566 | * Returns a textual representation of the rule. This won't |
michael@0 | 567 | * necessarily be the same as the description that this rule |
michael@0 | 568 | * was created with, but it will produce the same result. |
michael@0 | 569 | * @return A textual description of the rule |
michael@0 | 570 | */ |
michael@0 | 571 | static void util_append64(UnicodeString& result, int64_t n) |
michael@0 | 572 | { |
michael@0 | 573 | UChar buffer[256]; |
michael@0 | 574 | int32_t len = util64_tou(n, buffer, sizeof(buffer)); |
michael@0 | 575 | UnicodeString temp(buffer, len); |
michael@0 | 576 | result.append(temp); |
michael@0 | 577 | } |
michael@0 | 578 | |
michael@0 | 579 | void |
michael@0 | 580 | NFRule::_appendRuleText(UnicodeString& result) const |
michael@0 | 581 | { |
michael@0 | 582 | switch (getType()) { |
michael@0 | 583 | case kNegativeNumberRule: result.append(gMinusX, 2); break; |
michael@0 | 584 | case kImproperFractionRule: result.append(gXDotX, 3); break; |
michael@0 | 585 | case kProperFractionRule: result.append(gZeroDotX, 3); break; |
michael@0 | 586 | case kMasterRule: result.append(gXDotZero, 3); break; |
michael@0 | 587 | default: |
michael@0 | 588 | // for a normal rule, write out its base value, and if the radix is |
michael@0 | 589 | // something other than 10, write out the radix (with the preceding |
michael@0 | 590 | // slash, of course). Then calculate the expected exponent and if |
michael@0 | 591 | // if isn't the same as the actual exponent, write an appropriate |
michael@0 | 592 | // number of > signs. Finally, terminate the whole thing with |
michael@0 | 593 | // a colon. |
michael@0 | 594 | util_append64(result, baseValue); |
michael@0 | 595 | if (radix != 10) { |
michael@0 | 596 | result.append(gSlash); |
michael@0 | 597 | util_append64(result, radix); |
michael@0 | 598 | } |
michael@0 | 599 | int numCarets = expectedExponent() - exponent; |
michael@0 | 600 | for (int i = 0; i < numCarets; i++) { |
michael@0 | 601 | result.append(gGreaterThan); |
michael@0 | 602 | } |
michael@0 | 603 | break; |
michael@0 | 604 | } |
michael@0 | 605 | result.append(gColon); |
michael@0 | 606 | result.append(gSpace); |
michael@0 | 607 | |
michael@0 | 608 | // if the rule text begins with a space, write an apostrophe |
michael@0 | 609 | // (whitespace after the rule descriptor is ignored; the |
michael@0 | 610 | // apostrophe is used to make the whitespace significant) |
michael@0 | 611 | if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) { |
michael@0 | 612 | result.append(gTick); |
michael@0 | 613 | } |
michael@0 | 614 | |
michael@0 | 615 | // now, write the rule's rule text, inserting appropriate |
michael@0 | 616 | // substitution tokens in the appropriate places |
michael@0 | 617 | UnicodeString ruleTextCopy; |
michael@0 | 618 | ruleTextCopy.setTo(ruleText); |
michael@0 | 619 | |
michael@0 | 620 | UnicodeString temp; |
michael@0 | 621 | sub2->toString(temp); |
michael@0 | 622 | ruleTextCopy.insert(sub2->getPos(), temp); |
michael@0 | 623 | sub1->toString(temp); |
michael@0 | 624 | ruleTextCopy.insert(sub1->getPos(), temp); |
michael@0 | 625 | |
michael@0 | 626 | result.append(ruleTextCopy); |
michael@0 | 627 | |
michael@0 | 628 | // and finally, top the whole thing off with a semicolon and |
michael@0 | 629 | // return the result |
michael@0 | 630 | result.append(gSemicolon); |
michael@0 | 631 | } |
michael@0 | 632 | |
michael@0 | 633 | //----------------------------------------------------------------------- |
michael@0 | 634 | // formatting |
michael@0 | 635 | //----------------------------------------------------------------------- |
michael@0 | 636 | |
michael@0 | 637 | /** |
michael@0 | 638 | * Formats the number, and inserts the resulting text into |
michael@0 | 639 | * toInsertInto. |
michael@0 | 640 | * @param number The number being formatted |
michael@0 | 641 | * @param toInsertInto The string where the resultant text should |
michael@0 | 642 | * be inserted |
michael@0 | 643 | * @param pos The position in toInsertInto where the resultant text |
michael@0 | 644 | * should be inserted |
michael@0 | 645 | */ |
michael@0 | 646 | void |
michael@0 | 647 | NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const |
michael@0 | 648 | { |
michael@0 | 649 | // first, insert the rule's rule text into toInsertInto at the |
michael@0 | 650 | // specified position, then insert the results of the substitutions |
michael@0 | 651 | // into the right places in toInsertInto (notice we do the |
michael@0 | 652 | // substitutions in reverse order so that the offsets don't get |
michael@0 | 653 | // messed up) |
michael@0 | 654 | toInsertInto.insert(pos, ruleText); |
michael@0 | 655 | sub2->doSubstitution(number, toInsertInto, pos); |
michael@0 | 656 | sub1->doSubstitution(number, toInsertInto, pos); |
michael@0 | 657 | } |
michael@0 | 658 | |
michael@0 | 659 | /** |
michael@0 | 660 | * Formats the number, and inserts the resulting text into |
michael@0 | 661 | * toInsertInto. |
michael@0 | 662 | * @param number The number being formatted |
michael@0 | 663 | * @param toInsertInto The string where the resultant text should |
michael@0 | 664 | * be inserted |
michael@0 | 665 | * @param pos The position in toInsertInto where the resultant text |
michael@0 | 666 | * should be inserted |
michael@0 | 667 | */ |
michael@0 | 668 | void |
michael@0 | 669 | NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const |
michael@0 | 670 | { |
michael@0 | 671 | // first, insert the rule's rule text into toInsertInto at the |
michael@0 | 672 | // specified position, then insert the results of the substitutions |
michael@0 | 673 | // into the right places in toInsertInto |
michael@0 | 674 | // [again, we have two copies of this routine that do the same thing |
michael@0 | 675 | // so that we don't sacrifice precision in a long by casting it |
michael@0 | 676 | // to a double] |
michael@0 | 677 | toInsertInto.insert(pos, ruleText); |
michael@0 | 678 | sub2->doSubstitution(number, toInsertInto, pos); |
michael@0 | 679 | sub1->doSubstitution(number, toInsertInto, pos); |
michael@0 | 680 | } |
michael@0 | 681 | |
michael@0 | 682 | /** |
michael@0 | 683 | * Used by the owning rule set to determine whether to invoke the |
michael@0 | 684 | * rollback rule (i.e., whether this rule or the one that precedes |
michael@0 | 685 | * it in the rule set's list should be used to format the number) |
michael@0 | 686 | * @param The number being formatted |
michael@0 | 687 | * @return True if the rule set should use the rule that precedes |
michael@0 | 688 | * this one in its list; false if it should use this rule |
michael@0 | 689 | */ |
michael@0 | 690 | UBool |
michael@0 | 691 | NFRule::shouldRollBack(double number) const |
michael@0 | 692 | { |
michael@0 | 693 | // we roll back if the rule contains a modulus substitution, |
michael@0 | 694 | // the number being formatted is an even multiple of the rule's |
michael@0 | 695 | // divisor, and the rule's base value is NOT an even multiple |
michael@0 | 696 | // of its divisor |
michael@0 | 697 | // In other words, if the original description had |
michael@0 | 698 | // 100: << hundred[ >>]; |
michael@0 | 699 | // that expands into |
michael@0 | 700 | // 100: << hundred; |
michael@0 | 701 | // 101: << hundred >>; |
michael@0 | 702 | // internally. But when we're formatting 200, if we use the rule |
michael@0 | 703 | // at 101, which would normally apply, we get "two hundred zero". |
michael@0 | 704 | // To prevent this, we roll back and use the rule at 100 instead. |
michael@0 | 705 | // This is the logic that makes this happen: the rule at 101 has |
michael@0 | 706 | // a modulus substitution, its base value isn't an even multiple |
michael@0 | 707 | // of 100, and the value we're trying to format _is_ an even |
michael@0 | 708 | // multiple of 100. This is called the "rollback rule." |
michael@0 | 709 | if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { |
michael@0 | 710 | int64_t re = util64_pow(radix, exponent); |
michael@0 | 711 | return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; |
michael@0 | 712 | } |
michael@0 | 713 | return FALSE; |
michael@0 | 714 | } |
michael@0 | 715 | |
michael@0 | 716 | //----------------------------------------------------------------------- |
michael@0 | 717 | // parsing |
michael@0 | 718 | //----------------------------------------------------------------------- |
michael@0 | 719 | |
/**
 * Attempts to parse the string with this rule.
 * @param text The string being parsed
 * @param parsePosition On entry, the value is ignored and assumed to
 * be 0.  On exit, this has been updated with the position of the first
 * character not consumed by matching the text against this rule
 * (if this rule doesn't match the text at all, the parse position
 * is left unchanged (presumably at 0) and resVal is set to 0).
 * @param isFractionRule True if this rule is contained within a
 * fraction rule set.  This is only used if the rule has no
 * substitutions.
 * @param upperBound Passed through to the substitutions when they
 * parse their part of the text (see matchToDelimiter()).
 * @param resVal Receives the parse result.  If this rule matched the
 * text, this is the rule's base value combined appropriately with the
 * results of parsing the substitutions.  If nothing matched, this is 0
 * and the parse position is left unchanged.  The value is stored as a
 * long 0 when the text before the first substitution fails to match,
 * and as a double in every other case.
 * @return Always TRUE; a failed match is reported through resVal and
 * parsePosition, not through the return value.
 */
#ifdef RBNF_DEBUG
#include <stdio.h>

/**
 * Debug helper: writes "us" to "f", converted to the default codepage.
 * Nothing is written if the conversion buffer cannot be allocated.
 * @param f The stream to write to
 * @param us The string to dump
 */
static void dumpUS(FILE* f, const UnicodeString& us) {
    const int32_t len = us.length();

    // One UChar can convert to more than one byte, so the buffer must be
    // sized from the converted length, not from us.length().  Calling
    // extract() with a zero capacity only computes that length.
    const int32_t needed = us.extract(0, len, (char*)NULL, (uint32_t)0);
    if (needed < 0) {
        return;
    }

    char* buf = (char*)uprv_malloc(((size_t)needed + 1) * sizeof(char));
    if (buf != NULL) {
        // capacity-bounded overload: never writes past the allocation
        us.extract(0, len, buf, (uint32_t)(needed + 1));
        buf[needed] = 0;
        fprintf(f, "%s", buf);
        uprv_free(buf);
    }
}
#endif
michael@0 | 752 | |
michael@0 | 753 | UBool |
michael@0 | 754 | NFRule::doParse(const UnicodeString& text, |
michael@0 | 755 | ParsePosition& parsePosition, |
michael@0 | 756 | UBool isFractionRule, |
michael@0 | 757 | double upperBound, |
michael@0 | 758 | Formattable& resVal) const |
michael@0 | 759 | { |
michael@0 | 760 | // internally we operate on a copy of the string being parsed |
michael@0 | 761 | // (because we're going to change it) and use our own ParsePosition |
michael@0 | 762 | ParsePosition pp; |
michael@0 | 763 | UnicodeString workText(text); |
michael@0 | 764 | |
michael@0 | 765 | // check to see whether the text before the first substitution |
michael@0 | 766 | // matches the text at the beginning of the string being |
michael@0 | 767 | // parsed. If it does, strip that off the front of workText; |
michael@0 | 768 | // otherwise, dump out with a mismatch |
michael@0 | 769 | UnicodeString prefix; |
michael@0 | 770 | prefix.setTo(ruleText, 0, sub1->getPos()); |
michael@0 | 771 | |
michael@0 | 772 | #ifdef RBNF_DEBUG |
michael@0 | 773 | fprintf(stderr, "doParse %x ", this); |
michael@0 | 774 | { |
michael@0 | 775 | UnicodeString rt; |
michael@0 | 776 | _appendRuleText(rt); |
michael@0 | 777 | dumpUS(stderr, rt); |
michael@0 | 778 | } |
michael@0 | 779 | |
michael@0 | 780 | fprintf(stderr, " text: '", this); |
michael@0 | 781 | dumpUS(stderr, text); |
michael@0 | 782 | fprintf(stderr, "' prefix: '"); |
michael@0 | 783 | dumpUS(stderr, prefix); |
michael@0 | 784 | #endif |
michael@0 | 785 | stripPrefix(workText, prefix, pp); |
michael@0 | 786 | int32_t prefixLength = text.length() - workText.length(); |
michael@0 | 787 | |
michael@0 | 788 | #ifdef RBNF_DEBUG |
michael@0 | 789 | fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); |
michael@0 | 790 | #endif |
michael@0 | 791 | |
michael@0 | 792 | if (pp.getIndex() == 0 && sub1->getPos() != 0) { |
michael@0 | 793 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 794 | // restored for ICU4C port |
michael@0 | 795 | parsePosition.setErrorIndex(pp.getErrorIndex()); |
michael@0 | 796 | resVal.setLong(0); |
michael@0 | 797 | return TRUE; |
michael@0 | 798 | } |
michael@0 | 799 | |
michael@0 | 800 | // this is the fun part. The basic guts of the rule-matching |
michael@0 | 801 | // logic is matchToDelimiter(), which is called twice. The first |
michael@0 | 802 | // time it searches the input string for the rule text BETWEEN |
michael@0 | 803 | // the substitutions and tries to match the intervening text |
michael@0 | 804 | // in the input string with the first substitution. If that |
michael@0 | 805 | // succeeds, it then calls it again, this time to look for the |
michael@0 | 806 | // rule text after the second substitution and to match the |
michael@0 | 807 | // intervening input text against the second substitution. |
michael@0 | 808 | // |
michael@0 | 809 | // For example, say we have a rule that looks like this: |
michael@0 | 810 | // first << middle >> last; |
michael@0 | 811 | // and input text that looks like this: |
michael@0 | 812 | // first one middle two last |
michael@0 | 813 | // First we use stripPrefix() to match "first " in both places and |
michael@0 | 814 | // strip it off the front, leaving |
michael@0 | 815 | // one middle two last |
michael@0 | 816 | // Then we use matchToDelimiter() to match " middle " and try to |
michael@0 | 817 | // match "one" against a substitution. If it's successful, we now |
michael@0 | 818 | // have |
michael@0 | 819 | // two last |
michael@0 | 820 | // We use matchToDelimiter() a second time to match " last" and |
michael@0 | 821 | // try to match "two" against a substitution. If "two" matches |
michael@0 | 822 | // the substitution, we have a successful parse. |
michael@0 | 823 | // |
michael@0 | 824 | // Since it's possible in many cases to find multiple instances |
michael@0 | 825 | // of each of these pieces of rule text in the input string, |
michael@0 | 826 | // we need to try all the possible combinations of these |
michael@0 | 827 | // locations. This prevents us from prematurely declaring a mismatch, |
michael@0 | 828 | // and makes sure we match as much input text as we can. |
michael@0 | 829 | int highWaterMark = 0; |
michael@0 | 830 | double result = 0; |
michael@0 | 831 | int start = 0; |
michael@0 | 832 | double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); |
michael@0 | 833 | |
michael@0 | 834 | UnicodeString temp; |
michael@0 | 835 | do { |
michael@0 | 836 | // our partial parse result starts out as this rule's base |
michael@0 | 837 | // value. If it finds a successful match, matchToDelimiter() |
michael@0 | 838 | // will compose this in some way with what it gets back from |
michael@0 | 839 | // the substitution, giving us a new partial parse result |
michael@0 | 840 | pp.setIndex(0); |
michael@0 | 841 | |
michael@0 | 842 | temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); |
michael@0 | 843 | double partialResult = matchToDelimiter(workText, start, tempBaseValue, |
michael@0 | 844 | temp, pp, sub1, |
michael@0 | 845 | upperBound); |
michael@0 | 846 | |
michael@0 | 847 | // if we got a successful match (or were trying to match a |
michael@0 | 848 | // null substitution), pp is now pointing at the first unmatched |
michael@0 | 849 | // character. Take note of that, and try matchToDelimiter() |
michael@0 | 850 | // on the input text again |
michael@0 | 851 | if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { |
michael@0 | 852 | start = pp.getIndex(); |
michael@0 | 853 | |
michael@0 | 854 | UnicodeString workText2; |
michael@0 | 855 | workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); |
michael@0 | 856 | ParsePosition pp2; |
michael@0 | 857 | |
michael@0 | 858 | // the second matchToDelimiter() will compose our previous |
michael@0 | 859 | // partial result with whatever it gets back from its |
michael@0 | 860 | // substitution if there's a successful match, giving us |
michael@0 | 861 | // a real result |
michael@0 | 862 | temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); |
michael@0 | 863 | partialResult = matchToDelimiter(workText2, 0, partialResult, |
michael@0 | 864 | temp, pp2, sub2, |
michael@0 | 865 | upperBound); |
michael@0 | 866 | |
michael@0 | 867 | // if we got a successful match on this second |
michael@0 | 868 | // matchToDelimiter() call, update the high-water mark |
michael@0 | 869 | // and result (if necessary) |
michael@0 | 870 | if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { |
michael@0 | 871 | if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { |
michael@0 | 872 | highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); |
michael@0 | 873 | result = partialResult; |
michael@0 | 874 | } |
michael@0 | 875 | } |
michael@0 | 876 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 877 | // restored for ICU4C port |
michael@0 | 878 | else { |
michael@0 | 879 | int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); |
michael@0 | 880 | if (temp> parsePosition.getErrorIndex()) { |
michael@0 | 881 | parsePosition.setErrorIndex(temp); |
michael@0 | 882 | } |
michael@0 | 883 | } |
michael@0 | 884 | } |
michael@0 | 885 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 886 | // restored for ICU4C port |
michael@0 | 887 | else { |
michael@0 | 888 | int32_t temp = sub1->getPos() + pp.getErrorIndex(); |
michael@0 | 889 | if (temp > parsePosition.getErrorIndex()) { |
michael@0 | 890 | parsePosition.setErrorIndex(temp); |
michael@0 | 891 | } |
michael@0 | 892 | } |
michael@0 | 893 | // keep trying to match things until the outer matchToDelimiter() |
michael@0 | 894 | // call fails to make a match (each time, it picks up where it |
michael@0 | 895 | // left off the previous time) |
michael@0 | 896 | } while (sub1->getPos() != sub2->getPos() |
michael@0 | 897 | && pp.getIndex() > 0 |
michael@0 | 898 | && pp.getIndex() < workText.length() |
michael@0 | 899 | && pp.getIndex() != start); |
michael@0 | 900 | |
michael@0 | 901 | // update the caller's ParsePosition with our high-water mark |
michael@0 | 902 | // (i.e., it now points at the first character this function |
michael@0 | 903 | // didn't match-- the ParsePosition is therefore unchanged if |
michael@0 | 904 | // we didn't match anything) |
michael@0 | 905 | parsePosition.setIndex(highWaterMark); |
michael@0 | 906 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 907 | // restored for ICU4C port |
michael@0 | 908 | if (highWaterMark > 0) { |
michael@0 | 909 | parsePosition.setErrorIndex(0); |
michael@0 | 910 | } |
michael@0 | 911 | |
michael@0 | 912 | // this is a hack for one unusual condition: Normally, whether this |
michael@0 | 913 | // rule belong to a fraction rule set or not is handled by its |
michael@0 | 914 | // substitutions. But if that rule HAS NO substitutions, then |
michael@0 | 915 | // we have to account for it here. By definition, if the matching |
michael@0 | 916 | // rule in a fraction rule set has no substitutions, its numerator |
michael@0 | 917 | // is 1, and so the result is the reciprocal of its base value. |
michael@0 | 918 | if (isFractionRule && |
michael@0 | 919 | highWaterMark > 0 && |
michael@0 | 920 | sub1->isNullSubstitution()) { |
michael@0 | 921 | result = 1 / result; |
michael@0 | 922 | } |
michael@0 | 923 | |
michael@0 | 924 | resVal.setDouble(result); |
michael@0 | 925 | return TRUE; // ??? do we need to worry if it is a long or a double? |
michael@0 | 926 | } |
michael@0 | 927 | |
michael@0 | 928 | /** |
michael@0 | 929 | * This function is used by parse() to match the text being parsed |
michael@0 | 930 | * against a possible prefix string. This function |
michael@0 | 931 | * matches characters from the beginning of the string being parsed |
michael@0 | 932 | * to characters from the prospective prefix. If they match, pp is |
michael@0 | 933 | * updated to the first character not matched, and the result is |
michael@0 | 934 | * the unparsed part of the string. If they don't match, the whole |
michael@0 | 935 | * string is returned, and pp is left unchanged. |
michael@0 | 936 | * @param text The string being parsed |
michael@0 | 937 | * @param prefix The text to match against |
michael@0 | 938 | * @param pp On entry, ignored and assumed to be 0. On exit, points |
michael@0 | 939 | * to the first unmatched character (assuming the whole prefix matched), |
michael@0 | 940 | * or is unchanged (if the whole prefix didn't match). |
michael@0 | 941 | * @return If things match, this is the unparsed part of "text"; |
michael@0 | 942 | * if they didn't match, this is "text". |
michael@0 | 943 | */ |
michael@0 | 944 | void |
michael@0 | 945 | NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const |
michael@0 | 946 | { |
michael@0 | 947 | // if the prefix text is empty, dump out without doing anything |
michael@0 | 948 | if (prefix.length() != 0) { |
michael@0 | 949 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 950 | // use prefixLength() to match the beginning of |
michael@0 | 951 | // "text" against "prefix". This function returns the |
michael@0 | 952 | // number of characters from "text" that matched (or 0 if |
michael@0 | 953 | // we didn't match the whole prefix) |
michael@0 | 954 | int32_t pfl = prefixLength(text, prefix, status); |
michael@0 | 955 | if (U_FAILURE(status)) { // Memory allocation error. |
michael@0 | 956 | return; |
michael@0 | 957 | } |
michael@0 | 958 | if (pfl != 0) { |
michael@0 | 959 | // if we got a successful match, update the parse position |
michael@0 | 960 | // and strip the prefix off of "text" |
michael@0 | 961 | pp.setIndex(pp.getIndex() + pfl); |
michael@0 | 962 | text.remove(0, pfl); |
michael@0 | 963 | } |
michael@0 | 964 | } |
michael@0 | 965 | } |
michael@0 | 966 | |
michael@0 | 967 | /** |
michael@0 | 968 | * Used by parse() to match a substitution and any following text. |
michael@0 | 969 | * "text" is searched for instances of "delimiter". For each instance |
michael@0 | 970 | * of delimiter, the intervening text is tested to see whether it |
michael@0 | 971 | * matches the substitution. The longest match wins. |
michael@0 | 972 | * @param text The string being parsed |
michael@0 | 973 | * @param startPos The position in "text" where we should start looking |
michael@0 | 974 | * for "delimiter". |
michael@0 | 975 | * @param baseValue A partial parse result (often the rule's base value), |
michael@0 | 976 | * which is combined with the result from matching the substitution |
michael@0 | 977 | * @param delimiter The string to search "text" for. |
michael@0 | 978 | * @param pp Ignored and presumed to be 0 on entry. If there's a match, |
michael@0 | 979 | * on exit this will point to the first unmatched character. |
michael@0 | 980 | * @param sub If we find "delimiter" in "text", this substitution is used |
michael@0 | 981 | * to match the text between the beginning of the string and the |
michael@0 | 982 | * position of "delimiter." (If "delimiter" is the empty string, then |
michael@0 | 983 | * this function just matches against this substitution and updates |
michael@0 | 984 | * everything accordingly.) |
michael@0 | 985 | * @param upperBound When matching the substitution, it will only |
michael@0 | 986 | * consider rules with base values lower than this value. |
michael@0 | 987 | * @return If there's a match, this is the result of composing |
michael@0 | 988 | * baseValue with the result of matching the substitution. Otherwise, |
michael@0 | 989 | * this is new Long(0). It's never null. If the result is an integer, |
michael@0 | 990 | * this will be an instance of Long; otherwise, it's an instance of |
michael@0 | 991 | * Double. |
michael@0 | 992 | * |
michael@0 | 993 | * !!! note {dlf} in point of fact, in the java code the caller always converts |
michael@0 | 994 | * the result to a double, so we might as well return one. |
michael@0 | 995 | */ |
michael@0 | 996 | double |
michael@0 | 997 | NFRule::matchToDelimiter(const UnicodeString& text, |
michael@0 | 998 | int32_t startPos, |
michael@0 | 999 | double _baseValue, |
michael@0 | 1000 | const UnicodeString& delimiter, |
michael@0 | 1001 | ParsePosition& pp, |
michael@0 | 1002 | const NFSubstitution* sub, |
michael@0 | 1003 | double upperBound) const |
michael@0 | 1004 | { |
michael@0 | 1005 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1006 | // if "delimiter" contains real (i.e., non-ignorable) text, search |
michael@0 | 1007 | // it for "delimiter" beginning at "start". If that succeeds, then |
michael@0 | 1008 | // use "sub"'s doParse() method to match the text before the |
michael@0 | 1009 | // instance of "delimiter" we just found. |
michael@0 | 1010 | if (!allIgnorable(delimiter, status)) { |
michael@0 | 1011 | if (U_FAILURE(status)) { //Memory allocation error. |
michael@0 | 1012 | return 0; |
michael@0 | 1013 | } |
michael@0 | 1014 | ParsePosition tempPP; |
michael@0 | 1015 | Formattable result; |
michael@0 | 1016 | |
michael@0 | 1017 | // use findText() to search for "delimiter". It returns a two- |
michael@0 | 1018 | // element array: element 0 is the position of the match, and |
michael@0 | 1019 | // element 1 is the number of characters that matched |
michael@0 | 1020 | // "delimiter". |
michael@0 | 1021 | int32_t dLen; |
michael@0 | 1022 | int32_t dPos = findText(text, delimiter, startPos, &dLen); |
michael@0 | 1023 | |
michael@0 | 1024 | // if findText() succeeded, isolate the text preceding the |
michael@0 | 1025 | // match, and use "sub" to match that text |
michael@0 | 1026 | while (dPos >= 0) { |
michael@0 | 1027 | UnicodeString subText; |
michael@0 | 1028 | subText.setTo(text, 0, dPos); |
michael@0 | 1029 | if (subText.length() > 0) { |
michael@0 | 1030 | UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, |
michael@0 | 1031 | #if UCONFIG_NO_COLLATION |
michael@0 | 1032 | FALSE, |
michael@0 | 1033 | #else |
michael@0 | 1034 | formatter->isLenient(), |
michael@0 | 1035 | #endif |
michael@0 | 1036 | result); |
michael@0 | 1037 | |
michael@0 | 1038 | // if the substitution could match all the text up to |
michael@0 | 1039 | // where we found "delimiter", then this function has |
michael@0 | 1040 | // a successful match. Bump the caller's parse position |
michael@0 | 1041 | // to point to the first character after the text |
michael@0 | 1042 | // that matches "delimiter", and return the result |
michael@0 | 1043 | // we got from parsing the substitution. |
michael@0 | 1044 | if (success && tempPP.getIndex() == dPos) { |
michael@0 | 1045 | pp.setIndex(dPos + dLen); |
michael@0 | 1046 | return result.getDouble(); |
michael@0 | 1047 | } |
michael@0 | 1048 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 1049 | // restored for ICU4C port |
michael@0 | 1050 | else { |
michael@0 | 1051 | if (tempPP.getErrorIndex() > 0) { |
michael@0 | 1052 | pp.setErrorIndex(tempPP.getErrorIndex()); |
michael@0 | 1053 | } else { |
michael@0 | 1054 | pp.setErrorIndex(tempPP.getIndex()); |
michael@0 | 1055 | } |
michael@0 | 1056 | } |
michael@0 | 1057 | } |
michael@0 | 1058 | |
michael@0 | 1059 | // if we didn't match the substitution, search for another |
michael@0 | 1060 | // copy of "delimiter" in "text" and repeat the loop if |
michael@0 | 1061 | // we find it |
michael@0 | 1062 | tempPP.setIndex(0); |
michael@0 | 1063 | dPos = findText(text, delimiter, dPos + dLen, &dLen); |
michael@0 | 1064 | } |
michael@0 | 1065 | // if we make it here, this was an unsuccessful match, and we |
michael@0 | 1066 | // leave pp unchanged and return 0 |
michael@0 | 1067 | pp.setIndex(0); |
michael@0 | 1068 | return 0; |
michael@0 | 1069 | |
michael@0 | 1070 | // if "delimiter" is empty, or consists only of ignorable characters |
michael@0 | 1071 | // (i.e., is semantically empty), thwe we obviously can't search |
michael@0 | 1072 | // for "delimiter". Instead, just use "sub" to parse as much of |
michael@0 | 1073 | // "text" as possible. |
michael@0 | 1074 | } else { |
michael@0 | 1075 | ParsePosition tempPP; |
michael@0 | 1076 | Formattable result; |
michael@0 | 1077 | |
michael@0 | 1078 | // try to match the whole string against the substitution |
michael@0 | 1079 | UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, |
michael@0 | 1080 | #if UCONFIG_NO_COLLATION |
michael@0 | 1081 | FALSE, |
michael@0 | 1082 | #else |
michael@0 | 1083 | formatter->isLenient(), |
michael@0 | 1084 | #endif |
michael@0 | 1085 | result); |
michael@0 | 1086 | if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { |
michael@0 | 1087 | // if there's a successful match (or it's a null |
michael@0 | 1088 | // substitution), update pp to point to the first |
michael@0 | 1089 | // character we didn't match, and pass the result from |
michael@0 | 1090 | // sub.doParse() on through to the caller |
michael@0 | 1091 | pp.setIndex(tempPP.getIndex()); |
michael@0 | 1092 | return result.getDouble(); |
michael@0 | 1093 | } |
michael@0 | 1094 | // commented out because ParsePosition doesn't have error index in 1.1.x |
michael@0 | 1095 | // restored for ICU4C port |
michael@0 | 1096 | else { |
michael@0 | 1097 | pp.setErrorIndex(tempPP.getErrorIndex()); |
michael@0 | 1098 | } |
michael@0 | 1099 | |
michael@0 | 1100 | // and if we get to here, then nothing matched, so we return |
michael@0 | 1101 | // 0 and leave pp alone |
michael@0 | 1102 | return 0; |
michael@0 | 1103 | } |
michael@0 | 1104 | } |
michael@0 | 1105 | |
michael@0 | 1106 | /** |
michael@0 | 1107 | * Used by stripPrefix() to match characters. If lenient parse mode |
michael@0 | 1108 | * is off, this just calls startsWith(). If lenient parse mode is on, |
michael@0 | 1109 | * this function uses CollationElementIterators to match characters in |
michael@0 | 1110 | * the strings (only primary-order differences are significant in |
michael@0 | 1111 | * determining whether there's a match). |
michael@0 | 1112 | * @param str The string being tested |
michael@0 | 1113 | * @param prefix The text we're hoping to see at the beginning |
michael@0 | 1114 | * of "str" |
michael@0 | 1115 | * @return If "prefix" is found at the beginning of "str", this |
michael@0 | 1116 | * is the number of characters in "str" that were matched (this |
michael@0 | 1117 | * isn't necessarily the same as the length of "prefix" when matching |
michael@0 | 1118 | * text with a collator). If there's no match, this is 0. |
michael@0 | 1119 | */ |
michael@0 | 1120 | int32_t |
michael@0 | 1121 | NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const |
michael@0 | 1122 | { |
michael@0 | 1123 | // if we're looking for an empty prefix, it obviously matches |
michael@0 | 1124 | // zero characters. Just go ahead and return 0. |
michael@0 | 1125 | if (prefix.length() == 0) { |
michael@0 | 1126 | return 0; |
michael@0 | 1127 | } |
michael@0 | 1128 | |
michael@0 | 1129 | #if !UCONFIG_NO_COLLATION |
michael@0 | 1130 | // go through all this grief if we're in lenient-parse mode |
michael@0 | 1131 | if (formatter->isLenient()) { |
michael@0 | 1132 | // get the formatter's collator and use it to create two |
michael@0 | 1133 | // collation element iterators, one over the target string |
michael@0 | 1134 | // and another over the prefix (right now, we'll throw an |
michael@0 | 1135 | // exception if the collator we get back from the formatter |
michael@0 | 1136 | // isn't a RuleBasedCollator, because RuleBasedCollator defines |
michael@0 | 1137 | // the CollationElementIterator protocol. Hopefully, this |
michael@0 | 1138 | // will change someday.) |
michael@0 | 1139 | RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator(); |
michael@0 | 1140 | CollationElementIterator* strIter = collator->createCollationElementIterator(str); |
michael@0 | 1141 | CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); |
michael@0 | 1142 | // Check for memory allocation error. |
michael@0 | 1143 | if (collator == NULL || strIter == NULL || prefixIter == NULL) { |
michael@0 | 1144 | delete collator; |
michael@0 | 1145 | delete strIter; |
michael@0 | 1146 | delete prefixIter; |
michael@0 | 1147 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1148 | return 0; |
michael@0 | 1149 | } |
michael@0 | 1150 | |
michael@0 | 1151 | UErrorCode err = U_ZERO_ERROR; |
michael@0 | 1152 | |
michael@0 | 1153 | // The original code was problematic. Consider this match: |
michael@0 | 1154 | // prefix = "fifty-" |
michael@0 | 1155 | // string = " fifty-7" |
michael@0 | 1156 | // The intent is to match string up to the '7', by matching 'fifty-' at position 1 |
michael@0 | 1157 | // in the string. Unfortunately, we were getting a match, and then computing where |
michael@0 | 1158 | // the match terminated by rematching the string. The rematch code was using as an |
michael@0 | 1159 | // initial guess the substring of string between 0 and prefix.length. Because of |
michael@0 | 1160 | // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving |
michael@0 | 1161 | // the position before the hyphen in the string. Recursing down, we then parsed the |
michael@0 | 1162 | // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). |
michael@0 | 1163 | // This was not pretty, especially since the string "fifty-7" parsed just fine. |
michael@0 | 1164 | // |
michael@0 | 1165 | // We have newer APIs now, so we can use calls on the iterator to determine what we |
michael@0 | 1166 | // matched up to. If we terminate because we hit the last element in the string, |
michael@0 | 1167 | // our match terminates at this length. If we terminate because we hit the last element |
michael@0 | 1168 | // in the target, our match terminates at one before the element iterator position. |
michael@0 | 1169 | |
michael@0 | 1170 | // match collation elements between the strings |
michael@0 | 1171 | int32_t oStr = strIter->next(err); |
michael@0 | 1172 | int32_t oPrefix = prefixIter->next(err); |
michael@0 | 1173 | |
michael@0 | 1174 | while (oPrefix != CollationElementIterator::NULLORDER) { |
michael@0 | 1175 | // skip over ignorable characters in the target string |
michael@0 | 1176 | while (CollationElementIterator::primaryOrder(oStr) == 0 |
michael@0 | 1177 | && oStr != CollationElementIterator::NULLORDER) { |
michael@0 | 1178 | oStr = strIter->next(err); |
michael@0 | 1179 | } |
michael@0 | 1180 | |
michael@0 | 1181 | // skip over ignorable characters in the prefix |
michael@0 | 1182 | while (CollationElementIterator::primaryOrder(oPrefix) == 0 |
michael@0 | 1183 | && oPrefix != CollationElementIterator::NULLORDER) { |
michael@0 | 1184 | oPrefix = prefixIter->next(err); |
michael@0 | 1185 | } |
michael@0 | 1186 | |
michael@0 | 1187 | // dlf: move this above following test, if we consume the |
michael@0 | 1188 | // entire target, aren't we ok even if the source was also |
michael@0 | 1189 | // entirely consumed? |
michael@0 | 1190 | |
michael@0 | 1191 | // if skipping over ignorables brought to the end of |
michael@0 | 1192 | // the prefix, we DID match: drop out of the loop |
michael@0 | 1193 | if (oPrefix == CollationElementIterator::NULLORDER) { |
michael@0 | 1194 | break; |
michael@0 | 1195 | } |
michael@0 | 1196 | |
michael@0 | 1197 | // if skipping over ignorables brought us to the end |
michael@0 | 1198 | // of the target string, we didn't match and return 0 |
michael@0 | 1199 | if (oStr == CollationElementIterator::NULLORDER) { |
michael@0 | 1200 | delete prefixIter; |
michael@0 | 1201 | delete strIter; |
michael@0 | 1202 | return 0; |
michael@0 | 1203 | } |
michael@0 | 1204 | |
michael@0 | 1205 | // match collation elements from the two strings |
michael@0 | 1206 | // (considering only primary differences). If we |
michael@0 | 1207 | // get a mismatch, dump out and return 0 |
michael@0 | 1208 | if (CollationElementIterator::primaryOrder(oStr) |
michael@0 | 1209 | != CollationElementIterator::primaryOrder(oPrefix)) { |
michael@0 | 1210 | delete prefixIter; |
michael@0 | 1211 | delete strIter; |
michael@0 | 1212 | return 0; |
michael@0 | 1213 | |
michael@0 | 1214 | // otherwise, advance to the next character in each string |
michael@0 | 1215 | // and loop (we drop out of the loop when we exhaust |
michael@0 | 1216 | // collation elements in the prefix) |
michael@0 | 1217 | } else { |
michael@0 | 1218 | oStr = strIter->next(err); |
michael@0 | 1219 | oPrefix = prefixIter->next(err); |
michael@0 | 1220 | } |
michael@0 | 1221 | } |
michael@0 | 1222 | |
michael@0 | 1223 | int32_t result = strIter->getOffset(); |
michael@0 | 1224 | if (oStr != CollationElementIterator::NULLORDER) { |
michael@0 | 1225 | --result; // back over character that we don't want to consume; |
michael@0 | 1226 | } |
michael@0 | 1227 | |
michael@0 | 1228 | #ifdef RBNF_DEBUG |
michael@0 | 1229 | fprintf(stderr, "prefix length: %d\n", result); |
michael@0 | 1230 | #endif |
michael@0 | 1231 | delete prefixIter; |
michael@0 | 1232 | delete strIter; |
michael@0 | 1233 | |
michael@0 | 1234 | return result; |
michael@0 | 1235 | #if 0 |
michael@0 | 1236 | //---------------------------------------------------------------- |
michael@0 | 1237 | // JDK 1.2-specific API call |
michael@0 | 1238 | // return strIter.getOffset(); |
michael@0 | 1239 | //---------------------------------------------------------------- |
michael@0 | 1240 | // JDK 1.1 HACK (take out for 1.2-specific code) |
michael@0 | 1241 | |
michael@0 | 1242 | // if we make it to here, we have a successful match. Now we |
michael@0 | 1243 | // have to find out HOW MANY characters from the target string |
michael@0 | 1244 | // matched the prefix (there isn't necessarily a one-to-one |
michael@0 | 1245 | // mapping between collation elements and characters). |
michael@0 | 1246 | // In JDK 1.2, there's a simple getOffset() call we can use. |
michael@0 | 1247 | // In JDK 1.1, on the other hand, we have to go through some |
michael@0 | 1248 | // ugly contortions. First, use the collator to compare the |
michael@0 | 1249 | // same number of characters from the prefix and target string. |
michael@0 | 1250 | // If they're equal, we're done. |
michael@0 | 1251 | collator->setStrength(Collator::PRIMARY); |
michael@0 | 1252 | if (str.length() >= prefix.length()) { |
michael@0 | 1253 | UnicodeString temp; |
michael@0 | 1254 | temp.setTo(str, 0, prefix.length()); |
michael@0 | 1255 | if (collator->equals(temp, prefix)) { |
michael@0 | 1256 | #ifdef RBNF_DEBUG |
michael@0 | 1257 | fprintf(stderr, "returning: %d\n", prefix.length()); |
michael@0 | 1258 | #endif |
michael@0 | 1259 | return prefix.length(); |
michael@0 | 1260 | } |
michael@0 | 1261 | } |
michael@0 | 1262 | |
michael@0 | 1263 | // if they're not equal, then we have to compare successively |
michael@0 | 1264 | // larger and larger substrings of the target string until we |
michael@0 | 1265 | // get to one that matches the prefix. At that point, we know |
michael@0 | 1266 | // how many characters matched the prefix, and we can return. |
michael@0 | 1267 | int32_t p = 1; |
michael@0 | 1268 | while (p <= str.length()) { |
michael@0 | 1269 | UnicodeString temp; |
michael@0 | 1270 | temp.setTo(str, 0, p); |
michael@0 | 1271 | if (collator->equals(temp, prefix)) { |
michael@0 | 1272 | return p; |
michael@0 | 1273 | } else { |
michael@0 | 1274 | ++p; |
michael@0 | 1275 | } |
michael@0 | 1276 | } |
michael@0 | 1277 | |
michael@0 | 1278 | // SHOULD NEVER GET HERE!!! |
michael@0 | 1279 | return 0; |
michael@0 | 1280 | //---------------------------------------------------------------- |
michael@0 | 1281 | #endif |
michael@0 | 1282 | |
michael@0 | 1283 | // If lenient parsing is turned off, forget all that crap above. |
michael@0 | 1284 | // Just use String.startsWith() and be done with it. |
michael@0 | 1285 | } else |
michael@0 | 1286 | #endif |
michael@0 | 1287 | { |
michael@0 | 1288 | if (str.startsWith(prefix)) { |
michael@0 | 1289 | return prefix.length(); |
michael@0 | 1290 | } else { |
michael@0 | 1291 | return 0; |
michael@0 | 1292 | } |
michael@0 | 1293 | } |
michael@0 | 1294 | } |
michael@0 | 1295 | |
michael@0 | 1296 | /** |
michael@0 | 1297 | * Searches a string for another string. If lenient parsing is off, |
michael@0 | 1298 | * this just calls indexOf(). If lenient parsing is on, this function |
michael@0 | 1299 | * uses CollationElementIterator to match characters, and only |
michael@0 | 1300 | * primary-order differences are significant in determining whether |
michael@0 | 1301 | * there's a match. |
michael@0 | 1302 | * @param str The string to search |
michael@0 | 1303 | * @param key The string to search "str" for |
michael@0 | 1304 | * @param startingAt The index into "str" where the search is to |
michael@0 | 1305 | * begin |
michael@0 | 1306 | * @return A two-element array of ints. Element 0 is the position |
michael@0 | 1307 | * of the match, or -1 if there was no match. Element 1 is the |
michael@0 | 1308 | * number of characters in "str" that matched (which isn't necessarily |
michael@0 | 1309 | * the same as the length of "key") |
michael@0 | 1310 | */ |
michael@0 | 1311 | int32_t |
michael@0 | 1312 | NFRule::findText(const UnicodeString& str, |
michael@0 | 1313 | const UnicodeString& key, |
michael@0 | 1314 | int32_t startingAt, |
michael@0 | 1315 | int32_t* length) const |
michael@0 | 1316 | { |
michael@0 | 1317 | #if !UCONFIG_NO_COLLATION |
michael@0 | 1318 | // if lenient parsing is turned off, this is easy: just call |
michael@0 | 1319 | // String.indexOf() and we're done |
michael@0 | 1320 | if (!formatter->isLenient()) { |
michael@0 | 1321 | *length = key.length(); |
michael@0 | 1322 | return str.indexOf(key, startingAt); |
michael@0 | 1323 | |
michael@0 | 1324 | // but if lenient parsing is turned ON, we've got some work |
michael@0 | 1325 | // ahead of us |
michael@0 | 1326 | } else |
michael@0 | 1327 | #endif |
michael@0 | 1328 | { |
michael@0 | 1329 | //---------------------------------------------------------------- |
michael@0 | 1330 | // JDK 1.1 HACK (take out of 1.2-specific code) |
michael@0 | 1331 | |
michael@0 | 1332 | // in JDK 1.2, CollationElementIterator provides us with an |
michael@0 | 1333 | // API to map between character offsets and collation elements |
michael@0 | 1334 | // and we can do this by marching through the string comparing |
michael@0 | 1335 | // collation elements. We can't do that in JDK 1.1. Insted, |
michael@0 | 1336 | // we have to go through this horrible slow mess: |
michael@0 | 1337 | int32_t p = startingAt; |
michael@0 | 1338 | int32_t keyLen = 0; |
michael@0 | 1339 | |
michael@0 | 1340 | // basically just isolate smaller and smaller substrings of |
michael@0 | 1341 | // the target string (each running to the end of the string, |
michael@0 | 1342 | // and with the first one running from startingAt to the end) |
michael@0 | 1343 | // and then use prefixLength() to see if the search key is at |
michael@0 | 1344 | // the beginning of each substring. This is excruciatingly |
michael@0 | 1345 | // slow, but it will locate the key and tell use how long the |
michael@0 | 1346 | // matching text was. |
michael@0 | 1347 | UnicodeString temp; |
michael@0 | 1348 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1349 | while (p < str.length() && keyLen == 0) { |
michael@0 | 1350 | temp.setTo(str, p, str.length() - p); |
michael@0 | 1351 | keyLen = prefixLength(temp, key, status); |
michael@0 | 1352 | if (U_FAILURE(status)) { |
michael@0 | 1353 | break; |
michael@0 | 1354 | } |
michael@0 | 1355 | if (keyLen != 0) { |
michael@0 | 1356 | *length = keyLen; |
michael@0 | 1357 | return p; |
michael@0 | 1358 | } |
michael@0 | 1359 | ++p; |
michael@0 | 1360 | } |
michael@0 | 1361 | // if we make it to here, we didn't find it. Return -1 for the |
michael@0 | 1362 | // location. The length should be ignored, but set it to 0, |
michael@0 | 1363 | // which should be "safe" |
michael@0 | 1364 | *length = 0; |
michael@0 | 1365 | return -1; |
michael@0 | 1366 | |
michael@0 | 1367 | //---------------------------------------------------------------- |
michael@0 | 1368 | // JDK 1.2 version of this routine |
michael@0 | 1369 | //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); |
michael@0 | 1370 | // |
michael@0 | 1371 | //CollationElementIterator strIter = collator.getCollationElementIterator(str); |
michael@0 | 1372 | //CollationElementIterator keyIter = collator.getCollationElementIterator(key); |
michael@0 | 1373 | // |
michael@0 | 1374 | //int keyStart = -1; |
michael@0 | 1375 | // |
michael@0 | 1376 | //str.setOffset(startingAt); |
michael@0 | 1377 | // |
michael@0 | 1378 | //int oStr = strIter.next(); |
michael@0 | 1379 | //int oKey = keyIter.next(); |
michael@0 | 1380 | //while (oKey != CollationElementIterator.NULLORDER) { |
michael@0 | 1381 | // while (oStr != CollationElementIterator.NULLORDER && |
michael@0 | 1382 | // CollationElementIterator.primaryOrder(oStr) == 0) |
michael@0 | 1383 | // oStr = strIter.next(); |
michael@0 | 1384 | // |
michael@0 | 1385 | // while (oKey != CollationElementIterator.NULLORDER && |
michael@0 | 1386 | // CollationElementIterator.primaryOrder(oKey) == 0) |
michael@0 | 1387 | // oKey = keyIter.next(); |
michael@0 | 1388 | // |
michael@0 | 1389 | // if (oStr == CollationElementIterator.NULLORDER) { |
michael@0 | 1390 | // return new int[] { -1, 0 }; |
michael@0 | 1391 | // } |
michael@0 | 1392 | // |
michael@0 | 1393 | // if (oKey == CollationElementIterator.NULLORDER) { |
michael@0 | 1394 | // break; |
michael@0 | 1395 | // } |
michael@0 | 1396 | // |
michael@0 | 1397 | // if (CollationElementIterator.primaryOrder(oStr) == |
michael@0 | 1398 | // CollationElementIterator.primaryOrder(oKey)) { |
michael@0 | 1399 | // keyStart = strIter.getOffset(); |
michael@0 | 1400 | // oStr = strIter.next(); |
michael@0 | 1401 | // oKey = keyIter.next(); |
michael@0 | 1402 | // } else { |
michael@0 | 1403 | // if (keyStart != -1) { |
michael@0 | 1404 | // keyStart = -1; |
michael@0 | 1405 | // keyIter.reset(); |
michael@0 | 1406 | // } else { |
michael@0 | 1407 | // oStr = strIter.next(); |
michael@0 | 1408 | // } |
michael@0 | 1409 | // } |
michael@0 | 1410 | //} |
michael@0 | 1411 | // |
michael@0 | 1412 | //if (oKey == CollationElementIterator.NULLORDER) { |
michael@0 | 1413 | // return new int[] { keyStart, strIter.getOffset() - keyStart }; |
michael@0 | 1414 | //} else { |
michael@0 | 1415 | // return new int[] { -1, 0 }; |
michael@0 | 1416 | //} |
michael@0 | 1417 | } |
michael@0 | 1418 | } |
michael@0 | 1419 | |
michael@0 | 1420 | /** |
michael@0 | 1421 | * Checks to see whether a string consists entirely of ignorable |
michael@0 | 1422 | * characters. |
michael@0 | 1423 | * @param str The string to test. |
michael@0 | 1424 | * @return true if the string is empty of consists entirely of |
michael@0 | 1425 | * characters that the number formatter's collator says are |
michael@0 | 1426 | * ignorable at the primary-order level. false otherwise. |
michael@0 | 1427 | */ |
michael@0 | 1428 | UBool |
michael@0 | 1429 | NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const |
michael@0 | 1430 | { |
michael@0 | 1431 | // if the string is empty, we can just return true |
michael@0 | 1432 | if (str.length() == 0) { |
michael@0 | 1433 | return TRUE; |
michael@0 | 1434 | } |
michael@0 | 1435 | |
michael@0 | 1436 | #if !UCONFIG_NO_COLLATION |
michael@0 | 1437 | // if lenient parsing is turned on, walk through the string with |
michael@0 | 1438 | // a collation element iterator and make sure each collation |
michael@0 | 1439 | // element is 0 (ignorable) at the primary level |
michael@0 | 1440 | if (formatter->isLenient()) { |
michael@0 | 1441 | RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator()); |
michael@0 | 1442 | CollationElementIterator* iter = collator->createCollationElementIterator(str); |
michael@0 | 1443 | |
michael@0 | 1444 | // Memory allocation error check. |
michael@0 | 1445 | if (collator == NULL || iter == NULL) { |
michael@0 | 1446 | delete collator; |
michael@0 | 1447 | delete iter; |
michael@0 | 1448 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1449 | return FALSE; |
michael@0 | 1450 | } |
michael@0 | 1451 | |
michael@0 | 1452 | UErrorCode err = U_ZERO_ERROR; |
michael@0 | 1453 | int32_t o = iter->next(err); |
michael@0 | 1454 | while (o != CollationElementIterator::NULLORDER |
michael@0 | 1455 | && CollationElementIterator::primaryOrder(o) == 0) { |
michael@0 | 1456 | o = iter->next(err); |
michael@0 | 1457 | } |
michael@0 | 1458 | |
michael@0 | 1459 | delete iter; |
michael@0 | 1460 | return o == CollationElementIterator::NULLORDER; |
michael@0 | 1461 | } |
michael@0 | 1462 | #endif |
michael@0 | 1463 | |
michael@0 | 1464 | // if lenient parsing is turned off, there is no such thing as |
michael@0 | 1465 | // an ignorable character: return true only if the string is empty |
michael@0 | 1466 | return FALSE; |
michael@0 | 1467 | } |
michael@0 | 1468 | |
michael@0 | 1469 | U_NAMESPACE_END |
michael@0 | 1470 | |
michael@0 | 1471 | /* U_HAVE_RBNF */ |
michael@0 | 1472 | #endif |
michael@0 | 1473 | |
michael@0 | 1474 |