|
1 /* |
|
2 ****************************************************************************** |
|
3 * Copyright (C) 1997-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ****************************************************************************** |
|
6 * file name: nfrule.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * Modification history |
|
12 * Date Name Comments |
|
13 * 10/11/2001 Doug Ported from ICU4J |
|
14 */ |
|
15 |
|
16 #include "nfrule.h" |
|
17 |
|
18 #if U_HAVE_RBNF |
|
19 |
|
20 #include "unicode/rbnf.h" |
|
21 #include "unicode/tblcoll.h" |
|
22 #include "unicode/coleitr.h" |
|
23 #include "unicode/uchar.h" |
|
24 #include "nfrs.h" |
|
25 #include "nfrlist.h" |
|
26 #include "nfsubs.h" |
|
27 #include "patternprops.h" |
|
28 |
|
29 U_NAMESPACE_BEGIN |
|
30 |
|
31 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) |
|
32 : baseValue((int32_t)0) |
|
33 , radix(0) |
|
34 , exponent(0) |
|
35 , ruleText() |
|
36 , sub1(NULL) |
|
37 , sub2(NULL) |
|
38 , formatter(_rbnf) |
|
39 { |
|
40 } |
|
41 |
|
42 NFRule::~NFRule() |
|
43 { |
|
44 delete sub1; |
|
45 delete sub2; |
|
46 } |
|
47 |
|
48 static const UChar gLeftBracket = 0x005b; |
|
49 static const UChar gRightBracket = 0x005d; |
|
50 static const UChar gColon = 0x003a; |
|
51 static const UChar gZero = 0x0030; |
|
52 static const UChar gNine = 0x0039; |
|
53 static const UChar gSpace = 0x0020; |
|
54 static const UChar gSlash = 0x002f; |
|
55 static const UChar gGreaterThan = 0x003e; |
|
56 static const UChar gLessThan = 0x003c; |
|
57 static const UChar gComma = 0x002c; |
|
58 static const UChar gDot = 0x002e; |
|
59 static const UChar gTick = 0x0027; |
|
60 //static const UChar gMinus = 0x002d; |
|
61 static const UChar gSemicolon = 0x003b; |
|
62 |
|
63 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ |
|
64 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ |
|
65 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ |
|
66 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ |
|
67 |
|
68 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ |
|
69 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ |
|
70 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ |
|
71 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ |
|
72 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ |
|
73 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ |
|
74 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ |
|
75 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ |
|
76 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ |
|
77 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ |
|
78 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ |
|
79 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ |
|
80 |
|
81 static const UChar * const tokenStrings[] = { |
|
82 gLessLess, gLessPercent, gLessHash, gLessZero, |
|
83 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, |
|
84 gEqualPercent, gEqualHash, gEqualZero, NULL |
|
85 }; |
|
86 |
|
87 void |
|
88 NFRule::makeRules(UnicodeString& description, |
|
89 const NFRuleSet *ruleSet, |
|
90 const NFRule *predecessor, |
|
91 const RuleBasedNumberFormat *rbnf, |
|
92 NFRuleList& rules, |
|
93 UErrorCode& status) |
|
94 { |
|
95 // we know we're making at least one rule, so go ahead and |
|
96 // new it up and initialize its basevalue and divisor |
|
97 // (this also strips the rule descriptor, if any, off the |
|
98 // descripton string) |
|
99 NFRule* rule1 = new NFRule(rbnf); |
|
100 /* test for NULL */ |
|
101 if (rule1 == 0) { |
|
102 status = U_MEMORY_ALLOCATION_ERROR; |
|
103 return; |
|
104 } |
|
105 rule1->parseRuleDescriptor(description, status); |
|
106 |
|
107 // check the description to see whether there's text enclosed |
|
108 // in brackets |
|
109 int32_t brack1 = description.indexOf(gLeftBracket); |
|
110 int32_t brack2 = description.indexOf(gRightBracket); |
|
111 |
|
112 // if the description doesn't contain a matched pair of brackets, |
|
113 // or if it's of a type that doesn't recognize bracketed text, |
|
114 // then leave the description alone, initialize the rule's |
|
115 // rule text and substitutions, and return that rule |
|
116 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 |
|
117 || rule1->getType() == kProperFractionRule |
|
118 || rule1->getType() == kNegativeNumberRule) { |
|
119 rule1->ruleText = description; |
|
120 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
|
121 rules.add(rule1); |
|
122 } else { |
|
123 // if the description does contain a matched pair of brackets, |
|
124 // then it's really shorthand for two rules (with one exception) |
|
125 NFRule* rule2 = NULL; |
|
126 UnicodeString sbuf; |
|
127 |
|
128 // we'll actually only split the rule into two rules if its |
|
129 // base value is an even multiple of its divisor (or it's one |
|
130 // of the special rules) |
|
131 if ((rule1->baseValue > 0 |
|
132 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) |
|
133 || rule1->getType() == kImproperFractionRule |
|
134 || rule1->getType() == kMasterRule) { |
|
135 |
|
136 // if it passes that test, new up the second rule. If the |
|
137 // rule set both rules will belong to is a fraction rule |
|
138 // set, they both have the same base value; otherwise, |
|
139 // increment the original rule's base value ("rule1" actually |
|
140 // goes SECOND in the rule set's rule list) |
|
141 rule2 = new NFRule(rbnf); |
|
142 /* test for NULL */ |
|
143 if (rule2 == 0) { |
|
144 status = U_MEMORY_ALLOCATION_ERROR; |
|
145 return; |
|
146 } |
|
147 if (rule1->baseValue >= 0) { |
|
148 rule2->baseValue = rule1->baseValue; |
|
149 if (!ruleSet->isFractionRuleSet()) { |
|
150 ++rule1->baseValue; |
|
151 } |
|
152 } |
|
153 |
|
154 // if the description began with "x.x" and contains bracketed |
|
155 // text, it describes both the improper fraction rule and |
|
156 // the proper fraction rule |
|
157 else if (rule1->getType() == kImproperFractionRule) { |
|
158 rule2->setType(kProperFractionRule); |
|
159 } |
|
160 |
|
161 // if the description began with "x.0" and contains bracketed |
|
162 // text, it describes both the master rule and the |
|
163 // improper fraction rule |
|
164 else if (rule1->getType() == kMasterRule) { |
|
165 rule2->baseValue = rule1->baseValue; |
|
166 rule1->setType(kImproperFractionRule); |
|
167 } |
|
168 |
|
169 // both rules have the same radix and exponent (i.e., the |
|
170 // same divisor) |
|
171 rule2->radix = rule1->radix; |
|
172 rule2->exponent = rule1->exponent; |
|
173 |
|
174 // rule2's rule text omits the stuff in brackets: initalize |
|
175 // its rule text and substitutions accordingly |
|
176 sbuf.append(description, 0, brack1); |
|
177 if (brack2 + 1 < description.length()) { |
|
178 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
|
179 } |
|
180 rule2->ruleText.setTo(sbuf); |
|
181 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
|
182 } |
|
183 |
|
184 // rule1's text includes the text in the brackets but omits |
|
185 // the brackets themselves: initialize _its_ rule text and |
|
186 // substitutions accordingly |
|
187 sbuf.setTo(description, 0, brack1); |
|
188 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); |
|
189 if (brack2 + 1 < description.length()) { |
|
190 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
|
191 } |
|
192 rule1->ruleText.setTo(sbuf); |
|
193 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
|
194 |
|
195 // if we only have one rule, return it; if we have two, return |
|
196 // a two-element array containing them (notice that rule2 goes |
|
197 // BEFORE rule1 in the list: in all cases, rule2 OMITS the |
|
198 // material in the brackets and rule1 INCLUDES the material |
|
199 // in the brackets) |
|
200 if (rule2 != NULL) { |
|
201 rules.add(rule2); |
|
202 } |
|
203 rules.add(rule1); |
|
204 } |
|
205 } |
|
206 |
|
207 /** |
|
208 * This function parses the rule's rule descriptor (i.e., the base |
|
209 * value and/or other tokens that precede the rule's rule text |
|
210 * in the description) and sets the rule's base value, radix, and |
|
211 * exponent according to the descriptor. (If the description doesn't |
|
212 * include a rule descriptor, then this function sets everything to |
|
213 * default values and the rule set sets the rule's real base value). |
|
214 * @param description The rule's description |
|
215 * @return If "description" included a rule descriptor, this is |
|
216 * "description" with the descriptor and any trailing whitespace |
|
217 * stripped off. Otherwise; it's "descriptor" unchangd. |
|
218 */ |
|
219 void |
|
220 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) |
|
221 { |
|
222 // the description consists of a rule descriptor and a rule body, |
|
223 // separated by a colon. The rule descriptor is optional. If |
|
224 // it's omitted, just set the base value to 0. |
|
225 int32_t p = description.indexOf(gColon); |
|
226 if (p == -1) { |
|
227 setBaseValue((int32_t)0, status); |
|
228 } else { |
|
229 // copy the descriptor out into its own string and strip it, |
|
230 // along with any trailing whitespace, out of the original |
|
231 // description |
|
232 UnicodeString descriptor; |
|
233 descriptor.setTo(description, 0, p); |
|
234 |
|
235 ++p; |
|
236 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { |
|
237 ++p; |
|
238 } |
|
239 description.removeBetween(0, p); |
|
240 |
|
241 // check first to see if the rule descriptor matches the token |
|
242 // for one of the special rules. If it does, set the base |
|
243 // value to the correct identfier value |
|
244 if (0 == descriptor.compare(gMinusX, 2)) { |
|
245 setType(kNegativeNumberRule); |
|
246 } |
|
247 else if (0 == descriptor.compare(gXDotX, 3)) { |
|
248 setType(kImproperFractionRule); |
|
249 } |
|
250 else if (0 == descriptor.compare(gZeroDotX, 3)) { |
|
251 setType(kProperFractionRule); |
|
252 } |
|
253 else if (0 == descriptor.compare(gXDotZero, 3)) { |
|
254 setType(kMasterRule); |
|
255 } |
|
256 |
|
257 // if the rule descriptor begins with a digit, it's a descriptor |
|
258 // for a normal rule |
|
259 // since we don't have Long.parseLong, and this isn't much work anyway, |
|
260 // just build up the value as we encounter the digits. |
|
261 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { |
|
262 int64_t val = 0; |
|
263 p = 0; |
|
264 UChar c = gSpace; |
|
265 |
|
266 // begin parsing the descriptor: copy digits |
|
267 // into "tempValue", skip periods, commas, and spaces, |
|
268 // stop on a slash or > sign (or at the end of the string), |
|
269 // and throw an exception on any other character |
|
270 int64_t ll_10 = 10; |
|
271 while (p < descriptor.length()) { |
|
272 c = descriptor.charAt(p); |
|
273 if (c >= gZero && c <= gNine) { |
|
274 val = val * ll_10 + (int32_t)(c - gZero); |
|
275 } |
|
276 else if (c == gSlash || c == gGreaterThan) { |
|
277 break; |
|
278 } |
|
279 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
|
280 } |
|
281 else { |
|
282 // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
|
283 status = U_PARSE_ERROR; |
|
284 return; |
|
285 } |
|
286 ++p; |
|
287 } |
|
288 |
|
289 // we have the base value, so set it |
|
290 setBaseValue(val, status); |
|
291 |
|
292 // if we stopped the previous loop on a slash, we're |
|
293 // now parsing the rule's radix. Again, accumulate digits |
|
294 // in tempValue, skip punctuation, stop on a > mark, and |
|
295 // throw an exception on anything else |
|
296 if (c == gSlash) { |
|
297 val = 0; |
|
298 ++p; |
|
299 int64_t ll_10 = 10; |
|
300 while (p < descriptor.length()) { |
|
301 c = descriptor.charAt(p); |
|
302 if (c >= gZero && c <= gNine) { |
|
303 val = val * ll_10 + (int32_t)(c - gZero); |
|
304 } |
|
305 else if (c == gGreaterThan) { |
|
306 break; |
|
307 } |
|
308 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
|
309 } |
|
310 else { |
|
311 // throw new IllegalArgumentException("Illegal character is rule descriptor"); |
|
312 status = U_PARSE_ERROR; |
|
313 return; |
|
314 } |
|
315 ++p; |
|
316 } |
|
317 |
|
318 // tempValue now contain's the rule's radix. Set it |
|
319 // accordingly, and recalculate the rule's exponent |
|
320 radix = (int32_t)val; |
|
321 if (radix == 0) { |
|
322 // throw new IllegalArgumentException("Rule can't have radix of 0"); |
|
323 status = U_PARSE_ERROR; |
|
324 } |
|
325 |
|
326 exponent = expectedExponent(); |
|
327 } |
|
328 |
|
329 // if we stopped the previous loop on a > sign, then continue |
|
330 // for as long as we still see > signs. For each one, |
|
331 // decrement the exponent (unless the exponent is already 0). |
|
332 // If we see another character before reaching the end of |
|
333 // the descriptor, that's also a syntax error. |
|
334 if (c == gGreaterThan) { |
|
335 while (p < descriptor.length()) { |
|
336 c = descriptor.charAt(p); |
|
337 if (c == gGreaterThan && exponent > 0) { |
|
338 --exponent; |
|
339 } else { |
|
340 // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
|
341 status = U_PARSE_ERROR; |
|
342 return; |
|
343 } |
|
344 ++p; |
|
345 } |
|
346 } |
|
347 } |
|
348 } |
|
349 |
|
350 // finally, if the rule body begins with an apostrophe, strip it off |
|
351 // (this is generally used to put whitespace at the beginning of |
|
352 // a rule's rule text) |
|
353 if (description.length() > 0 && description.charAt(0) == gTick) { |
|
354 description.removeBetween(0, 1); |
|
355 } |
|
356 |
|
357 // return the description with all the stuff we've just waded through |
|
358 // stripped off the front. It now contains just the rule body. |
|
359 // return description; |
|
360 } |
|
361 |
|
362 /** |
|
363 * Searches the rule's rule text for the substitution tokens, |
|
364 * creates the substitutions, and removes the substitution tokens |
|
365 * from the rule's rule text. |
|
366 * @param owner The rule set containing this rule |
|
367 * @param predecessor The rule preseding this one in "owners" rule list |
|
368 * @param ownersOwner The RuleBasedFormat that owns this rule |
|
369 */ |
|
370 void |
|
371 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, |
|
372 const NFRule* predecessor, |
|
373 const RuleBasedNumberFormat* rbnf, |
|
374 UErrorCode& status) |
|
375 { |
|
376 if (U_SUCCESS(status)) { |
|
377 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
|
378 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
|
379 } |
|
380 } |
|
381 |
|
382 /** |
|
383 * Searches the rule's rule text for the first substitution token, |
|
384 * creates a substitution based on it, and removes the token from |
|
385 * the rule's rule text. |
|
386 * @param owner The rule set containing this rule |
|
387 * @param predecessor The rule preceding this one in the rule set's |
|
388 * rule list |
|
389 * @param ownersOwner The RuleBasedNumberFormat that owns this rule |
|
390 * @return The newly-created substitution. This is never null; if |
|
391 * the rule text doesn't contain any substitution tokens, this will |
|
392 * be a NullSubstitution. |
|
393 */ |
|
394 NFSubstitution * |
|
395 NFRule::extractSubstitution(const NFRuleSet* ruleSet, |
|
396 const NFRule* predecessor, |
|
397 const RuleBasedNumberFormat* rbnf, |
|
398 UErrorCode& status) |
|
399 { |
|
400 NFSubstitution* result = NULL; |
|
401 |
|
402 // search the rule's rule text for the first two characters of |
|
403 // a substitution token |
|
404 int32_t subStart = indexOfAny(tokenStrings); |
|
405 int32_t subEnd = subStart; |
|
406 |
|
407 // if we didn't find one, create a null substitution positioned |
|
408 // at the end of the rule text |
|
409 if (subStart == -1) { |
|
410 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
|
411 ruleSet, rbnf, UnicodeString(), status); |
|
412 } |
|
413 |
|
414 // special-case the ">>>" token, since searching for the > at the |
|
415 // end will actually find the > in the middle |
|
416 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { |
|
417 subEnd = subStart + 2; |
|
418 |
|
419 // otherwise the substitution token ends with the same character |
|
420 // it began with |
|
421 } else { |
|
422 UChar c = ruleText.charAt(subStart); |
|
423 subEnd = ruleText.indexOf(c, subStart + 1); |
|
424 // special case for '<%foo<<' |
|
425 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { |
|
426 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle |
|
427 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack |
|
428 // to get around this. Having the duplicate at the front would cause problems with |
|
429 // rules like "<<%" to format, say, percents... |
|
430 ++subEnd; |
|
431 } |
|
432 } |
|
433 |
|
434 // if we don't find the end of the token (i.e., if we're on a single, |
|
435 // unmatched token character), create a null substitution positioned |
|
436 // at the end of the rule |
|
437 if (subEnd == -1) { |
|
438 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
|
439 ruleSet, rbnf, UnicodeString(), status); |
|
440 } |
|
441 |
|
442 // if we get here, we have a real substitution token (or at least |
|
443 // some text bounded by substitution token characters). Use |
|
444 // makeSubstitution() to create the right kind of substitution |
|
445 UnicodeString subToken; |
|
446 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); |
|
447 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, |
|
448 rbnf, subToken, status); |
|
449 |
|
450 // remove the substitution from the rule text |
|
451 ruleText.removeBetween(subStart, subEnd+1); |
|
452 |
|
453 return result; |
|
454 } |
|
455 |
|
456 /** |
|
457 * Sets the rule's base value, and causes the radix and exponent |
|
458 * to be recalculated. This is used during construction when we |
|
459 * don't know the rule's base value until after it's been |
|
460 * constructed. It should be used at any other time. |
|
461 * @param The new base value for the rule. |
|
462 */ |
|
463 void |
|
464 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) |
|
465 { |
|
466 // set the base value |
|
467 baseValue = newBaseValue; |
|
468 |
|
469 // if this isn't a special rule, recalculate the radix and exponent |
|
470 // (the radix always defaults to 10; if it's supposed to be something |
|
471 // else, it's cleaned up by the caller and the exponent is |
|
472 // recalculated again-- the only function that does this is |
|
473 // NFRule.parseRuleDescriptor() ) |
|
474 if (baseValue >= 1) { |
|
475 radix = 10; |
|
476 exponent = expectedExponent(); |
|
477 |
|
478 // this function gets called on a fully-constructed rule whose |
|
479 // description didn't specify a base value. This means it |
|
480 // has substitutions, and some substitutions hold on to copies |
|
481 // of the rule's divisor. Fix their copies of the divisor. |
|
482 if (sub1 != NULL) { |
|
483 sub1->setDivisor(radix, exponent, status); |
|
484 } |
|
485 if (sub2 != NULL) { |
|
486 sub2->setDivisor(radix, exponent, status); |
|
487 } |
|
488 |
|
489 // if this is a special rule, its radix and exponent are basically |
|
490 // ignored. Set them to "safe" default values |
|
491 } else { |
|
492 radix = 10; |
|
493 exponent = 0; |
|
494 } |
|
495 } |
|
496 |
|
497 /** |
|
498 * This calculates the rule's exponent based on its radix and base |
|
499 * value. This will be the highest power the radix can be raised to |
|
500 * and still produce a result less than or equal to the base value. |
|
501 */ |
|
502 int16_t |
|
503 NFRule::expectedExponent() const |
|
504 { |
|
505 // since the log of 0, or the log base 0 of something, causes an |
|
506 // error, declare the exponent in these cases to be 0 (we also |
|
507 // deal with the special-rule identifiers here) |
|
508 if (radix == 0 || baseValue < 1) { |
|
509 return 0; |
|
510 } |
|
511 |
|
512 // we get rounding error in some cases-- for example, log 1000 / log 10 |
|
513 // gives us 1.9999999996 instead of 2. The extra logic here is to take |
|
514 // that into account |
|
515 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); |
|
516 int64_t temp = util64_pow(radix, tempResult + 1); |
|
517 if (temp <= baseValue) { |
|
518 tempResult += 1; |
|
519 } |
|
520 return tempResult; |
|
521 } |
|
522 |
|
523 /** |
|
524 * Searches the rule's rule text for any of the specified strings. |
|
525 * @param strings An array of strings to search the rule's rule |
|
526 * text for |
|
527 * @return The index of the first match in the rule's rule text |
|
528 * (i.e., the first substring in the rule's rule text that matches |
|
529 * _any_ of the strings in "strings"). If none of the strings in |
|
530 * "strings" is found in the rule's rule text, returns -1. |
|
531 */ |
|
532 int32_t |
|
533 NFRule::indexOfAny(const UChar* const strings[]) const |
|
534 { |
|
535 int result = -1; |
|
536 for (int i = 0; strings[i]; i++) { |
|
537 int32_t pos = ruleText.indexOf(*strings[i]); |
|
538 if (pos != -1 && (result == -1 || pos < result)) { |
|
539 result = pos; |
|
540 } |
|
541 } |
|
542 return result; |
|
543 } |
|
544 |
|
545 //----------------------------------------------------------------------- |
|
546 // boilerplate |
|
547 //----------------------------------------------------------------------- |
|
548 |
|
549 /** |
|
550 * Tests two rules for equality. |
|
551 * @param that The rule to compare this one against |
|
552 * @return True is the two rules are functionally equivalent |
|
553 */ |
|
554 UBool |
|
555 NFRule::operator==(const NFRule& rhs) const |
|
556 { |
|
557 return baseValue == rhs.baseValue |
|
558 && radix == rhs.radix |
|
559 && exponent == rhs.exponent |
|
560 && ruleText == rhs.ruleText |
|
561 && *sub1 == *rhs.sub1 |
|
562 && *sub2 == *rhs.sub2; |
|
563 } |
|
564 |
|
565 /** |
|
566 * Returns a textual representation of the rule. This won't |
|
567 * necessarily be the same as the description that this rule |
|
568 * was created with, but it will produce the same result. |
|
569 * @return A textual description of the rule |
|
570 */ |
|
571 static void util_append64(UnicodeString& result, int64_t n) |
|
572 { |
|
573 UChar buffer[256]; |
|
574 int32_t len = util64_tou(n, buffer, sizeof(buffer)); |
|
575 UnicodeString temp(buffer, len); |
|
576 result.append(temp); |
|
577 } |
|
578 |
|
579 void |
|
580 NFRule::_appendRuleText(UnicodeString& result) const |
|
581 { |
|
582 switch (getType()) { |
|
583 case kNegativeNumberRule: result.append(gMinusX, 2); break; |
|
584 case kImproperFractionRule: result.append(gXDotX, 3); break; |
|
585 case kProperFractionRule: result.append(gZeroDotX, 3); break; |
|
586 case kMasterRule: result.append(gXDotZero, 3); break; |
|
587 default: |
|
588 // for a normal rule, write out its base value, and if the radix is |
|
589 // something other than 10, write out the radix (with the preceding |
|
590 // slash, of course). Then calculate the expected exponent and if |
|
591 // if isn't the same as the actual exponent, write an appropriate |
|
592 // number of > signs. Finally, terminate the whole thing with |
|
593 // a colon. |
|
594 util_append64(result, baseValue); |
|
595 if (radix != 10) { |
|
596 result.append(gSlash); |
|
597 util_append64(result, radix); |
|
598 } |
|
599 int numCarets = expectedExponent() - exponent; |
|
600 for (int i = 0; i < numCarets; i++) { |
|
601 result.append(gGreaterThan); |
|
602 } |
|
603 break; |
|
604 } |
|
605 result.append(gColon); |
|
606 result.append(gSpace); |
|
607 |
|
608 // if the rule text begins with a space, write an apostrophe |
|
609 // (whitespace after the rule descriptor is ignored; the |
|
610 // apostrophe is used to make the whitespace significant) |
|
611 if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) { |
|
612 result.append(gTick); |
|
613 } |
|
614 |
|
615 // now, write the rule's rule text, inserting appropriate |
|
616 // substitution tokens in the appropriate places |
|
617 UnicodeString ruleTextCopy; |
|
618 ruleTextCopy.setTo(ruleText); |
|
619 |
|
620 UnicodeString temp; |
|
621 sub2->toString(temp); |
|
622 ruleTextCopy.insert(sub2->getPos(), temp); |
|
623 sub1->toString(temp); |
|
624 ruleTextCopy.insert(sub1->getPos(), temp); |
|
625 |
|
626 result.append(ruleTextCopy); |
|
627 |
|
628 // and finally, top the whole thing off with a semicolon and |
|
629 // return the result |
|
630 result.append(gSemicolon); |
|
631 } |
|
632 |
|
633 //----------------------------------------------------------------------- |
|
634 // formatting |
|
635 //----------------------------------------------------------------------- |
|
636 |
|
637 /** |
|
638 * Formats the number, and inserts the resulting text into |
|
639 * toInsertInto. |
|
640 * @param number The number being formatted |
|
641 * @param toInsertInto The string where the resultant text should |
|
642 * be inserted |
|
643 * @param pos The position in toInsertInto where the resultant text |
|
644 * should be inserted |
|
645 */ |
|
646 void |
|
647 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const |
|
648 { |
|
649 // first, insert the rule's rule text into toInsertInto at the |
|
650 // specified position, then insert the results of the substitutions |
|
651 // into the right places in toInsertInto (notice we do the |
|
652 // substitutions in reverse order so that the offsets don't get |
|
653 // messed up) |
|
654 toInsertInto.insert(pos, ruleText); |
|
655 sub2->doSubstitution(number, toInsertInto, pos); |
|
656 sub1->doSubstitution(number, toInsertInto, pos); |
|
657 } |
|
658 |
|
659 /** |
|
660 * Formats the number, and inserts the resulting text into |
|
661 * toInsertInto. |
|
662 * @param number The number being formatted |
|
663 * @param toInsertInto The string where the resultant text should |
|
664 * be inserted |
|
665 * @param pos The position in toInsertInto where the resultant text |
|
666 * should be inserted |
|
667 */ |
|
668 void |
|
669 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const |
|
670 { |
|
671 // first, insert the rule's rule text into toInsertInto at the |
|
672 // specified position, then insert the results of the substitutions |
|
673 // into the right places in toInsertInto |
|
674 // [again, we have two copies of this routine that do the same thing |
|
675 // so that we don't sacrifice precision in a long by casting it |
|
676 // to a double] |
|
677 toInsertInto.insert(pos, ruleText); |
|
678 sub2->doSubstitution(number, toInsertInto, pos); |
|
679 sub1->doSubstitution(number, toInsertInto, pos); |
|
680 } |
|
681 |
|
682 /** |
|
683 * Used by the owning rule set to determine whether to invoke the |
|
684 * rollback rule (i.e., whether this rule or the one that precedes |
|
685 * it in the rule set's list should be used to format the number) |
|
686 * @param The number being formatted |
|
687 * @return True if the rule set should use the rule that precedes |
|
688 * this one in its list; false if it should use this rule |
|
689 */ |
|
690 UBool |
|
691 NFRule::shouldRollBack(double number) const |
|
692 { |
|
693 // we roll back if the rule contains a modulus substitution, |
|
694 // the number being formatted is an even multiple of the rule's |
|
695 // divisor, and the rule's base value is NOT an even multiple |
|
696 // of its divisor |
|
697 // In other words, if the original description had |
|
698 // 100: << hundred[ >>]; |
|
699 // that expands into |
|
700 // 100: << hundred; |
|
701 // 101: << hundred >>; |
|
702 // internally. But when we're formatting 200, if we use the rule |
|
703 // at 101, which would normally apply, we get "two hundred zero". |
|
704 // To prevent this, we roll back and use the rule at 100 instead. |
|
705 // This is the logic that makes this happen: the rule at 101 has |
|
706 // a modulus substitution, its base value isn't an even multiple |
|
707 // of 100, and the value we're trying to format _is_ an even |
|
708 // multiple of 100. This is called the "rollback rule." |
|
709 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { |
|
710 int64_t re = util64_pow(radix, exponent); |
|
711 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; |
|
712 } |
|
713 return FALSE; |
|
714 } |
|
715 |
|
716 //----------------------------------------------------------------------- |
|
717 // parsing |
|
718 //----------------------------------------------------------------------- |
|
719 |
|
720 /** |
|
721 * Attempts to parse the string with this rule. |
|
722 * @param text The string being parsed |
|
723 * @param parsePosition On entry, the value is ignored and assumed to |
|
724 * be 0. On exit, this has been updated with the position of the first |
|
725 * character not consumed by matching the text against this rule |
|
726 * (if this rule doesn't match the text at all, the parse position |
|
727 * is left unchanged (presumably at 0) and the function returns |
|
728 * new Long(0)). |
|
729 * @param isFractionRule True if this rule is contained within a |
|
730 * fraction rule set. This is only used if the rule has no |
|
731 * substitutions. |
|
732 * @return If this rule matched the text, this is the rule's base value |
|
733 * combined appropriately with the results of parsing the substitutions. |
|
734 * If nothing matched, this is new Long(0) and the parse position is |
|
735 * left unchanged. The result will be an instance of Long if the |
|
736 * result is an integer and Double otherwise. The result is never null. |
|
737 */ |
|
#ifdef RBNF_DEBUG
#include <stdio.h>

/**
 * Debug helper: writes the contents of a UnicodeString to a stdio stream.
 * Silently does nothing if the temporary buffer cannot be allocated.
 * @param f The stream to write to
 * @param us The string to dump
 */
static void dumpUS(FILE* f, const UnicodeString& us) {
    const int len = us.length();

    // one extra char for the terminating NUL
    char* chars = (char *)uprv_malloc((len+1)*sizeof(char));
    if (chars == NULL) {
        return;
    }

    us.extract(0, len, chars);
    chars[len] = 0;
    fprintf(f, "%s", chars);
    uprv_free(chars);
}
#endif
|
752 |
|
753 UBool |
|
754 NFRule::doParse(const UnicodeString& text, |
|
755 ParsePosition& parsePosition, |
|
756 UBool isFractionRule, |
|
757 double upperBound, |
|
758 Formattable& resVal) const |
|
759 { |
|
760 // internally we operate on a copy of the string being parsed |
|
761 // (because we're going to change it) and use our own ParsePosition |
|
762 ParsePosition pp; |
|
763 UnicodeString workText(text); |
|
764 |
|
765 // check to see whether the text before the first substitution |
|
766 // matches the text at the beginning of the string being |
|
767 // parsed. If it does, strip that off the front of workText; |
|
768 // otherwise, dump out with a mismatch |
|
769 UnicodeString prefix; |
|
770 prefix.setTo(ruleText, 0, sub1->getPos()); |
|
771 |
|
772 #ifdef RBNF_DEBUG |
|
773 fprintf(stderr, "doParse %x ", this); |
|
774 { |
|
775 UnicodeString rt; |
|
776 _appendRuleText(rt); |
|
777 dumpUS(stderr, rt); |
|
778 } |
|
779 |
|
780 fprintf(stderr, " text: '", this); |
|
781 dumpUS(stderr, text); |
|
782 fprintf(stderr, "' prefix: '"); |
|
783 dumpUS(stderr, prefix); |
|
784 #endif |
|
785 stripPrefix(workText, prefix, pp); |
|
786 int32_t prefixLength = text.length() - workText.length(); |
|
787 |
|
788 #ifdef RBNF_DEBUG |
|
789 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); |
|
790 #endif |
|
791 |
|
792 if (pp.getIndex() == 0 && sub1->getPos() != 0) { |
|
793 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
794 // restored for ICU4C port |
|
795 parsePosition.setErrorIndex(pp.getErrorIndex()); |
|
796 resVal.setLong(0); |
|
797 return TRUE; |
|
798 } |
|
799 |
|
800 // this is the fun part. The basic guts of the rule-matching |
|
801 // logic is matchToDelimiter(), which is called twice. The first |
|
802 // time it searches the input string for the rule text BETWEEN |
|
803 // the substitutions and tries to match the intervening text |
|
804 // in the input string with the first substitution. If that |
|
805 // succeeds, it then calls it again, this time to look for the |
|
806 // rule text after the second substitution and to match the |
|
807 // intervening input text against the second substitution. |
|
808 // |
|
809 // For example, say we have a rule that looks like this: |
|
810 // first << middle >> last; |
|
811 // and input text that looks like this: |
|
812 // first one middle two last |
|
813 // First we use stripPrefix() to match "first " in both places and |
|
814 // strip it off the front, leaving |
|
815 // one middle two last |
|
816 // Then we use matchToDelimiter() to match " middle " and try to |
|
817 // match "one" against a substitution. If it's successful, we now |
|
818 // have |
|
819 // two last |
|
820 // We use matchToDelimiter() a second time to match " last" and |
|
821 // try to match "two" against a substitution. If "two" matches |
|
822 // the substitution, we have a successful parse. |
|
823 // |
|
824 // Since it's possible in many cases to find multiple instances |
|
825 // of each of these pieces of rule text in the input string, |
|
826 // we need to try all the possible combinations of these |
|
827 // locations. This prevents us from prematurely declaring a mismatch, |
|
828 // and makes sure we match as much input text as we can. |
|
829 int highWaterMark = 0; |
|
830 double result = 0; |
|
831 int start = 0; |
|
832 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); |
|
833 |
|
834 UnicodeString temp; |
|
835 do { |
|
836 // our partial parse result starts out as this rule's base |
|
837 // value. If it finds a successful match, matchToDelimiter() |
|
838 // will compose this in some way with what it gets back from |
|
839 // the substitution, giving us a new partial parse result |
|
840 pp.setIndex(0); |
|
841 |
|
842 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); |
|
843 double partialResult = matchToDelimiter(workText, start, tempBaseValue, |
|
844 temp, pp, sub1, |
|
845 upperBound); |
|
846 |
|
847 // if we got a successful match (or were trying to match a |
|
848 // null substitution), pp is now pointing at the first unmatched |
|
849 // character. Take note of that, and try matchToDelimiter() |
|
850 // on the input text again |
|
851 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { |
|
852 start = pp.getIndex(); |
|
853 |
|
854 UnicodeString workText2; |
|
855 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); |
|
856 ParsePosition pp2; |
|
857 |
|
858 // the second matchToDelimiter() will compose our previous |
|
859 // partial result with whatever it gets back from its |
|
860 // substitution if there's a successful match, giving us |
|
861 // a real result |
|
862 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); |
|
863 partialResult = matchToDelimiter(workText2, 0, partialResult, |
|
864 temp, pp2, sub2, |
|
865 upperBound); |
|
866 |
|
867 // if we got a successful match on this second |
|
868 // matchToDelimiter() call, update the high-water mark |
|
869 // and result (if necessary) |
|
870 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { |
|
871 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { |
|
872 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); |
|
873 result = partialResult; |
|
874 } |
|
875 } |
|
876 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
877 // restored for ICU4C port |
|
878 else { |
|
879 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); |
|
880 if (temp> parsePosition.getErrorIndex()) { |
|
881 parsePosition.setErrorIndex(temp); |
|
882 } |
|
883 } |
|
884 } |
|
885 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
886 // restored for ICU4C port |
|
887 else { |
|
888 int32_t temp = sub1->getPos() + pp.getErrorIndex(); |
|
889 if (temp > parsePosition.getErrorIndex()) { |
|
890 parsePosition.setErrorIndex(temp); |
|
891 } |
|
892 } |
|
893 // keep trying to match things until the outer matchToDelimiter() |
|
894 // call fails to make a match (each time, it picks up where it |
|
895 // left off the previous time) |
|
896 } while (sub1->getPos() != sub2->getPos() |
|
897 && pp.getIndex() > 0 |
|
898 && pp.getIndex() < workText.length() |
|
899 && pp.getIndex() != start); |
|
900 |
|
901 // update the caller's ParsePosition with our high-water mark |
|
902 // (i.e., it now points at the first character this function |
|
903 // didn't match-- the ParsePosition is therefore unchanged if |
|
904 // we didn't match anything) |
|
905 parsePosition.setIndex(highWaterMark); |
|
906 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
907 // restored for ICU4C port |
|
908 if (highWaterMark > 0) { |
|
909 parsePosition.setErrorIndex(0); |
|
910 } |
|
911 |
|
912 // this is a hack for one unusual condition: Normally, whether this |
|
913 // rule belong to a fraction rule set or not is handled by its |
|
914 // substitutions. But if that rule HAS NO substitutions, then |
|
915 // we have to account for it here. By definition, if the matching |
|
916 // rule in a fraction rule set has no substitutions, its numerator |
|
917 // is 1, and so the result is the reciprocal of its base value. |
|
918 if (isFractionRule && |
|
919 highWaterMark > 0 && |
|
920 sub1->isNullSubstitution()) { |
|
921 result = 1 / result; |
|
922 } |
|
923 |
|
924 resVal.setDouble(result); |
|
925 return TRUE; // ??? do we need to worry if it is a long or a double? |
|
926 } |
|
927 |
|
928 /** |
|
929 * This function is used by parse() to match the text being parsed |
|
930 * against a possible prefix string. This function |
|
931 * matches characters from the beginning of the string being parsed |
|
932 * to characters from the prospective prefix. If they match, pp is |
|
933 * updated to the first character not matched, and the result is |
|
934 * the unparsed part of the string. If they don't match, the whole |
|
935 * string is returned, and pp is left unchanged. |
|
936 * @param text The string being parsed |
|
937 * @param prefix The text to match against |
|
938 * @param pp On entry, ignored and assumed to be 0. On exit, points |
|
939 * to the first unmatched character (assuming the whole prefix matched), |
|
940 * or is unchanged (if the whole prefix didn't match). |
|
941 * @return If things match, this is the unparsed part of "text"; |
|
942 * if they didn't match, this is "text". |
|
943 */ |
|
944 void |
|
945 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const |
|
946 { |
|
947 // if the prefix text is empty, dump out without doing anything |
|
948 if (prefix.length() != 0) { |
|
949 UErrorCode status = U_ZERO_ERROR; |
|
950 // use prefixLength() to match the beginning of |
|
951 // "text" against "prefix". This function returns the |
|
952 // number of characters from "text" that matched (or 0 if |
|
953 // we didn't match the whole prefix) |
|
954 int32_t pfl = prefixLength(text, prefix, status); |
|
955 if (U_FAILURE(status)) { // Memory allocation error. |
|
956 return; |
|
957 } |
|
958 if (pfl != 0) { |
|
959 // if we got a successful match, update the parse position |
|
960 // and strip the prefix off of "text" |
|
961 pp.setIndex(pp.getIndex() + pfl); |
|
962 text.remove(0, pfl); |
|
963 } |
|
964 } |
|
965 } |
|
966 |
|
967 /** |
|
968 * Used by parse() to match a substitution and any following text. |
|
969 * "text" is searched for instances of "delimiter". For each instance |
|
970 * of delimiter, the intervening text is tested to see whether it |
|
971 * matches the substitution. The longest match wins. |
|
972 * @param text The string being parsed |
|
973 * @param startPos The position in "text" where we should start looking |
|
974 * for "delimiter". |
|
975 * @param baseValue A partial parse result (often the rule's base value), |
|
976 * which is combined with the result from matching the substitution |
|
977 * @param delimiter The string to search "text" for. |
|
978 * @param pp Ignored and presumed to be 0 on entry. If there's a match, |
|
979 * on exit this will point to the first unmatched character. |
|
980 * @param sub If we find "delimiter" in "text", this substitution is used |
|
981 * to match the text between the beginning of the string and the |
|
982 * position of "delimiter." (If "delimiter" is the empty string, then |
|
983 * this function just matches against this substitution and updates |
|
984 * everything accordingly.) |
|
985 * @param upperBound When matching the substitution, it will only |
|
986 * consider rules with base values lower than this value. |
|
987 * @return If there's a match, this is the result of composing |
|
988 * baseValue with the result of matching the substitution. Otherwise, |
|
989 * this is new Long(0). It's never null. If the result is an integer, |
|
990 * this will be an instance of Long; otherwise, it's an instance of |
|
991 * Double. |
|
992 * |
|
993 * !!! note {dlf} in point of fact, in the java code the caller always converts |
|
994 * the result to a double, so we might as well return one. |
|
995 */ |
|
996 double |
|
997 NFRule::matchToDelimiter(const UnicodeString& text, |
|
998 int32_t startPos, |
|
999 double _baseValue, |
|
1000 const UnicodeString& delimiter, |
|
1001 ParsePosition& pp, |
|
1002 const NFSubstitution* sub, |
|
1003 double upperBound) const |
|
1004 { |
|
1005 UErrorCode status = U_ZERO_ERROR; |
|
1006 // if "delimiter" contains real (i.e., non-ignorable) text, search |
|
1007 // it for "delimiter" beginning at "start". If that succeeds, then |
|
1008 // use "sub"'s doParse() method to match the text before the |
|
1009 // instance of "delimiter" we just found. |
|
1010 if (!allIgnorable(delimiter, status)) { |
|
1011 if (U_FAILURE(status)) { //Memory allocation error. |
|
1012 return 0; |
|
1013 } |
|
1014 ParsePosition tempPP; |
|
1015 Formattable result; |
|
1016 |
|
1017 // use findText() to search for "delimiter". It returns a two- |
|
1018 // element array: element 0 is the position of the match, and |
|
1019 // element 1 is the number of characters that matched |
|
1020 // "delimiter". |
|
1021 int32_t dLen; |
|
1022 int32_t dPos = findText(text, delimiter, startPos, &dLen); |
|
1023 |
|
1024 // if findText() succeeded, isolate the text preceding the |
|
1025 // match, and use "sub" to match that text |
|
1026 while (dPos >= 0) { |
|
1027 UnicodeString subText; |
|
1028 subText.setTo(text, 0, dPos); |
|
1029 if (subText.length() > 0) { |
|
1030 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, |
|
1031 #if UCONFIG_NO_COLLATION |
|
1032 FALSE, |
|
1033 #else |
|
1034 formatter->isLenient(), |
|
1035 #endif |
|
1036 result); |
|
1037 |
|
1038 // if the substitution could match all the text up to |
|
1039 // where we found "delimiter", then this function has |
|
1040 // a successful match. Bump the caller's parse position |
|
1041 // to point to the first character after the text |
|
1042 // that matches "delimiter", and return the result |
|
1043 // we got from parsing the substitution. |
|
1044 if (success && tempPP.getIndex() == dPos) { |
|
1045 pp.setIndex(dPos + dLen); |
|
1046 return result.getDouble(); |
|
1047 } |
|
1048 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
1049 // restored for ICU4C port |
|
1050 else { |
|
1051 if (tempPP.getErrorIndex() > 0) { |
|
1052 pp.setErrorIndex(tempPP.getErrorIndex()); |
|
1053 } else { |
|
1054 pp.setErrorIndex(tempPP.getIndex()); |
|
1055 } |
|
1056 } |
|
1057 } |
|
1058 |
|
1059 // if we didn't match the substitution, search for another |
|
1060 // copy of "delimiter" in "text" and repeat the loop if |
|
1061 // we find it |
|
1062 tempPP.setIndex(0); |
|
1063 dPos = findText(text, delimiter, dPos + dLen, &dLen); |
|
1064 } |
|
1065 // if we make it here, this was an unsuccessful match, and we |
|
1066 // leave pp unchanged and return 0 |
|
1067 pp.setIndex(0); |
|
1068 return 0; |
|
1069 |
|
1070 // if "delimiter" is empty, or consists only of ignorable characters |
|
1071 // (i.e., is semantically empty), thwe we obviously can't search |
|
1072 // for "delimiter". Instead, just use "sub" to parse as much of |
|
1073 // "text" as possible. |
|
1074 } else { |
|
1075 ParsePosition tempPP; |
|
1076 Formattable result; |
|
1077 |
|
1078 // try to match the whole string against the substitution |
|
1079 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, |
|
1080 #if UCONFIG_NO_COLLATION |
|
1081 FALSE, |
|
1082 #else |
|
1083 formatter->isLenient(), |
|
1084 #endif |
|
1085 result); |
|
1086 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { |
|
1087 // if there's a successful match (or it's a null |
|
1088 // substitution), update pp to point to the first |
|
1089 // character we didn't match, and pass the result from |
|
1090 // sub.doParse() on through to the caller |
|
1091 pp.setIndex(tempPP.getIndex()); |
|
1092 return result.getDouble(); |
|
1093 } |
|
1094 // commented out because ParsePosition doesn't have error index in 1.1.x |
|
1095 // restored for ICU4C port |
|
1096 else { |
|
1097 pp.setErrorIndex(tempPP.getErrorIndex()); |
|
1098 } |
|
1099 |
|
1100 // and if we get to here, then nothing matched, so we return |
|
1101 // 0 and leave pp alone |
|
1102 return 0; |
|
1103 } |
|
1104 } |
|
1105 |
|
1106 /** |
|
1107 * Used by stripPrefix() to match characters. If lenient parse mode |
|
1108 * is off, this just calls startsWith(). If lenient parse mode is on, |
|
1109 * this function uses CollationElementIterators to match characters in |
|
1110 * the strings (only primary-order differences are significant in |
|
1111 * determining whether there's a match). |
|
1112 * @param str The string being tested |
|
1113 * @param prefix The text we're hoping to see at the beginning |
|
1114 * of "str" |
|
1115 * @return If "prefix" is found at the beginning of "str", this |
|
1116 * is the number of characters in "str" that were matched (this |
|
1117 * isn't necessarily the same as the length of "prefix" when matching |
|
1118 * text with a collator). If there's no match, this is 0. |
|
1119 */ |
|
1120 int32_t |
|
1121 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const |
|
1122 { |
|
1123 // if we're looking for an empty prefix, it obviously matches |
|
1124 // zero characters. Just go ahead and return 0. |
|
1125 if (prefix.length() == 0) { |
|
1126 return 0; |
|
1127 } |
|
1128 |
|
1129 #if !UCONFIG_NO_COLLATION |
|
1130 // go through all this grief if we're in lenient-parse mode |
|
1131 if (formatter->isLenient()) { |
|
1132 // get the formatter's collator and use it to create two |
|
1133 // collation element iterators, one over the target string |
|
1134 // and another over the prefix (right now, we'll throw an |
|
1135 // exception if the collator we get back from the formatter |
|
1136 // isn't a RuleBasedCollator, because RuleBasedCollator defines |
|
1137 // the CollationElementIterator protocol. Hopefully, this |
|
1138 // will change someday.) |
|
1139 RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator(); |
|
1140 CollationElementIterator* strIter = collator->createCollationElementIterator(str); |
|
1141 CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); |
|
1142 // Check for memory allocation error. |
|
1143 if (collator == NULL || strIter == NULL || prefixIter == NULL) { |
|
1144 delete collator; |
|
1145 delete strIter; |
|
1146 delete prefixIter; |
|
1147 status = U_MEMORY_ALLOCATION_ERROR; |
|
1148 return 0; |
|
1149 } |
|
1150 |
|
1151 UErrorCode err = U_ZERO_ERROR; |
|
1152 |
|
1153 // The original code was problematic. Consider this match: |
|
1154 // prefix = "fifty-" |
|
1155 // string = " fifty-7" |
|
1156 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 |
|
1157 // in the string. Unfortunately, we were getting a match, and then computing where |
|
1158 // the match terminated by rematching the string. The rematch code was using as an |
|
1159 // initial guess the substring of string between 0 and prefix.length. Because of |
|
1160 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving |
|
1161 // the position before the hyphen in the string. Recursing down, we then parsed the |
|
1162 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). |
|
1163 // This was not pretty, especially since the string "fifty-7" parsed just fine. |
|
1164 // |
|
1165 // We have newer APIs now, so we can use calls on the iterator to determine what we |
|
1166 // matched up to. If we terminate because we hit the last element in the string, |
|
1167 // our match terminates at this length. If we terminate because we hit the last element |
|
1168 // in the target, our match terminates at one before the element iterator position. |
|
1169 |
|
1170 // match collation elements between the strings |
|
1171 int32_t oStr = strIter->next(err); |
|
1172 int32_t oPrefix = prefixIter->next(err); |
|
1173 |
|
1174 while (oPrefix != CollationElementIterator::NULLORDER) { |
|
1175 // skip over ignorable characters in the target string |
|
1176 while (CollationElementIterator::primaryOrder(oStr) == 0 |
|
1177 && oStr != CollationElementIterator::NULLORDER) { |
|
1178 oStr = strIter->next(err); |
|
1179 } |
|
1180 |
|
1181 // skip over ignorable characters in the prefix |
|
1182 while (CollationElementIterator::primaryOrder(oPrefix) == 0 |
|
1183 && oPrefix != CollationElementIterator::NULLORDER) { |
|
1184 oPrefix = prefixIter->next(err); |
|
1185 } |
|
1186 |
|
1187 // dlf: move this above following test, if we consume the |
|
1188 // entire target, aren't we ok even if the source was also |
|
1189 // entirely consumed? |
|
1190 |
|
1191 // if skipping over ignorables brought to the end of |
|
1192 // the prefix, we DID match: drop out of the loop |
|
1193 if (oPrefix == CollationElementIterator::NULLORDER) { |
|
1194 break; |
|
1195 } |
|
1196 |
|
1197 // if skipping over ignorables brought us to the end |
|
1198 // of the target string, we didn't match and return 0 |
|
1199 if (oStr == CollationElementIterator::NULLORDER) { |
|
1200 delete prefixIter; |
|
1201 delete strIter; |
|
1202 return 0; |
|
1203 } |
|
1204 |
|
1205 // match collation elements from the two strings |
|
1206 // (considering only primary differences). If we |
|
1207 // get a mismatch, dump out and return 0 |
|
1208 if (CollationElementIterator::primaryOrder(oStr) |
|
1209 != CollationElementIterator::primaryOrder(oPrefix)) { |
|
1210 delete prefixIter; |
|
1211 delete strIter; |
|
1212 return 0; |
|
1213 |
|
1214 // otherwise, advance to the next character in each string |
|
1215 // and loop (we drop out of the loop when we exhaust |
|
1216 // collation elements in the prefix) |
|
1217 } else { |
|
1218 oStr = strIter->next(err); |
|
1219 oPrefix = prefixIter->next(err); |
|
1220 } |
|
1221 } |
|
1222 |
|
1223 int32_t result = strIter->getOffset(); |
|
1224 if (oStr != CollationElementIterator::NULLORDER) { |
|
1225 --result; // back over character that we don't want to consume; |
|
1226 } |
|
1227 |
|
1228 #ifdef RBNF_DEBUG |
|
1229 fprintf(stderr, "prefix length: %d\n", result); |
|
1230 #endif |
|
1231 delete prefixIter; |
|
1232 delete strIter; |
|
1233 |
|
1234 return result; |
|
1235 #if 0 |
|
1236 //---------------------------------------------------------------- |
|
1237 // JDK 1.2-specific API call |
|
1238 // return strIter.getOffset(); |
|
1239 //---------------------------------------------------------------- |
|
1240 // JDK 1.1 HACK (take out for 1.2-specific code) |
|
1241 |
|
1242 // if we make it to here, we have a successful match. Now we |
|
1243 // have to find out HOW MANY characters from the target string |
|
1244 // matched the prefix (there isn't necessarily a one-to-one |
|
1245 // mapping between collation elements and characters). |
|
1246 // In JDK 1.2, there's a simple getOffset() call we can use. |
|
1247 // In JDK 1.1, on the other hand, we have to go through some |
|
1248 // ugly contortions. First, use the collator to compare the |
|
1249 // same number of characters from the prefix and target string. |
|
1250 // If they're equal, we're done. |
|
1251 collator->setStrength(Collator::PRIMARY); |
|
1252 if (str.length() >= prefix.length()) { |
|
1253 UnicodeString temp; |
|
1254 temp.setTo(str, 0, prefix.length()); |
|
1255 if (collator->equals(temp, prefix)) { |
|
1256 #ifdef RBNF_DEBUG |
|
1257 fprintf(stderr, "returning: %d\n", prefix.length()); |
|
1258 #endif |
|
1259 return prefix.length(); |
|
1260 } |
|
1261 } |
|
1262 |
|
1263 // if they're not equal, then we have to compare successively |
|
1264 // larger and larger substrings of the target string until we |
|
1265 // get to one that matches the prefix. At that point, we know |
|
1266 // how many characters matched the prefix, and we can return. |
|
1267 int32_t p = 1; |
|
1268 while (p <= str.length()) { |
|
1269 UnicodeString temp; |
|
1270 temp.setTo(str, 0, p); |
|
1271 if (collator->equals(temp, prefix)) { |
|
1272 return p; |
|
1273 } else { |
|
1274 ++p; |
|
1275 } |
|
1276 } |
|
1277 |
|
1278 // SHOULD NEVER GET HERE!!! |
|
1279 return 0; |
|
1280 //---------------------------------------------------------------- |
|
1281 #endif |
|
1282 |
|
1283 // If lenient parsing is turned off, forget all that crap above. |
|
1284 // Just use String.startsWith() and be done with it. |
|
1285 } else |
|
1286 #endif |
|
1287 { |
|
1288 if (str.startsWith(prefix)) { |
|
1289 return prefix.length(); |
|
1290 } else { |
|
1291 return 0; |
|
1292 } |
|
1293 } |
|
1294 } |
|
1295 |
|
1296 /** |
|
1297 * Searches a string for another string. If lenient parsing is off, |
|
1298 * this just calls indexOf(). If lenient parsing is on, this function |
|
1299 * uses CollationElementIterator to match characters, and only |
|
1300 * primary-order differences are significant in determining whether |
|
1301 * there's a match. |
|
1302 * @param str The string to search |
|
1303 * @param key The string to search "str" for |
|
1304 * @param startingAt The index into "str" where the search is to |
|
1305 * begin |
|
1306 * @return A two-element array of ints. Element 0 is the position |
|
1307 * of the match, or -1 if there was no match. Element 1 is the |
|
1308 * number of characters in "str" that matched (which isn't necessarily |
|
1309 * the same as the length of "key") |
|
1310 */ |
|
1311 int32_t |
|
1312 NFRule::findText(const UnicodeString& str, |
|
1313 const UnicodeString& key, |
|
1314 int32_t startingAt, |
|
1315 int32_t* length) const |
|
1316 { |
|
1317 #if !UCONFIG_NO_COLLATION |
|
1318 // if lenient parsing is turned off, this is easy: just call |
|
1319 // String.indexOf() and we're done |
|
1320 if (!formatter->isLenient()) { |
|
1321 *length = key.length(); |
|
1322 return str.indexOf(key, startingAt); |
|
1323 |
|
1324 // but if lenient parsing is turned ON, we've got some work |
|
1325 // ahead of us |
|
1326 } else |
|
1327 #endif |
|
1328 { |
|
1329 //---------------------------------------------------------------- |
|
1330 // JDK 1.1 HACK (take out of 1.2-specific code) |
|
1331 |
|
1332 // in JDK 1.2, CollationElementIterator provides us with an |
|
1333 // API to map between character offsets and collation elements |
|
1334 // and we can do this by marching through the string comparing |
|
1335 // collation elements. We can't do that in JDK 1.1. Insted, |
|
1336 // we have to go through this horrible slow mess: |
|
1337 int32_t p = startingAt; |
|
1338 int32_t keyLen = 0; |
|
1339 |
|
1340 // basically just isolate smaller and smaller substrings of |
|
1341 // the target string (each running to the end of the string, |
|
1342 // and with the first one running from startingAt to the end) |
|
1343 // and then use prefixLength() to see if the search key is at |
|
1344 // the beginning of each substring. This is excruciatingly |
|
1345 // slow, but it will locate the key and tell use how long the |
|
1346 // matching text was. |
|
1347 UnicodeString temp; |
|
1348 UErrorCode status = U_ZERO_ERROR; |
|
1349 while (p < str.length() && keyLen == 0) { |
|
1350 temp.setTo(str, p, str.length() - p); |
|
1351 keyLen = prefixLength(temp, key, status); |
|
1352 if (U_FAILURE(status)) { |
|
1353 break; |
|
1354 } |
|
1355 if (keyLen != 0) { |
|
1356 *length = keyLen; |
|
1357 return p; |
|
1358 } |
|
1359 ++p; |
|
1360 } |
|
1361 // if we make it to here, we didn't find it. Return -1 for the |
|
1362 // location. The length should be ignored, but set it to 0, |
|
1363 // which should be "safe" |
|
1364 *length = 0; |
|
1365 return -1; |
|
1366 |
|
1367 //---------------------------------------------------------------- |
|
1368 // JDK 1.2 version of this routine |
|
1369 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); |
|
1370 // |
|
1371 //CollationElementIterator strIter = collator.getCollationElementIterator(str); |
|
1372 //CollationElementIterator keyIter = collator.getCollationElementIterator(key); |
|
1373 // |
|
1374 //int keyStart = -1; |
|
1375 // |
|
1376 //str.setOffset(startingAt); |
|
1377 // |
|
1378 //int oStr = strIter.next(); |
|
1379 //int oKey = keyIter.next(); |
|
1380 //while (oKey != CollationElementIterator.NULLORDER) { |
|
1381 // while (oStr != CollationElementIterator.NULLORDER && |
|
1382 // CollationElementIterator.primaryOrder(oStr) == 0) |
|
1383 // oStr = strIter.next(); |
|
1384 // |
|
1385 // while (oKey != CollationElementIterator.NULLORDER && |
|
1386 // CollationElementIterator.primaryOrder(oKey) == 0) |
|
1387 // oKey = keyIter.next(); |
|
1388 // |
|
1389 // if (oStr == CollationElementIterator.NULLORDER) { |
|
1390 // return new int[] { -1, 0 }; |
|
1391 // } |
|
1392 // |
|
1393 // if (oKey == CollationElementIterator.NULLORDER) { |
|
1394 // break; |
|
1395 // } |
|
1396 // |
|
1397 // if (CollationElementIterator.primaryOrder(oStr) == |
|
1398 // CollationElementIterator.primaryOrder(oKey)) { |
|
1399 // keyStart = strIter.getOffset(); |
|
1400 // oStr = strIter.next(); |
|
1401 // oKey = keyIter.next(); |
|
1402 // } else { |
|
1403 // if (keyStart != -1) { |
|
1404 // keyStart = -1; |
|
1405 // keyIter.reset(); |
|
1406 // } else { |
|
1407 // oStr = strIter.next(); |
|
1408 // } |
|
1409 // } |
|
1410 //} |
|
1411 // |
|
1412 //if (oKey == CollationElementIterator.NULLORDER) { |
|
1413 // return new int[] { keyStart, strIter.getOffset() - keyStart }; |
|
1414 //} else { |
|
1415 // return new int[] { -1, 0 }; |
|
1416 //} |
|
1417 } |
|
1418 } |
|
1419 |
|
1420 /** |
|
1421 * Checks to see whether a string consists entirely of ignorable |
|
1422 * characters. |
|
1423 * @param str The string to test. |
|
1424 * @return true if the string is empty of consists entirely of |
|
1425 * characters that the number formatter's collator says are |
|
1426 * ignorable at the primary-order level. false otherwise. |
|
1427 */ |
|
1428 UBool |
|
1429 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const |
|
1430 { |
|
1431 // if the string is empty, we can just return true |
|
1432 if (str.length() == 0) { |
|
1433 return TRUE; |
|
1434 } |
|
1435 |
|
1436 #if !UCONFIG_NO_COLLATION |
|
1437 // if lenient parsing is turned on, walk through the string with |
|
1438 // a collation element iterator and make sure each collation |
|
1439 // element is 0 (ignorable) at the primary level |
|
1440 if (formatter->isLenient()) { |
|
1441 RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator()); |
|
1442 CollationElementIterator* iter = collator->createCollationElementIterator(str); |
|
1443 |
|
1444 // Memory allocation error check. |
|
1445 if (collator == NULL || iter == NULL) { |
|
1446 delete collator; |
|
1447 delete iter; |
|
1448 status = U_MEMORY_ALLOCATION_ERROR; |
|
1449 return FALSE; |
|
1450 } |
|
1451 |
|
1452 UErrorCode err = U_ZERO_ERROR; |
|
1453 int32_t o = iter->next(err); |
|
1454 while (o != CollationElementIterator::NULLORDER |
|
1455 && CollationElementIterator::primaryOrder(o) == 0) { |
|
1456 o = iter->next(err); |
|
1457 } |
|
1458 |
|
1459 delete iter; |
|
1460 return o == CollationElementIterator::NULLORDER; |
|
1461 } |
|
1462 #endif |
|
1463 |
|
1464 // if lenient parsing is turned off, there is no such thing as |
|
1465 // an ignorable character: return true only if the string is empty |
|
1466 return FALSE; |
|
1467 } |
|
1468 |
|
1469 U_NAMESPACE_END |
|
1470 |
|
1471 /* U_HAVE_RBNF */ |
|
1472 #endif |
|
1473 |
|
1474 |