| |
1 /* |
| |
2 ****************************************************************************** |
| |
3 * Copyright (C) 1997-2011, International Business Machines |
| |
4 * Corporation and others. All Rights Reserved. |
| |
5 ****************************************************************************** |
| |
6 * file name: nfrule.cpp |
| |
7 * encoding: US-ASCII |
| |
8 * tab size: 8 (not used) |
| |
9 * indentation:4 |
| |
10 * |
| |
11 * Modification history |
| |
12 * Date Name Comments |
| |
13 * 10/11/2001 Doug Ported from ICU4J |
| |
14 */ |
| |
15 |
| |
16 #include "nfrule.h" |
| |
17 |
| |
18 #if U_HAVE_RBNF |
| |
19 |
| |
20 #include "unicode/rbnf.h" |
| |
21 #include "unicode/tblcoll.h" |
| |
22 #include "unicode/coleitr.h" |
| |
23 #include "unicode/uchar.h" |
| |
24 #include "nfrs.h" |
| |
25 #include "nfrlist.h" |
| |
26 #include "nfsubs.h" |
| |
27 #include "patternprops.h" |
| |
28 |
| |
29 U_NAMESPACE_BEGIN |
| |
30 |
| |
31 NFRule::NFRule(const RuleBasedNumberFormat* _rbnf) |
| |
32 : baseValue((int32_t)0) |
| |
33 , radix(0) |
| |
34 , exponent(0) |
| |
35 , ruleText() |
| |
36 , sub1(NULL) |
| |
37 , sub2(NULL) |
| |
38 , formatter(_rbnf) |
| |
39 { |
| |
40 } |
| |
41 |
| |
42 NFRule::~NFRule() |
| |
43 { |
| |
44 delete sub1; |
| |
45 delete sub2; |
| |
46 } |
| |
47 |
| |
48 static const UChar gLeftBracket = 0x005b; |
| |
49 static const UChar gRightBracket = 0x005d; |
| |
50 static const UChar gColon = 0x003a; |
| |
51 static const UChar gZero = 0x0030; |
| |
52 static const UChar gNine = 0x0039; |
| |
53 static const UChar gSpace = 0x0020; |
| |
54 static const UChar gSlash = 0x002f; |
| |
55 static const UChar gGreaterThan = 0x003e; |
| |
56 static const UChar gLessThan = 0x003c; |
| |
57 static const UChar gComma = 0x002c; |
| |
58 static const UChar gDot = 0x002e; |
| |
59 static const UChar gTick = 0x0027; |
| |
60 //static const UChar gMinus = 0x002d; |
| |
61 static const UChar gSemicolon = 0x003b; |
| |
62 |
| |
63 static const UChar gMinusX[] = {0x2D, 0x78, 0}; /* "-x" */ |
| |
64 static const UChar gXDotX[] = {0x78, 0x2E, 0x78, 0}; /* "x.x" */ |
| |
65 static const UChar gXDotZero[] = {0x78, 0x2E, 0x30, 0}; /* "x.0" */ |
| |
66 static const UChar gZeroDotX[] = {0x30, 0x2E, 0x78, 0}; /* "0.x" */ |
| |
67 |
| |
68 static const UChar gLessLess[] = {0x3C, 0x3C, 0}; /* "<<" */ |
| |
69 static const UChar gLessPercent[] = {0x3C, 0x25, 0}; /* "<%" */ |
| |
70 static const UChar gLessHash[] = {0x3C, 0x23, 0}; /* "<#" */ |
| |
71 static const UChar gLessZero[] = {0x3C, 0x30, 0}; /* "<0" */ |
| |
72 static const UChar gGreaterGreater[] = {0x3E, 0x3E, 0}; /* ">>" */ |
| |
73 static const UChar gGreaterPercent[] = {0x3E, 0x25, 0}; /* ">%" */ |
| |
74 static const UChar gGreaterHash[] = {0x3E, 0x23, 0}; /* ">#" */ |
| |
75 static const UChar gGreaterZero[] = {0x3E, 0x30, 0}; /* ">0" */ |
| |
76 static const UChar gEqualPercent[] = {0x3D, 0x25, 0}; /* "=%" */ |
| |
77 static const UChar gEqualHash[] = {0x3D, 0x23, 0}; /* "=#" */ |
| |
78 static const UChar gEqualZero[] = {0x3D, 0x30, 0}; /* "=0" */ |
| |
79 static const UChar gGreaterGreaterGreater[] = {0x3E, 0x3E, 0x3E, 0}; /* ">>>" */ |
| |
80 |
| |
81 static const UChar * const tokenStrings[] = { |
| |
82 gLessLess, gLessPercent, gLessHash, gLessZero, |
| |
83 gGreaterGreater, gGreaterPercent,gGreaterHash, gGreaterZero, |
| |
84 gEqualPercent, gEqualHash, gEqualZero, NULL |
| |
85 }; |
| |
86 |
| |
87 void |
| |
88 NFRule::makeRules(UnicodeString& description, |
| |
89 const NFRuleSet *ruleSet, |
| |
90 const NFRule *predecessor, |
| |
91 const RuleBasedNumberFormat *rbnf, |
| |
92 NFRuleList& rules, |
| |
93 UErrorCode& status) |
| |
94 { |
| |
95 // we know we're making at least one rule, so go ahead and |
| |
96 // new it up and initialize its basevalue and divisor |
| |
97 // (this also strips the rule descriptor, if any, off the |
| |
98 // descripton string) |
| |
99 NFRule* rule1 = new NFRule(rbnf); |
| |
100 /* test for NULL */ |
| |
101 if (rule1 == 0) { |
| |
102 status = U_MEMORY_ALLOCATION_ERROR; |
| |
103 return; |
| |
104 } |
| |
105 rule1->parseRuleDescriptor(description, status); |
| |
106 |
| |
107 // check the description to see whether there's text enclosed |
| |
108 // in brackets |
| |
109 int32_t brack1 = description.indexOf(gLeftBracket); |
| |
110 int32_t brack2 = description.indexOf(gRightBracket); |
| |
111 |
| |
112 // if the description doesn't contain a matched pair of brackets, |
| |
113 // or if it's of a type that doesn't recognize bracketed text, |
| |
114 // then leave the description alone, initialize the rule's |
| |
115 // rule text and substitutions, and return that rule |
| |
116 if (brack1 == -1 || brack2 == -1 || brack1 > brack2 |
| |
117 || rule1->getType() == kProperFractionRule |
| |
118 || rule1->getType() == kNegativeNumberRule) { |
| |
119 rule1->ruleText = description; |
| |
120 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| |
121 rules.add(rule1); |
| |
122 } else { |
| |
123 // if the description does contain a matched pair of brackets, |
| |
124 // then it's really shorthand for two rules (with one exception) |
| |
125 NFRule* rule2 = NULL; |
| |
126 UnicodeString sbuf; |
| |
127 |
| |
128 // we'll actually only split the rule into two rules if its |
| |
129 // base value is an even multiple of its divisor (or it's one |
| |
130 // of the special rules) |
| |
131 if ((rule1->baseValue > 0 |
| |
132 && (rule1->baseValue % util64_pow(rule1->radix, rule1->exponent)) == 0) |
| |
133 || rule1->getType() == kImproperFractionRule |
| |
134 || rule1->getType() == kMasterRule) { |
| |
135 |
| |
136 // if it passes that test, new up the second rule. If the |
| |
137 // rule set both rules will belong to is a fraction rule |
| |
138 // set, they both have the same base value; otherwise, |
| |
139 // increment the original rule's base value ("rule1" actually |
| |
140 // goes SECOND in the rule set's rule list) |
| |
141 rule2 = new NFRule(rbnf); |
| |
142 /* test for NULL */ |
| |
143 if (rule2 == 0) { |
| |
144 status = U_MEMORY_ALLOCATION_ERROR; |
| |
145 return; |
| |
146 } |
| |
147 if (rule1->baseValue >= 0) { |
| |
148 rule2->baseValue = rule1->baseValue; |
| |
149 if (!ruleSet->isFractionRuleSet()) { |
| |
150 ++rule1->baseValue; |
| |
151 } |
| |
152 } |
| |
153 |
| |
154 // if the description began with "x.x" and contains bracketed |
| |
155 // text, it describes both the improper fraction rule and |
| |
156 // the proper fraction rule |
| |
157 else if (rule1->getType() == kImproperFractionRule) { |
| |
158 rule2->setType(kProperFractionRule); |
| |
159 } |
| |
160 |
| |
161 // if the description began with "x.0" and contains bracketed |
| |
162 // text, it describes both the master rule and the |
| |
163 // improper fraction rule |
| |
164 else if (rule1->getType() == kMasterRule) { |
| |
165 rule2->baseValue = rule1->baseValue; |
| |
166 rule1->setType(kImproperFractionRule); |
| |
167 } |
| |
168 |
| |
169 // both rules have the same radix and exponent (i.e., the |
| |
170 // same divisor) |
| |
171 rule2->radix = rule1->radix; |
| |
172 rule2->exponent = rule1->exponent; |
| |
173 |
| |
174 // rule2's rule text omits the stuff in brackets: initalize |
| |
175 // its rule text and substitutions accordingly |
| |
176 sbuf.append(description, 0, brack1); |
| |
177 if (brack2 + 1 < description.length()) { |
| |
178 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
| |
179 } |
| |
180 rule2->ruleText.setTo(sbuf); |
| |
181 rule2->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| |
182 } |
| |
183 |
| |
184 // rule1's text includes the text in the brackets but omits |
| |
185 // the brackets themselves: initialize _its_ rule text and |
| |
186 // substitutions accordingly |
| |
187 sbuf.setTo(description, 0, brack1); |
| |
188 sbuf.append(description, brack1 + 1, brack2 - brack1 - 1); |
| |
189 if (brack2 + 1 < description.length()) { |
| |
190 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1); |
| |
191 } |
| |
192 rule1->ruleText.setTo(sbuf); |
| |
193 rule1->extractSubstitutions(ruleSet, predecessor, rbnf, status); |
| |
194 |
| |
195 // if we only have one rule, return it; if we have two, return |
| |
196 // a two-element array containing them (notice that rule2 goes |
| |
197 // BEFORE rule1 in the list: in all cases, rule2 OMITS the |
| |
198 // material in the brackets and rule1 INCLUDES the material |
| |
199 // in the brackets) |
| |
200 if (rule2 != NULL) { |
| |
201 rules.add(rule2); |
| |
202 } |
| |
203 rules.add(rule1); |
| |
204 } |
| |
205 } |
| |
206 |
| |
207 /** |
| |
208 * This function parses the rule's rule descriptor (i.e., the base |
| |
209 * value and/or other tokens that precede the rule's rule text |
| |
210 * in the description) and sets the rule's base value, radix, and |
| |
211 * exponent according to the descriptor. (If the description doesn't |
| |
212 * include a rule descriptor, then this function sets everything to |
| |
213 * default values and the rule set sets the rule's real base value). |
| |
214 * @param description The rule's description |
| |
215 * @return If "description" included a rule descriptor, this is |
| |
216 * "description" with the descriptor and any trailing whitespace |
| |
217 * stripped off. Otherwise; it's "descriptor" unchangd. |
| |
218 */ |
| |
219 void |
| |
220 NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status) |
| |
221 { |
| |
222 // the description consists of a rule descriptor and a rule body, |
| |
223 // separated by a colon. The rule descriptor is optional. If |
| |
224 // it's omitted, just set the base value to 0. |
| |
225 int32_t p = description.indexOf(gColon); |
| |
226 if (p == -1) { |
| |
227 setBaseValue((int32_t)0, status); |
| |
228 } else { |
| |
229 // copy the descriptor out into its own string and strip it, |
| |
230 // along with any trailing whitespace, out of the original |
| |
231 // description |
| |
232 UnicodeString descriptor; |
| |
233 descriptor.setTo(description, 0, p); |
| |
234 |
| |
235 ++p; |
| |
236 while (p < description.length() && PatternProps::isWhiteSpace(description.charAt(p))) { |
| |
237 ++p; |
| |
238 } |
| |
239 description.removeBetween(0, p); |
| |
240 |
| |
241 // check first to see if the rule descriptor matches the token |
| |
242 // for one of the special rules. If it does, set the base |
| |
243 // value to the correct identfier value |
| |
244 if (0 == descriptor.compare(gMinusX, 2)) { |
| |
245 setType(kNegativeNumberRule); |
| |
246 } |
| |
247 else if (0 == descriptor.compare(gXDotX, 3)) { |
| |
248 setType(kImproperFractionRule); |
| |
249 } |
| |
250 else if (0 == descriptor.compare(gZeroDotX, 3)) { |
| |
251 setType(kProperFractionRule); |
| |
252 } |
| |
253 else if (0 == descriptor.compare(gXDotZero, 3)) { |
| |
254 setType(kMasterRule); |
| |
255 } |
| |
256 |
| |
257 // if the rule descriptor begins with a digit, it's a descriptor |
| |
258 // for a normal rule |
| |
259 // since we don't have Long.parseLong, and this isn't much work anyway, |
| |
260 // just build up the value as we encounter the digits. |
| |
261 else if (descriptor.charAt(0) >= gZero && descriptor.charAt(0) <= gNine) { |
| |
262 int64_t val = 0; |
| |
263 p = 0; |
| |
264 UChar c = gSpace; |
| |
265 |
| |
266 // begin parsing the descriptor: copy digits |
| |
267 // into "tempValue", skip periods, commas, and spaces, |
| |
268 // stop on a slash or > sign (or at the end of the string), |
| |
269 // and throw an exception on any other character |
| |
270 int64_t ll_10 = 10; |
| |
271 while (p < descriptor.length()) { |
| |
272 c = descriptor.charAt(p); |
| |
273 if (c >= gZero && c <= gNine) { |
| |
274 val = val * ll_10 + (int32_t)(c - gZero); |
| |
275 } |
| |
276 else if (c == gSlash || c == gGreaterThan) { |
| |
277 break; |
| |
278 } |
| |
279 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
| |
280 } |
| |
281 else { |
| |
282 // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
| |
283 status = U_PARSE_ERROR; |
| |
284 return; |
| |
285 } |
| |
286 ++p; |
| |
287 } |
| |
288 |
| |
289 // we have the base value, so set it |
| |
290 setBaseValue(val, status); |
| |
291 |
| |
292 // if we stopped the previous loop on a slash, we're |
| |
293 // now parsing the rule's radix. Again, accumulate digits |
| |
294 // in tempValue, skip punctuation, stop on a > mark, and |
| |
295 // throw an exception on anything else |
| |
296 if (c == gSlash) { |
| |
297 val = 0; |
| |
298 ++p; |
| |
299 int64_t ll_10 = 10; |
| |
300 while (p < descriptor.length()) { |
| |
301 c = descriptor.charAt(p); |
| |
302 if (c >= gZero && c <= gNine) { |
| |
303 val = val * ll_10 + (int32_t)(c - gZero); |
| |
304 } |
| |
305 else if (c == gGreaterThan) { |
| |
306 break; |
| |
307 } |
| |
308 else if (PatternProps::isWhiteSpace(c) || c == gComma || c == gDot) { |
| |
309 } |
| |
310 else { |
| |
311 // throw new IllegalArgumentException("Illegal character is rule descriptor"); |
| |
312 status = U_PARSE_ERROR; |
| |
313 return; |
| |
314 } |
| |
315 ++p; |
| |
316 } |
| |
317 |
| |
318 // tempValue now contain's the rule's radix. Set it |
| |
319 // accordingly, and recalculate the rule's exponent |
| |
320 radix = (int32_t)val; |
| |
321 if (radix == 0) { |
| |
322 // throw new IllegalArgumentException("Rule can't have radix of 0"); |
| |
323 status = U_PARSE_ERROR; |
| |
324 } |
| |
325 |
| |
326 exponent = expectedExponent(); |
| |
327 } |
| |
328 |
| |
329 // if we stopped the previous loop on a > sign, then continue |
| |
330 // for as long as we still see > signs. For each one, |
| |
331 // decrement the exponent (unless the exponent is already 0). |
| |
332 // If we see another character before reaching the end of |
| |
333 // the descriptor, that's also a syntax error. |
| |
334 if (c == gGreaterThan) { |
| |
335 while (p < descriptor.length()) { |
| |
336 c = descriptor.charAt(p); |
| |
337 if (c == gGreaterThan && exponent > 0) { |
| |
338 --exponent; |
| |
339 } else { |
| |
340 // throw new IllegalArgumentException("Illegal character in rule descriptor"); |
| |
341 status = U_PARSE_ERROR; |
| |
342 return; |
| |
343 } |
| |
344 ++p; |
| |
345 } |
| |
346 } |
| |
347 } |
| |
348 } |
| |
349 |
| |
350 // finally, if the rule body begins with an apostrophe, strip it off |
| |
351 // (this is generally used to put whitespace at the beginning of |
| |
352 // a rule's rule text) |
| |
353 if (description.length() > 0 && description.charAt(0) == gTick) { |
| |
354 description.removeBetween(0, 1); |
| |
355 } |
| |
356 |
| |
357 // return the description with all the stuff we've just waded through |
| |
358 // stripped off the front. It now contains just the rule body. |
| |
359 // return description; |
| |
360 } |
| |
361 |
| |
362 /** |
| |
363 * Searches the rule's rule text for the substitution tokens, |
| |
364 * creates the substitutions, and removes the substitution tokens |
| |
365 * from the rule's rule text. |
| |
366 * @param owner The rule set containing this rule |
| |
367 * @param predecessor The rule preseding this one in "owners" rule list |
| |
368 * @param ownersOwner The RuleBasedFormat that owns this rule |
| |
369 */ |
| |
370 void |
| |
371 NFRule::extractSubstitutions(const NFRuleSet* ruleSet, |
| |
372 const NFRule* predecessor, |
| |
373 const RuleBasedNumberFormat* rbnf, |
| |
374 UErrorCode& status) |
| |
375 { |
| |
376 if (U_SUCCESS(status)) { |
| |
377 sub1 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
| |
378 sub2 = extractSubstitution(ruleSet, predecessor, rbnf, status); |
| |
379 } |
| |
380 } |
| |
381 |
| |
382 /** |
| |
383 * Searches the rule's rule text for the first substitution token, |
| |
384 * creates a substitution based on it, and removes the token from |
| |
385 * the rule's rule text. |
| |
386 * @param owner The rule set containing this rule |
| |
387 * @param predecessor The rule preceding this one in the rule set's |
| |
388 * rule list |
| |
389 * @param ownersOwner The RuleBasedNumberFormat that owns this rule |
| |
390 * @return The newly-created substitution. This is never null; if |
| |
391 * the rule text doesn't contain any substitution tokens, this will |
| |
392 * be a NullSubstitution. |
| |
393 */ |
| |
394 NFSubstitution * |
| |
395 NFRule::extractSubstitution(const NFRuleSet* ruleSet, |
| |
396 const NFRule* predecessor, |
| |
397 const RuleBasedNumberFormat* rbnf, |
| |
398 UErrorCode& status) |
| |
399 { |
| |
400 NFSubstitution* result = NULL; |
| |
401 |
| |
402 // search the rule's rule text for the first two characters of |
| |
403 // a substitution token |
| |
404 int32_t subStart = indexOfAny(tokenStrings); |
| |
405 int32_t subEnd = subStart; |
| |
406 |
| |
407 // if we didn't find one, create a null substitution positioned |
| |
408 // at the end of the rule text |
| |
409 if (subStart == -1) { |
| |
410 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
| |
411 ruleSet, rbnf, UnicodeString(), status); |
| |
412 } |
| |
413 |
| |
414 // special-case the ">>>" token, since searching for the > at the |
| |
415 // end will actually find the > in the middle |
| |
416 if (ruleText.indexOf(gGreaterGreaterGreater, 3, 0) == subStart) { |
| |
417 subEnd = subStart + 2; |
| |
418 |
| |
419 // otherwise the substitution token ends with the same character |
| |
420 // it began with |
| |
421 } else { |
| |
422 UChar c = ruleText.charAt(subStart); |
| |
423 subEnd = ruleText.indexOf(c, subStart + 1); |
| |
424 // special case for '<%foo<<' |
| |
425 if (c == gLessThan && subEnd != -1 && subEnd < ruleText.length() - 1 && ruleText.charAt(subEnd+1) == c) { |
| |
426 // ordinals use "=#,##0==%abbrev=" as their rule. Notice that the '==' in the middle |
| |
427 // occurs because of the juxtaposition of two different rules. The check for '<' is a hack |
| |
428 // to get around this. Having the duplicate at the front would cause problems with |
| |
429 // rules like "<<%" to format, say, percents... |
| |
430 ++subEnd; |
| |
431 } |
| |
432 } |
| |
433 |
| |
434 // if we don't find the end of the token (i.e., if we're on a single, |
| |
435 // unmatched token character), create a null substitution positioned |
| |
436 // at the end of the rule |
| |
437 if (subEnd == -1) { |
| |
438 return NFSubstitution::makeSubstitution(ruleText.length(), this, predecessor, |
| |
439 ruleSet, rbnf, UnicodeString(), status); |
| |
440 } |
| |
441 |
| |
442 // if we get here, we have a real substitution token (or at least |
| |
443 // some text bounded by substitution token characters). Use |
| |
444 // makeSubstitution() to create the right kind of substitution |
| |
445 UnicodeString subToken; |
| |
446 subToken.setTo(ruleText, subStart, subEnd + 1 - subStart); |
| |
447 result = NFSubstitution::makeSubstitution(subStart, this, predecessor, ruleSet, |
| |
448 rbnf, subToken, status); |
| |
449 |
| |
450 // remove the substitution from the rule text |
| |
451 ruleText.removeBetween(subStart, subEnd+1); |
| |
452 |
| |
453 return result; |
| |
454 } |
| |
455 |
| |
456 /** |
| |
457 * Sets the rule's base value, and causes the radix and exponent |
| |
458 * to be recalculated. This is used during construction when we |
| |
459 * don't know the rule's base value until after it's been |
| |
460 * constructed. It should be used at any other time. |
| |
461 * @param The new base value for the rule. |
| |
462 */ |
| |
463 void |
| |
464 NFRule::setBaseValue(int64_t newBaseValue, UErrorCode& status) |
| |
465 { |
| |
466 // set the base value |
| |
467 baseValue = newBaseValue; |
| |
468 |
| |
469 // if this isn't a special rule, recalculate the radix and exponent |
| |
470 // (the radix always defaults to 10; if it's supposed to be something |
| |
471 // else, it's cleaned up by the caller and the exponent is |
| |
472 // recalculated again-- the only function that does this is |
| |
473 // NFRule.parseRuleDescriptor() ) |
| |
474 if (baseValue >= 1) { |
| |
475 radix = 10; |
| |
476 exponent = expectedExponent(); |
| |
477 |
| |
478 // this function gets called on a fully-constructed rule whose |
| |
479 // description didn't specify a base value. This means it |
| |
480 // has substitutions, and some substitutions hold on to copies |
| |
481 // of the rule's divisor. Fix their copies of the divisor. |
| |
482 if (sub1 != NULL) { |
| |
483 sub1->setDivisor(radix, exponent, status); |
| |
484 } |
| |
485 if (sub2 != NULL) { |
| |
486 sub2->setDivisor(radix, exponent, status); |
| |
487 } |
| |
488 |
| |
489 // if this is a special rule, its radix and exponent are basically |
| |
490 // ignored. Set them to "safe" default values |
| |
491 } else { |
| |
492 radix = 10; |
| |
493 exponent = 0; |
| |
494 } |
| |
495 } |
| |
496 |
| |
497 /** |
| |
498 * This calculates the rule's exponent based on its radix and base |
| |
499 * value. This will be the highest power the radix can be raised to |
| |
500 * and still produce a result less than or equal to the base value. |
| |
501 */ |
| |
502 int16_t |
| |
503 NFRule::expectedExponent() const |
| |
504 { |
| |
505 // since the log of 0, or the log base 0 of something, causes an |
| |
506 // error, declare the exponent in these cases to be 0 (we also |
| |
507 // deal with the special-rule identifiers here) |
| |
508 if (radix == 0 || baseValue < 1) { |
| |
509 return 0; |
| |
510 } |
| |
511 |
| |
512 // we get rounding error in some cases-- for example, log 1000 / log 10 |
| |
513 // gives us 1.9999999996 instead of 2. The extra logic here is to take |
| |
514 // that into account |
| |
515 int16_t tempResult = (int16_t)(uprv_log((double)baseValue) / uprv_log((double)radix)); |
| |
516 int64_t temp = util64_pow(radix, tempResult + 1); |
| |
517 if (temp <= baseValue) { |
| |
518 tempResult += 1; |
| |
519 } |
| |
520 return tempResult; |
| |
521 } |
| |
522 |
| |
523 /** |
| |
524 * Searches the rule's rule text for any of the specified strings. |
| |
525 * @param strings An array of strings to search the rule's rule |
| |
526 * text for |
| |
527 * @return The index of the first match in the rule's rule text |
| |
528 * (i.e., the first substring in the rule's rule text that matches |
| |
529 * _any_ of the strings in "strings"). If none of the strings in |
| |
530 * "strings" is found in the rule's rule text, returns -1. |
| |
531 */ |
| |
532 int32_t |
| |
533 NFRule::indexOfAny(const UChar* const strings[]) const |
| |
534 { |
| |
535 int result = -1; |
| |
536 for (int i = 0; strings[i]; i++) { |
| |
537 int32_t pos = ruleText.indexOf(*strings[i]); |
| |
538 if (pos != -1 && (result == -1 || pos < result)) { |
| |
539 result = pos; |
| |
540 } |
| |
541 } |
| |
542 return result; |
| |
543 } |
| |
544 |
| |
545 //----------------------------------------------------------------------- |
| |
546 // boilerplate |
| |
547 //----------------------------------------------------------------------- |
| |
548 |
| |
549 /** |
| |
550 * Tests two rules for equality. |
| |
551 * @param that The rule to compare this one against |
| |
552 * @return True is the two rules are functionally equivalent |
| |
553 */ |
| |
554 UBool |
| |
555 NFRule::operator==(const NFRule& rhs) const |
| |
556 { |
| |
557 return baseValue == rhs.baseValue |
| |
558 && radix == rhs.radix |
| |
559 && exponent == rhs.exponent |
| |
560 && ruleText == rhs.ruleText |
| |
561 && *sub1 == *rhs.sub1 |
| |
562 && *sub2 == *rhs.sub2; |
| |
563 } |
| |
564 |
| |
565 /** |
| |
566 * Returns a textual representation of the rule. This won't |
| |
567 * necessarily be the same as the description that this rule |
| |
568 * was created with, but it will produce the same result. |
| |
569 * @return A textual description of the rule |
| |
570 */ |
| |
571 static void util_append64(UnicodeString& result, int64_t n) |
| |
572 { |
| |
573 UChar buffer[256]; |
| |
574 int32_t len = util64_tou(n, buffer, sizeof(buffer)); |
| |
575 UnicodeString temp(buffer, len); |
| |
576 result.append(temp); |
| |
577 } |
| |
578 |
| |
579 void |
| |
580 NFRule::_appendRuleText(UnicodeString& result) const |
| |
581 { |
| |
582 switch (getType()) { |
| |
583 case kNegativeNumberRule: result.append(gMinusX, 2); break; |
| |
584 case kImproperFractionRule: result.append(gXDotX, 3); break; |
| |
585 case kProperFractionRule: result.append(gZeroDotX, 3); break; |
| |
586 case kMasterRule: result.append(gXDotZero, 3); break; |
| |
587 default: |
| |
588 // for a normal rule, write out its base value, and if the radix is |
| |
589 // something other than 10, write out the radix (with the preceding |
| |
590 // slash, of course). Then calculate the expected exponent and if |
| |
591 // if isn't the same as the actual exponent, write an appropriate |
| |
592 // number of > signs. Finally, terminate the whole thing with |
| |
593 // a colon. |
| |
594 util_append64(result, baseValue); |
| |
595 if (radix != 10) { |
| |
596 result.append(gSlash); |
| |
597 util_append64(result, radix); |
| |
598 } |
| |
599 int numCarets = expectedExponent() - exponent; |
| |
600 for (int i = 0; i < numCarets; i++) { |
| |
601 result.append(gGreaterThan); |
| |
602 } |
| |
603 break; |
| |
604 } |
| |
605 result.append(gColon); |
| |
606 result.append(gSpace); |
| |
607 |
| |
608 // if the rule text begins with a space, write an apostrophe |
| |
609 // (whitespace after the rule descriptor is ignored; the |
| |
610 // apostrophe is used to make the whitespace significant) |
| |
611 if (ruleText.charAt(0) == gSpace && sub1->getPos() != 0) { |
| |
612 result.append(gTick); |
| |
613 } |
| |
614 |
| |
615 // now, write the rule's rule text, inserting appropriate |
| |
616 // substitution tokens in the appropriate places |
| |
617 UnicodeString ruleTextCopy; |
| |
618 ruleTextCopy.setTo(ruleText); |
| |
619 |
| |
620 UnicodeString temp; |
| |
621 sub2->toString(temp); |
| |
622 ruleTextCopy.insert(sub2->getPos(), temp); |
| |
623 sub1->toString(temp); |
| |
624 ruleTextCopy.insert(sub1->getPos(), temp); |
| |
625 |
| |
626 result.append(ruleTextCopy); |
| |
627 |
| |
628 // and finally, top the whole thing off with a semicolon and |
| |
629 // return the result |
| |
630 result.append(gSemicolon); |
| |
631 } |
| |
632 |
| |
633 //----------------------------------------------------------------------- |
| |
634 // formatting |
| |
635 //----------------------------------------------------------------------- |
| |
636 |
| |
637 /** |
| |
638 * Formats the number, and inserts the resulting text into |
| |
639 * toInsertInto. |
| |
640 * @param number The number being formatted |
| |
641 * @param toInsertInto The string where the resultant text should |
| |
642 * be inserted |
| |
643 * @param pos The position in toInsertInto where the resultant text |
| |
644 * should be inserted |
| |
645 */ |
| |
646 void |
| |
647 NFRule::doFormat(int64_t number, UnicodeString& toInsertInto, int32_t pos) const |
| |
648 { |
| |
649 // first, insert the rule's rule text into toInsertInto at the |
| |
650 // specified position, then insert the results of the substitutions |
| |
651 // into the right places in toInsertInto (notice we do the |
| |
652 // substitutions in reverse order so that the offsets don't get |
| |
653 // messed up) |
| |
654 toInsertInto.insert(pos, ruleText); |
| |
655 sub2->doSubstitution(number, toInsertInto, pos); |
| |
656 sub1->doSubstitution(number, toInsertInto, pos); |
| |
657 } |
| |
658 |
| |
659 /** |
| |
660 * Formats the number, and inserts the resulting text into |
| |
661 * toInsertInto. |
| |
662 * @param number The number being formatted |
| |
663 * @param toInsertInto The string where the resultant text should |
| |
664 * be inserted |
| |
665 * @param pos The position in toInsertInto where the resultant text |
| |
666 * should be inserted |
| |
667 */ |
| |
668 void |
| |
669 NFRule::doFormat(double number, UnicodeString& toInsertInto, int32_t pos) const |
| |
670 { |
| |
671 // first, insert the rule's rule text into toInsertInto at the |
| |
672 // specified position, then insert the results of the substitutions |
| |
673 // into the right places in toInsertInto |
| |
674 // [again, we have two copies of this routine that do the same thing |
| |
675 // so that we don't sacrifice precision in a long by casting it |
| |
676 // to a double] |
| |
677 toInsertInto.insert(pos, ruleText); |
| |
678 sub2->doSubstitution(number, toInsertInto, pos); |
| |
679 sub1->doSubstitution(number, toInsertInto, pos); |
| |
680 } |
| |
681 |
| |
682 /** |
| |
683 * Used by the owning rule set to determine whether to invoke the |
| |
684 * rollback rule (i.e., whether this rule or the one that precedes |
| |
685 * it in the rule set's list should be used to format the number) |
| |
686 * @param The number being formatted |
| |
687 * @return True if the rule set should use the rule that precedes |
| |
688 * this one in its list; false if it should use this rule |
| |
689 */ |
| |
690 UBool |
| |
691 NFRule::shouldRollBack(double number) const |
| |
692 { |
| |
693 // we roll back if the rule contains a modulus substitution, |
| |
694 // the number being formatted is an even multiple of the rule's |
| |
695 // divisor, and the rule's base value is NOT an even multiple |
| |
696 // of its divisor |
| |
697 // In other words, if the original description had |
| |
698 // 100: << hundred[ >>]; |
| |
699 // that expands into |
| |
700 // 100: << hundred; |
| |
701 // 101: << hundred >>; |
| |
702 // internally. But when we're formatting 200, if we use the rule |
| |
703 // at 101, which would normally apply, we get "two hundred zero". |
| |
704 // To prevent this, we roll back and use the rule at 100 instead. |
| |
705 // This is the logic that makes this happen: the rule at 101 has |
| |
706 // a modulus substitution, its base value isn't an even multiple |
| |
707 // of 100, and the value we're trying to format _is_ an even |
| |
708 // multiple of 100. This is called the "rollback rule." |
| |
709 if ((sub1->isModulusSubstitution()) || (sub2->isModulusSubstitution())) { |
| |
710 int64_t re = util64_pow(radix, exponent); |
| |
711 return uprv_fmod(number, (double)re) == 0 && (baseValue % re) != 0; |
| |
712 } |
| |
713 return FALSE; |
| |
714 } |
| |
715 |
| |
716 //----------------------------------------------------------------------- |
| |
717 // parsing |
| |
718 //----------------------------------------------------------------------- |
| |
719 |
| |
720 /** |
| |
721 * Attempts to parse the string with this rule. |
| |
722 * @param text The string being parsed |
| |
723 * @param parsePosition On entry, the value is ignored and assumed to |
| |
724 * be 0. On exit, this has been updated with the position of the first |
| |
725 * character not consumed by matching the text against this rule |
| |
726 * (if this rule doesn't match the text at all, the parse position |
| |
727 * if left unchanged (presumably at 0) and the function returns |
| |
728 * new Long(0)). |
| |
729 * @param isFractionRule True if this rule is contained within a |
| |
730 * fraction rule set. This is only used if the rule has no |
| |
731 * substitutions. |
| |
732 * @return If this rule matched the text, this is the rule's base value |
| |
733 * combined appropriately with the results of parsing the substitutions. |
| |
734 * If nothing matched, this is new Long(0) and the parse position is |
| |
735 * left unchanged. The result will be an instance of Long if the |
| |
736 * result is an integer and Double otherwise. The result is never null. |
| |
737 */ |
| |
#ifdef RBNF_DEBUG
#include <stdio.h>

// Debug-only helper: extracts the string into a temporary char buffer and
// prints it to the given stream.  Prints nothing if the buffer can't be
// allocated.
static void dumpUS(FILE* f, const UnicodeString& us) {
    const int len = us.length();
    char* buf = (char *)uprv_malloc((len+1)*sizeof(char));
    if (buf == NULL) {
        return;
    }
    us.extract(0, len, buf);
    buf[len] = 0;   // make sure the buffer is terminated at "len"
    fprintf(f, "%s", buf);
    uprv_free(buf);
}
#endif
| |
752 |
| |
753 UBool |
| |
754 NFRule::doParse(const UnicodeString& text, |
| |
755 ParsePosition& parsePosition, |
| |
756 UBool isFractionRule, |
| |
757 double upperBound, |
| |
758 Formattable& resVal) const |
| |
759 { |
| |
760 // internally we operate on a copy of the string being parsed |
| |
761 // (because we're going to change it) and use our own ParsePosition |
| |
762 ParsePosition pp; |
| |
763 UnicodeString workText(text); |
| |
764 |
| |
765 // check to see whether the text before the first substitution |
| |
766 // matches the text at the beginning of the string being |
| |
767 // parsed. If it does, strip that off the front of workText; |
| |
768 // otherwise, dump out with a mismatch |
| |
769 UnicodeString prefix; |
| |
770 prefix.setTo(ruleText, 0, sub1->getPos()); |
| |
771 |
| |
772 #ifdef RBNF_DEBUG |
| |
773 fprintf(stderr, "doParse %x ", this); |
| |
774 { |
| |
775 UnicodeString rt; |
| |
776 _appendRuleText(rt); |
| |
777 dumpUS(stderr, rt); |
| |
778 } |
| |
779 |
| |
780 fprintf(stderr, " text: '", this); |
| |
781 dumpUS(stderr, text); |
| |
782 fprintf(stderr, "' prefix: '"); |
| |
783 dumpUS(stderr, prefix); |
| |
784 #endif |
| |
785 stripPrefix(workText, prefix, pp); |
| |
786 int32_t prefixLength = text.length() - workText.length(); |
| |
787 |
| |
788 #ifdef RBNF_DEBUG |
| |
789 fprintf(stderr, "' pl: %d ppi: %d s1p: %d\n", prefixLength, pp.getIndex(), sub1->getPos()); |
| |
790 #endif |
| |
791 |
| |
792 if (pp.getIndex() == 0 && sub1->getPos() != 0) { |
| |
793 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
794 // restored for ICU4C port |
| |
795 parsePosition.setErrorIndex(pp.getErrorIndex()); |
| |
796 resVal.setLong(0); |
| |
797 return TRUE; |
| |
798 } |
| |
799 |
| |
800 // this is the fun part. The basic guts of the rule-matching |
| |
801 // logic is matchToDelimiter(), which is called twice. The first |
| |
802 // time it searches the input string for the rule text BETWEEN |
| |
803 // the substitutions and tries to match the intervening text |
| |
804 // in the input string with the first substitution. If that |
| |
805 // succeeds, it then calls it again, this time to look for the |
| |
806 // rule text after the second substitution and to match the |
| |
807 // intervening input text against the second substitution. |
| |
808 // |
| |
809 // For example, say we have a rule that looks like this: |
| |
810 // first << middle >> last; |
| |
811 // and input text that looks like this: |
| |
812 // first one middle two last |
| |
813 // First we use stripPrefix() to match "first " in both places and |
| |
814 // strip it off the front, leaving |
| |
815 // one middle two last |
| |
816 // Then we use matchToDelimiter() to match " middle " and try to |
| |
817 // match "one" against a substitution. If it's successful, we now |
| |
818 // have |
| |
819 // two last |
| |
820 // We use matchToDelimiter() a second time to match " last" and |
| |
821 // try to match "two" against a substitution. If "two" matches |
| |
822 // the substitution, we have a successful parse. |
| |
823 // |
| |
824 // Since it's possible in many cases to find multiple instances |
| |
825 // of each of these pieces of rule text in the input string, |
| |
826 // we need to try all the possible combinations of these |
| |
827 // locations. This prevents us from prematurely declaring a mismatch, |
| |
828 // and makes sure we match as much input text as we can. |
| |
829 int highWaterMark = 0; |
| |
830 double result = 0; |
| |
831 int start = 0; |
| |
832 double tempBaseValue = (double)(baseValue <= 0 ? 0 : baseValue); |
| |
833 |
| |
834 UnicodeString temp; |
| |
835 do { |
| |
836 // our partial parse result starts out as this rule's base |
| |
837 // value. If it finds a successful match, matchToDelimiter() |
| |
838 // will compose this in some way with what it gets back from |
| |
839 // the substitution, giving us a new partial parse result |
| |
840 pp.setIndex(0); |
| |
841 |
| |
842 temp.setTo(ruleText, sub1->getPos(), sub2->getPos() - sub1->getPos()); |
| |
843 double partialResult = matchToDelimiter(workText, start, tempBaseValue, |
| |
844 temp, pp, sub1, |
| |
845 upperBound); |
| |
846 |
| |
847 // if we got a successful match (or were trying to match a |
| |
848 // null substitution), pp is now pointing at the first unmatched |
| |
849 // character. Take note of that, and try matchToDelimiter() |
| |
850 // on the input text again |
| |
851 if (pp.getIndex() != 0 || sub1->isNullSubstitution()) { |
| |
852 start = pp.getIndex(); |
| |
853 |
| |
854 UnicodeString workText2; |
| |
855 workText2.setTo(workText, pp.getIndex(), workText.length() - pp.getIndex()); |
| |
856 ParsePosition pp2; |
| |
857 |
| |
858 // the second matchToDelimiter() will compose our previous |
| |
859 // partial result with whatever it gets back from its |
| |
860 // substitution if there's a successful match, giving us |
| |
861 // a real result |
| |
862 temp.setTo(ruleText, sub2->getPos(), ruleText.length() - sub2->getPos()); |
| |
863 partialResult = matchToDelimiter(workText2, 0, partialResult, |
| |
864 temp, pp2, sub2, |
| |
865 upperBound); |
| |
866 |
| |
867 // if we got a successful match on this second |
| |
868 // matchToDelimiter() call, update the high-water mark |
| |
869 // and result (if necessary) |
| |
870 if (pp2.getIndex() != 0 || sub2->isNullSubstitution()) { |
| |
871 if (prefixLength + pp.getIndex() + pp2.getIndex() > highWaterMark) { |
| |
872 highWaterMark = prefixLength + pp.getIndex() + pp2.getIndex(); |
| |
873 result = partialResult; |
| |
874 } |
| |
875 } |
| |
876 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
877 // restored for ICU4C port |
| |
878 else { |
| |
879 int32_t temp = pp2.getErrorIndex() + sub1->getPos() + pp.getIndex(); |
| |
880 if (temp> parsePosition.getErrorIndex()) { |
| |
881 parsePosition.setErrorIndex(temp); |
| |
882 } |
| |
883 } |
| |
884 } |
| |
885 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
886 // restored for ICU4C port |
| |
887 else { |
| |
888 int32_t temp = sub1->getPos() + pp.getErrorIndex(); |
| |
889 if (temp > parsePosition.getErrorIndex()) { |
| |
890 parsePosition.setErrorIndex(temp); |
| |
891 } |
| |
892 } |
| |
893 // keep trying to match things until the outer matchToDelimiter() |
| |
894 // call fails to make a match (each time, it picks up where it |
| |
895 // left off the previous time) |
| |
896 } while (sub1->getPos() != sub2->getPos() |
| |
897 && pp.getIndex() > 0 |
| |
898 && pp.getIndex() < workText.length() |
| |
899 && pp.getIndex() != start); |
| |
900 |
| |
901 // update the caller's ParsePosition with our high-water mark |
| |
902 // (i.e., it now points at the first character this function |
| |
903 // didn't match-- the ParsePosition is therefore unchanged if |
| |
904 // we didn't match anything) |
| |
905 parsePosition.setIndex(highWaterMark); |
| |
906 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
907 // restored for ICU4C port |
| |
908 if (highWaterMark > 0) { |
| |
909 parsePosition.setErrorIndex(0); |
| |
910 } |
| |
911 |
| |
912 // this is a hack for one unusual condition: Normally, whether this |
| |
913 // rule belong to a fraction rule set or not is handled by its |
| |
914 // substitutions. But if that rule HAS NO substitutions, then |
| |
915 // we have to account for it here. By definition, if the matching |
| |
916 // rule in a fraction rule set has no substitutions, its numerator |
| |
917 // is 1, and so the result is the reciprocal of its base value. |
| |
918 if (isFractionRule && |
| |
919 highWaterMark > 0 && |
| |
920 sub1->isNullSubstitution()) { |
| |
921 result = 1 / result; |
| |
922 } |
| |
923 |
| |
924 resVal.setDouble(result); |
| |
925 return TRUE; // ??? do we need to worry if it is a long or a double? |
| |
926 } |
| |
927 |
| |
928 /** |
| |
929 * This function is used by parse() to match the text being parsed |
| |
930 * against a possible prefix string. This function |
| |
931 * matches characters from the beginning of the string being parsed |
| |
932 * to characters from the prospective prefix. If they match, pp is |
| |
933 * updated to the first character not matched, and the result is |
| |
934 * the unparsed part of the string. If they don't match, the whole |
| |
935 * string is returned, and pp is left unchanged. |
| |
936 * @param text The string being parsed |
| |
937 * @param prefix The text to match against |
| |
938 * @param pp On entry, ignored and assumed to be 0. On exit, points |
| |
939 * to the first unmatched character (assuming the whole prefix matched), |
| |
940 * or is unchanged (if the whole prefix didn't match). |
| |
941 * @return If things match, this is the unparsed part of "text"; |
| |
942 * if they didn't match, this is "text". |
| |
943 */ |
| |
944 void |
| |
945 NFRule::stripPrefix(UnicodeString& text, const UnicodeString& prefix, ParsePosition& pp) const |
| |
946 { |
| |
947 // if the prefix text is empty, dump out without doing anything |
| |
948 if (prefix.length() != 0) { |
| |
949 UErrorCode status = U_ZERO_ERROR; |
| |
950 // use prefixLength() to match the beginning of |
| |
951 // "text" against "prefix". This function returns the |
| |
952 // number of characters from "text" that matched (or 0 if |
| |
953 // we didn't match the whole prefix) |
| |
954 int32_t pfl = prefixLength(text, prefix, status); |
| |
955 if (U_FAILURE(status)) { // Memory allocation error. |
| |
956 return; |
| |
957 } |
| |
958 if (pfl != 0) { |
| |
959 // if we got a successful match, update the parse position |
| |
960 // and strip the prefix off of "text" |
| |
961 pp.setIndex(pp.getIndex() + pfl); |
| |
962 text.remove(0, pfl); |
| |
963 } |
| |
964 } |
| |
965 } |
| |
966 |
| |
967 /** |
| |
968 * Used by parse() to match a substitution and any following text. |
| |
969 * "text" is searched for instances of "delimiter". For each instance |
| |
970 * of delimiter, the intervening text is tested to see whether it |
| |
971 * matches the substitution. The longest match wins. |
| |
972 * @param text The string being parsed |
| |
973 * @param startPos The position in "text" where we should start looking |
| |
974 * for "delimiter". |
| |
975 * @param baseValue A partial parse result (often the rule's base value), |
| |
976 * which is combined with the result from matching the substitution |
| |
977 * @param delimiter The string to search "text" for. |
| |
978 * @param pp Ignored and presumed to be 0 on entry. If there's a match, |
| |
979 * on exit this will point to the first unmatched character. |
| |
980 * @param sub If we find "delimiter" in "text", this substitution is used |
| |
981 * to match the text between the beginning of the string and the |
| |
982 * position of "delimiter." (If "delimiter" is the empty string, then |
| |
983 * this function just matches against this substitution and updates |
| |
984 * everything accordingly.) |
| |
985 * @param upperBound When matching the substitution, it will only |
| |
986 * consider rules with base values lower than this value. |
| |
987 * @return If there's a match, this is the result of composing |
| |
988 * baseValue with the result of matching the substitution. Otherwise, |
| |
989 * this is new Long(0). It's never null. If the result is an integer, |
| |
990 * this will be an instance of Long; otherwise, it's an instance of |
| |
991 * Double. |
| |
992 * |
| |
993 * !!! note {dlf} in point of fact, in the java code the caller always converts |
| |
994 * the result to a double, so we might as well return one. |
| |
995 */ |
| |
996 double |
| |
997 NFRule::matchToDelimiter(const UnicodeString& text, |
| |
998 int32_t startPos, |
| |
999 double _baseValue, |
| |
1000 const UnicodeString& delimiter, |
| |
1001 ParsePosition& pp, |
| |
1002 const NFSubstitution* sub, |
| |
1003 double upperBound) const |
| |
1004 { |
| |
1005 UErrorCode status = U_ZERO_ERROR; |
| |
1006 // if "delimiter" contains real (i.e., non-ignorable) text, search |
| |
1007 // it for "delimiter" beginning at "start". If that succeeds, then |
| |
1008 // use "sub"'s doParse() method to match the text before the |
| |
1009 // instance of "delimiter" we just found. |
| |
1010 if (!allIgnorable(delimiter, status)) { |
| |
1011 if (U_FAILURE(status)) { //Memory allocation error. |
| |
1012 return 0; |
| |
1013 } |
| |
1014 ParsePosition tempPP; |
| |
1015 Formattable result; |
| |
1016 |
| |
1017 // use findText() to search for "delimiter". It returns a two- |
| |
1018 // element array: element 0 is the position of the match, and |
| |
1019 // element 1 is the number of characters that matched |
| |
1020 // "delimiter". |
| |
1021 int32_t dLen; |
| |
1022 int32_t dPos = findText(text, delimiter, startPos, &dLen); |
| |
1023 |
| |
1024 // if findText() succeeded, isolate the text preceding the |
| |
1025 // match, and use "sub" to match that text |
| |
1026 while (dPos >= 0) { |
| |
1027 UnicodeString subText; |
| |
1028 subText.setTo(text, 0, dPos); |
| |
1029 if (subText.length() > 0) { |
| |
1030 UBool success = sub->doParse(subText, tempPP, _baseValue, upperBound, |
| |
1031 #if UCONFIG_NO_COLLATION |
| |
1032 FALSE, |
| |
1033 #else |
| |
1034 formatter->isLenient(), |
| |
1035 #endif |
| |
1036 result); |
| |
1037 |
| |
1038 // if the substitution could match all the text up to |
| |
1039 // where we found "delimiter", then this function has |
| |
1040 // a successful match. Bump the caller's parse position |
| |
1041 // to point to the first character after the text |
| |
1042 // that matches "delimiter", and return the result |
| |
1043 // we got from parsing the substitution. |
| |
1044 if (success && tempPP.getIndex() == dPos) { |
| |
1045 pp.setIndex(dPos + dLen); |
| |
1046 return result.getDouble(); |
| |
1047 } |
| |
1048 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
1049 // restored for ICU4C port |
| |
1050 else { |
| |
1051 if (tempPP.getErrorIndex() > 0) { |
| |
1052 pp.setErrorIndex(tempPP.getErrorIndex()); |
| |
1053 } else { |
| |
1054 pp.setErrorIndex(tempPP.getIndex()); |
| |
1055 } |
| |
1056 } |
| |
1057 } |
| |
1058 |
| |
1059 // if we didn't match the substitution, search for another |
| |
1060 // copy of "delimiter" in "text" and repeat the loop if |
| |
1061 // we find it |
| |
1062 tempPP.setIndex(0); |
| |
1063 dPos = findText(text, delimiter, dPos + dLen, &dLen); |
| |
1064 } |
| |
1065 // if we make it here, this was an unsuccessful match, and we |
| |
1066 // leave pp unchanged and return 0 |
| |
1067 pp.setIndex(0); |
| |
1068 return 0; |
| |
1069 |
| |
1070 // if "delimiter" is empty, or consists only of ignorable characters |
| |
1071 // (i.e., is semantically empty), thwe we obviously can't search |
| |
1072 // for "delimiter". Instead, just use "sub" to parse as much of |
| |
1073 // "text" as possible. |
| |
1074 } else { |
| |
1075 ParsePosition tempPP; |
| |
1076 Formattable result; |
| |
1077 |
| |
1078 // try to match the whole string against the substitution |
| |
1079 UBool success = sub->doParse(text, tempPP, _baseValue, upperBound, |
| |
1080 #if UCONFIG_NO_COLLATION |
| |
1081 FALSE, |
| |
1082 #else |
| |
1083 formatter->isLenient(), |
| |
1084 #endif |
| |
1085 result); |
| |
1086 if (success && (tempPP.getIndex() != 0 || sub->isNullSubstitution())) { |
| |
1087 // if there's a successful match (or it's a null |
| |
1088 // substitution), update pp to point to the first |
| |
1089 // character we didn't match, and pass the result from |
| |
1090 // sub.doParse() on through to the caller |
| |
1091 pp.setIndex(tempPP.getIndex()); |
| |
1092 return result.getDouble(); |
| |
1093 } |
| |
1094 // commented out because ParsePosition doesn't have error index in 1.1.x |
| |
1095 // restored for ICU4C port |
| |
1096 else { |
| |
1097 pp.setErrorIndex(tempPP.getErrorIndex()); |
| |
1098 } |
| |
1099 |
| |
1100 // and if we get to here, then nothing matched, so we return |
| |
1101 // 0 and leave pp alone |
| |
1102 return 0; |
| |
1103 } |
| |
1104 } |
| |
1105 |
| |
1106 /** |
| |
1107 * Used by stripPrefix() to match characters. If lenient parse mode |
| |
1108 * is off, this just calls startsWith(). If lenient parse mode is on, |
| |
1109 * this function uses CollationElementIterators to match characters in |
| |
1110 * the strings (only primary-order differences are significant in |
| |
1111 * determining whether there's a match). |
| |
1112 * @param str The string being tested |
| |
1113 * @param prefix The text we're hoping to see at the beginning |
| |
1114 * of "str" |
| |
1115 * @return If "prefix" is found at the beginning of "str", this |
| |
1116 * is the number of characters in "str" that were matched (this |
| |
1117 * isn't necessarily the same as the length of "prefix" when matching |
| |
1118 * text with a collator). If there's no match, this is 0. |
| |
1119 */ |
| |
1120 int32_t |
| |
1121 NFRule::prefixLength(const UnicodeString& str, const UnicodeString& prefix, UErrorCode& status) const |
| |
1122 { |
| |
1123 // if we're looking for an empty prefix, it obviously matches |
| |
1124 // zero characters. Just go ahead and return 0. |
| |
1125 if (prefix.length() == 0) { |
| |
1126 return 0; |
| |
1127 } |
| |
1128 |
| |
1129 #if !UCONFIG_NO_COLLATION |
| |
1130 // go through all this grief if we're in lenient-parse mode |
| |
1131 if (formatter->isLenient()) { |
| |
1132 // get the formatter's collator and use it to create two |
| |
1133 // collation element iterators, one over the target string |
| |
1134 // and another over the prefix (right now, we'll throw an |
| |
1135 // exception if the collator we get back from the formatter |
| |
1136 // isn't a RuleBasedCollator, because RuleBasedCollator defines |
| |
1137 // the CollationElementIterator protocol. Hopefully, this |
| |
1138 // will change someday.) |
| |
1139 RuleBasedCollator* collator = (RuleBasedCollator*)formatter->getCollator(); |
| |
1140 CollationElementIterator* strIter = collator->createCollationElementIterator(str); |
| |
1141 CollationElementIterator* prefixIter = collator->createCollationElementIterator(prefix); |
| |
1142 // Check for memory allocation error. |
| |
1143 if (collator == NULL || strIter == NULL || prefixIter == NULL) { |
| |
1144 delete collator; |
| |
1145 delete strIter; |
| |
1146 delete prefixIter; |
| |
1147 status = U_MEMORY_ALLOCATION_ERROR; |
| |
1148 return 0; |
| |
1149 } |
| |
1150 |
| |
1151 UErrorCode err = U_ZERO_ERROR; |
| |
1152 |
| |
1153 // The original code was problematic. Consider this match: |
| |
1154 // prefix = "fifty-" |
| |
1155 // string = " fifty-7" |
| |
1156 // The intent is to match string up to the '7', by matching 'fifty-' at position 1 |
| |
1157 // in the string. Unfortunately, we were getting a match, and then computing where |
| |
1158 // the match terminated by rematching the string. The rematch code was using as an |
| |
1159 // initial guess the substring of string between 0 and prefix.length. Because of |
| |
1160 // the leading space and trailing hyphen (both ignorable) this was succeeding, leaving |
| |
1161 // the position before the hyphen in the string. Recursing down, we then parsed the |
| |
1162 // remaining string '-7' as numeric. The resulting number turned out as 43 (50 - 7). |
| |
1163 // This was not pretty, especially since the string "fifty-7" parsed just fine. |
| |
1164 // |
| |
1165 // We have newer APIs now, so we can use calls on the iterator to determine what we |
| |
1166 // matched up to. If we terminate because we hit the last element in the string, |
| |
1167 // our match terminates at this length. If we terminate because we hit the last element |
| |
1168 // in the target, our match terminates at one before the element iterator position. |
| |
1169 |
| |
1170 // match collation elements between the strings |
| |
1171 int32_t oStr = strIter->next(err); |
| |
1172 int32_t oPrefix = prefixIter->next(err); |
| |
1173 |
| |
1174 while (oPrefix != CollationElementIterator::NULLORDER) { |
| |
1175 // skip over ignorable characters in the target string |
| |
1176 while (CollationElementIterator::primaryOrder(oStr) == 0 |
| |
1177 && oStr != CollationElementIterator::NULLORDER) { |
| |
1178 oStr = strIter->next(err); |
| |
1179 } |
| |
1180 |
| |
1181 // skip over ignorable characters in the prefix |
| |
1182 while (CollationElementIterator::primaryOrder(oPrefix) == 0 |
| |
1183 && oPrefix != CollationElementIterator::NULLORDER) { |
| |
1184 oPrefix = prefixIter->next(err); |
| |
1185 } |
| |
1186 |
| |
1187 // dlf: move this above following test, if we consume the |
| |
1188 // entire target, aren't we ok even if the source was also |
| |
1189 // entirely consumed? |
| |
1190 |
| |
1191 // if skipping over ignorables brought to the end of |
| |
1192 // the prefix, we DID match: drop out of the loop |
| |
1193 if (oPrefix == CollationElementIterator::NULLORDER) { |
| |
1194 break; |
| |
1195 } |
| |
1196 |
| |
1197 // if skipping over ignorables brought us to the end |
| |
1198 // of the target string, we didn't match and return 0 |
| |
1199 if (oStr == CollationElementIterator::NULLORDER) { |
| |
1200 delete prefixIter; |
| |
1201 delete strIter; |
| |
1202 return 0; |
| |
1203 } |
| |
1204 |
| |
1205 // match collation elements from the two strings |
| |
1206 // (considering only primary differences). If we |
| |
1207 // get a mismatch, dump out and return 0 |
| |
1208 if (CollationElementIterator::primaryOrder(oStr) |
| |
1209 != CollationElementIterator::primaryOrder(oPrefix)) { |
| |
1210 delete prefixIter; |
| |
1211 delete strIter; |
| |
1212 return 0; |
| |
1213 |
| |
1214 // otherwise, advance to the next character in each string |
| |
1215 // and loop (we drop out of the loop when we exhaust |
| |
1216 // collation elements in the prefix) |
| |
1217 } else { |
| |
1218 oStr = strIter->next(err); |
| |
1219 oPrefix = prefixIter->next(err); |
| |
1220 } |
| |
1221 } |
| |
1222 |
| |
1223 int32_t result = strIter->getOffset(); |
| |
1224 if (oStr != CollationElementIterator::NULLORDER) { |
| |
1225 --result; // back over character that we don't want to consume; |
| |
1226 } |
| |
1227 |
| |
1228 #ifdef RBNF_DEBUG |
| |
1229 fprintf(stderr, "prefix length: %d\n", result); |
| |
1230 #endif |
| |
1231 delete prefixIter; |
| |
1232 delete strIter; |
| |
1233 |
| |
1234 return result; |
| |
1235 #if 0 |
| |
1236 //---------------------------------------------------------------- |
| |
1237 // JDK 1.2-specific API call |
| |
1238 // return strIter.getOffset(); |
| |
1239 //---------------------------------------------------------------- |
| |
1240 // JDK 1.1 HACK (take out for 1.2-specific code) |
| |
1241 |
| |
1242 // if we make it to here, we have a successful match. Now we |
| |
1243 // have to find out HOW MANY characters from the target string |
| |
1244 // matched the prefix (there isn't necessarily a one-to-one |
| |
1245 // mapping between collation elements and characters). |
| |
1246 // In JDK 1.2, there's a simple getOffset() call we can use. |
| |
1247 // In JDK 1.1, on the other hand, we have to go through some |
| |
1248 // ugly contortions. First, use the collator to compare the |
| |
1249 // same number of characters from the prefix and target string. |
| |
1250 // If they're equal, we're done. |
| |
1251 collator->setStrength(Collator::PRIMARY); |
| |
1252 if (str.length() >= prefix.length()) { |
| |
1253 UnicodeString temp; |
| |
1254 temp.setTo(str, 0, prefix.length()); |
| |
1255 if (collator->equals(temp, prefix)) { |
| |
1256 #ifdef RBNF_DEBUG |
| |
1257 fprintf(stderr, "returning: %d\n", prefix.length()); |
| |
1258 #endif |
| |
1259 return prefix.length(); |
| |
1260 } |
| |
1261 } |
| |
1262 |
| |
1263 // if they're not equal, then we have to compare successively |
| |
1264 // larger and larger substrings of the target string until we |
| |
1265 // get to one that matches the prefix. At that point, we know |
| |
1266 // how many characters matched the prefix, and we can return. |
| |
1267 int32_t p = 1; |
| |
1268 while (p <= str.length()) { |
| |
1269 UnicodeString temp; |
| |
1270 temp.setTo(str, 0, p); |
| |
1271 if (collator->equals(temp, prefix)) { |
| |
1272 return p; |
| |
1273 } else { |
| |
1274 ++p; |
| |
1275 } |
| |
1276 } |
| |
1277 |
| |
1278 // SHOULD NEVER GET HERE!!! |
| |
1279 return 0; |
| |
1280 //---------------------------------------------------------------- |
| |
1281 #endif |
| |
1282 |
| |
1283 // If lenient parsing is turned off, forget all that crap above. |
| |
1284 // Just use String.startsWith() and be done with it. |
| |
1285 } else |
| |
1286 #endif |
| |
1287 { |
| |
1288 if (str.startsWith(prefix)) { |
| |
1289 return prefix.length(); |
| |
1290 } else { |
| |
1291 return 0; |
| |
1292 } |
| |
1293 } |
| |
1294 } |
| |
1295 |
| |
1296 /** |
| |
1297 * Searches a string for another string. If lenient parsing is off, |
| |
1298 * this just calls indexOf(). If lenient parsing is on, this function |
| |
1299 * uses CollationElementIterator to match characters, and only |
| |
1300 * primary-order differences are significant in determining whether |
| |
1301 * there's a match. |
| |
1302 * @param str The string to search |
| |
1303 * @param key The string to search "str" for |
| |
1304 * @param startingAt The index into "str" where the search is to |
| |
1305 * begin |
| |
1306 * @return A two-element array of ints. Element 0 is the position |
| |
1307 * of the match, or -1 if there was no match. Element 1 is the |
| |
1308 * number of characters in "str" that matched (which isn't necessarily |
| |
1309 * the same as the length of "key") |
| |
1310 */ |
| |
1311 int32_t |
| |
1312 NFRule::findText(const UnicodeString& str, |
| |
1313 const UnicodeString& key, |
| |
1314 int32_t startingAt, |
| |
1315 int32_t* length) const |
| |
1316 { |
| |
1317 #if !UCONFIG_NO_COLLATION |
| |
1318 // if lenient parsing is turned off, this is easy: just call |
| |
1319 // String.indexOf() and we're done |
| |
1320 if (!formatter->isLenient()) { |
| |
1321 *length = key.length(); |
| |
1322 return str.indexOf(key, startingAt); |
| |
1323 |
| |
1324 // but if lenient parsing is turned ON, we've got some work |
| |
1325 // ahead of us |
| |
1326 } else |
| |
1327 #endif |
| |
1328 { |
| |
1329 //---------------------------------------------------------------- |
| |
1330 // JDK 1.1 HACK (take out of 1.2-specific code) |
| |
1331 |
| |
1332 // in JDK 1.2, CollationElementIterator provides us with an |
| |
1333 // API to map between character offsets and collation elements |
| |
1334 // and we can do this by marching through the string comparing |
| |
1335 // collation elements. We can't do that in JDK 1.1. Insted, |
| |
1336 // we have to go through this horrible slow mess: |
| |
1337 int32_t p = startingAt; |
| |
1338 int32_t keyLen = 0; |
| |
1339 |
| |
1340 // basically just isolate smaller and smaller substrings of |
| |
1341 // the target string (each running to the end of the string, |
| |
1342 // and with the first one running from startingAt to the end) |
| |
1343 // and then use prefixLength() to see if the search key is at |
| |
1344 // the beginning of each substring. This is excruciatingly |
| |
1345 // slow, but it will locate the key and tell use how long the |
| |
1346 // matching text was. |
| |
1347 UnicodeString temp; |
| |
1348 UErrorCode status = U_ZERO_ERROR; |
| |
1349 while (p < str.length() && keyLen == 0) { |
| |
1350 temp.setTo(str, p, str.length() - p); |
| |
1351 keyLen = prefixLength(temp, key, status); |
| |
1352 if (U_FAILURE(status)) { |
| |
1353 break; |
| |
1354 } |
| |
1355 if (keyLen != 0) { |
| |
1356 *length = keyLen; |
| |
1357 return p; |
| |
1358 } |
| |
1359 ++p; |
| |
1360 } |
| |
1361 // if we make it to here, we didn't find it. Return -1 for the |
| |
1362 // location. The length should be ignored, but set it to 0, |
| |
1363 // which should be "safe" |
| |
1364 *length = 0; |
| |
1365 return -1; |
| |
1366 |
| |
1367 //---------------------------------------------------------------- |
| |
1368 // JDK 1.2 version of this routine |
| |
1369 //RuleBasedCollator collator = (RuleBasedCollator)formatter.getCollator(); |
| |
1370 // |
| |
1371 //CollationElementIterator strIter = collator.getCollationElementIterator(str); |
| |
1372 //CollationElementIterator keyIter = collator.getCollationElementIterator(key); |
| |
1373 // |
| |
1374 //int keyStart = -1; |
| |
1375 // |
| |
1376 //str.setOffset(startingAt); |
| |
1377 // |
| |
1378 //int oStr = strIter.next(); |
| |
1379 //int oKey = keyIter.next(); |
| |
1380 //while (oKey != CollationElementIterator.NULLORDER) { |
| |
1381 // while (oStr != CollationElementIterator.NULLORDER && |
| |
1382 // CollationElementIterator.primaryOrder(oStr) == 0) |
| |
1383 // oStr = strIter.next(); |
| |
1384 // |
| |
1385 // while (oKey != CollationElementIterator.NULLORDER && |
| |
1386 // CollationElementIterator.primaryOrder(oKey) == 0) |
| |
1387 // oKey = keyIter.next(); |
| |
1388 // |
| |
1389 // if (oStr == CollationElementIterator.NULLORDER) { |
| |
1390 // return new int[] { -1, 0 }; |
| |
1391 // } |
| |
1392 // |
| |
1393 // if (oKey == CollationElementIterator.NULLORDER) { |
| |
1394 // break; |
| |
1395 // } |
| |
1396 // |
| |
1397 // if (CollationElementIterator.primaryOrder(oStr) == |
| |
1398 // CollationElementIterator.primaryOrder(oKey)) { |
| |
1399 // keyStart = strIter.getOffset(); |
| |
1400 // oStr = strIter.next(); |
| |
1401 // oKey = keyIter.next(); |
| |
1402 // } else { |
| |
1403 // if (keyStart != -1) { |
| |
1404 // keyStart = -1; |
| |
1405 // keyIter.reset(); |
| |
1406 // } else { |
| |
1407 // oStr = strIter.next(); |
| |
1408 // } |
| |
1409 // } |
| |
1410 //} |
| |
1411 // |
| |
1412 //if (oKey == CollationElementIterator.NULLORDER) { |
| |
1413 // return new int[] { keyStart, strIter.getOffset() - keyStart }; |
| |
1414 //} else { |
| |
1415 // return new int[] { -1, 0 }; |
| |
1416 //} |
| |
1417 } |
| |
1418 } |
| |
1419 |
| |
1420 /** |
| |
1421 * Checks to see whether a string consists entirely of ignorable |
| |
1422 * characters. |
| |
1423 * @param str The string to test. |
| |
1424 * @return true if the string is empty of consists entirely of |
| |
1425 * characters that the number formatter's collator says are |
| |
1426 * ignorable at the primary-order level. false otherwise. |
| |
1427 */ |
| |
1428 UBool |
| |
1429 NFRule::allIgnorable(const UnicodeString& str, UErrorCode& status) const |
| |
1430 { |
| |
1431 // if the string is empty, we can just return true |
| |
1432 if (str.length() == 0) { |
| |
1433 return TRUE; |
| |
1434 } |
| |
1435 |
| |
1436 #if !UCONFIG_NO_COLLATION |
| |
1437 // if lenient parsing is turned on, walk through the string with |
| |
1438 // a collation element iterator and make sure each collation |
| |
1439 // element is 0 (ignorable) at the primary level |
| |
1440 if (formatter->isLenient()) { |
| |
1441 RuleBasedCollator* collator = (RuleBasedCollator*)(formatter->getCollator()); |
| |
1442 CollationElementIterator* iter = collator->createCollationElementIterator(str); |
| |
1443 |
| |
1444 // Memory allocation error check. |
| |
1445 if (collator == NULL || iter == NULL) { |
| |
1446 delete collator; |
| |
1447 delete iter; |
| |
1448 status = U_MEMORY_ALLOCATION_ERROR; |
| |
1449 return FALSE; |
| |
1450 } |
| |
1451 |
| |
1452 UErrorCode err = U_ZERO_ERROR; |
| |
1453 int32_t o = iter->next(err); |
| |
1454 while (o != CollationElementIterator::NULLORDER |
| |
1455 && CollationElementIterator::primaryOrder(o) == 0) { |
| |
1456 o = iter->next(err); |
| |
1457 } |
| |
1458 |
| |
1459 delete iter; |
| |
1460 return o == CollationElementIterator::NULLORDER; |
| |
1461 } |
| |
1462 #endif |
| |
1463 |
| |
1464 // if lenient parsing is turned off, there is no such thing as |
| |
1465 // an ignorable character: return true only if the string is empty |
| |
1466 return FALSE; |
| |
1467 } |
| |
1468 |
| |
1469 U_NAMESPACE_END |
| |
1470 |
| |
1471 /* U_HAVE_RBNF */ |
| |
1472 #endif |
| |
1473 |
| |
1474 |