|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ********************************************************************** |
|
6 * Date Name Description |
|
7 * 11/17/99 aliu Creation. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #include "unicode/utypes.h" |
|
12 |
|
13 #if !UCONFIG_NO_TRANSLITERATION |
|
14 |
|
15 #include "unicode/rep.h" |
|
16 #include "unicode/unifilt.h" |
|
17 #include "unicode/uniset.h" |
|
18 #include "unicode/utf16.h" |
|
19 #include "rbt_rule.h" |
|
20 #include "rbt_data.h" |
|
21 #include "cmemory.h" |
|
22 #include "strmatch.h" |
|
23 #include "strrepl.h" |
|
24 #include "util.h" |
|
25 #include "putilimp.h" |
|
26 |
|
27 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " |
|
28 |
|
29 U_NAMESPACE_BEGIN |
|
30 |
|
31 /** |
|
32 * Construct a new rule with the given input, output text, and other |
|
33 * attributes. A cursor position may be specified for the output text. |
|
34 * @param input input string, including key and optional ante and |
|
35 * post context |
|
36 * @param anteContextPos offset into input to end of ante context, or -1 if |
|
37 * none. Must be <= input.length() if not -1. |
|
38 * @param postContextPos offset into input to start of post context, or -1 |
|
39 * if none. Must be <= input.length() if not -1, and must be >= |
|
40 * anteContextPos. |
|
41 * @param output output string |
|
42 * @param cursorPosition offset into output at which cursor is located, or -1 if |
|
43 * none. If less than zero, then the cursor is placed after the |
|
44 * <code>output</code>; that is, -1 is equivalent to |
|
45 * <code>output.length()</code>. If greater than |
|
46 * <code>output.length()</code> then an exception is thrown. |
|
47 * @param segs array of UnicodeFunctors corresponding to input pattern |
|
48 * segments, or null if there are none. The array itself is adopted, |
|
49 * but the pointers within it are not. |
|
50 * @param segsCount number of elements in segs[] |
|
51 * @param anchorStart TRUE if the the rule is anchored on the left to |
|
52 * the context start |
|
53 * @param anchorEnd TRUE if the rule is anchored on the right to the |
|
54 * context limit |
|
55 */ |
|
56 TransliterationRule::TransliterationRule(const UnicodeString& input, |
|
57 int32_t anteContextPos, int32_t postContextPos, |
|
58 const UnicodeString& outputStr, |
|
59 int32_t cursorPosition, int32_t cursorOffset, |
|
60 UnicodeFunctor** segs, |
|
61 int32_t segsCount, |
|
62 UBool anchorStart, UBool anchorEnd, |
|
63 const TransliterationRuleData* theData, |
|
64 UErrorCode& status) : |
|
65 UMemory(), |
|
66 segments(0), |
|
67 data(theData) { |
|
68 |
|
69 if (U_FAILURE(status)) { |
|
70 return; |
|
71 } |
|
72 // Do range checks only when warranted to save time |
|
73 if (anteContextPos < 0) { |
|
74 anteContextLength = 0; |
|
75 } else { |
|
76 if (anteContextPos > input.length()) { |
|
77 // throw new IllegalArgumentException("Invalid ante context"); |
|
78 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
79 return; |
|
80 } |
|
81 anteContextLength = anteContextPos; |
|
82 } |
|
83 if (postContextPos < 0) { |
|
84 keyLength = input.length() - anteContextLength; |
|
85 } else { |
|
86 if (postContextPos < anteContextLength || |
|
87 postContextPos > input.length()) { |
|
88 // throw new IllegalArgumentException("Invalid post context"); |
|
89 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
90 return; |
|
91 } |
|
92 keyLength = postContextPos - anteContextLength; |
|
93 } |
|
94 if (cursorPosition < 0) { |
|
95 cursorPosition = outputStr.length(); |
|
96 } else if (cursorPosition > outputStr.length()) { |
|
97 // throw new IllegalArgumentException("Invalid cursor position"); |
|
98 status = U_ILLEGAL_ARGUMENT_ERROR; |
|
99 return; |
|
100 } |
|
101 // We don't validate the segments array. The caller must |
|
102 // guarantee that the segments are well-formed (that is, that |
|
103 // all $n references in the output refer to indices of this |
|
104 // array, and that no array elements are null). |
|
105 this->segments = segs; |
|
106 this->segmentsCount = segsCount; |
|
107 |
|
108 pattern = input; |
|
109 flags = 0; |
|
110 if (anchorStart) { |
|
111 flags |= ANCHOR_START; |
|
112 } |
|
113 if (anchorEnd) { |
|
114 flags |= ANCHOR_END; |
|
115 } |
|
116 |
|
117 anteContext = NULL; |
|
118 if (anteContextLength > 0) { |
|
119 anteContext = new StringMatcher(pattern, 0, anteContextLength, |
|
120 FALSE, *data); |
|
121 /* test for NULL */ |
|
122 if (anteContext == 0) { |
|
123 status = U_MEMORY_ALLOCATION_ERROR; |
|
124 return; |
|
125 } |
|
126 } |
|
127 |
|
128 key = NULL; |
|
129 if (keyLength > 0) { |
|
130 key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, |
|
131 FALSE, *data); |
|
132 /* test for NULL */ |
|
133 if (key == 0) { |
|
134 status = U_MEMORY_ALLOCATION_ERROR; |
|
135 return; |
|
136 } |
|
137 } |
|
138 |
|
139 int32_t postContextLength = pattern.length() - keyLength - anteContextLength; |
|
140 postContext = NULL; |
|
141 if (postContextLength > 0) { |
|
142 postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), |
|
143 FALSE, *data); |
|
144 /* test for NULL */ |
|
145 if (postContext == 0) { |
|
146 status = U_MEMORY_ALLOCATION_ERROR; |
|
147 return; |
|
148 } |
|
149 } |
|
150 |
|
151 this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); |
|
152 /* test for NULL */ |
|
153 if (this->output == 0) { |
|
154 status = U_MEMORY_ALLOCATION_ERROR; |
|
155 return; |
|
156 } |
|
157 } |
|
158 |
|
159 /** |
|
160 * Copy constructor. |
|
161 */ |
|
162 TransliterationRule::TransliterationRule(TransliterationRule& other) : |
|
163 UMemory(other), |
|
164 anteContext(NULL), |
|
165 key(NULL), |
|
166 postContext(NULL), |
|
167 pattern(other.pattern), |
|
168 anteContextLength(other.anteContextLength), |
|
169 keyLength(other.keyLength), |
|
170 flags(other.flags), |
|
171 data(other.data) { |
|
172 |
|
173 segments = NULL; |
|
174 segmentsCount = 0; |
|
175 if (other.segmentsCount > 0) { |
|
176 segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); |
|
177 uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); |
|
178 } |
|
179 |
|
180 if (other.anteContext != NULL) { |
|
181 anteContext = (StringMatcher*) other.anteContext->clone(); |
|
182 } |
|
183 if (other.key != NULL) { |
|
184 key = (StringMatcher*) other.key->clone(); |
|
185 } |
|
186 if (other.postContext != NULL) { |
|
187 postContext = (StringMatcher*) other.postContext->clone(); |
|
188 } |
|
189 output = other.output->clone(); |
|
190 } |
|
191 |
|
192 TransliterationRule::~TransliterationRule() { |
|
193 uprv_free(segments); |
|
194 delete anteContext; |
|
195 delete key; |
|
196 delete postContext; |
|
197 delete output; |
|
198 } |
|
199 |
|
200 /** |
|
201 * Return the preceding context length. This method is needed to |
|
202 * support the <code>Transliterator</code> method |
|
203 * <code>getMaximumContextLength()</code>. Internally, this is |
|
204 * implemented as the anteContextLength, optionally plus one if |
|
205 * there is a start anchor. The one character anchor gap is |
|
206 * needed to make repeated incremental transliteration with |
|
207 * anchors work. |
|
208 */ |
|
209 int32_t TransliterationRule::getContextLength(void) const { |
|
210 return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); |
|
211 } |
|
212 |
|
213 /** |
|
214 * Internal method. Returns 8-bit index value for this rule. |
|
215 * This is the low byte of the first character of the key, |
|
216 * unless the first character of the key is a set. If it's a |
|
217 * set, or otherwise can match multiple keys, the index value is -1. |
|
218 */ |
|
219 int16_t TransliterationRule::getIndexValue() const { |
|
220 if (anteContextLength == pattern.length()) { |
|
221 // A pattern with just ante context {such as foo)>bar} can |
|
222 // match any key. |
|
223 return -1; |
|
224 } |
|
225 UChar32 c = pattern.char32At(anteContextLength); |
|
226 return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); |
|
227 } |
|
228 |
|
229 /** |
|
230 * Internal method. Returns true if this rule matches the given |
|
231 * index value. The index value is an 8-bit integer, 0..255, |
|
232 * representing the low byte of the first character of the key. |
|
233 * It matches this rule if it matches the first character of the |
|
234 * key, or if the first character of the key is a set, and the set |
|
235 * contains any character with a low byte equal to the index |
|
236 * value. If the rule contains only ante context, as in foo)>bar, |
|
237 * then it will match any key. |
|
238 */ |
|
239 UBool TransliterationRule::matchesIndexValue(uint8_t v) const { |
|
240 // Delegate to the key, or if there is none, to the postContext. |
|
241 // If there is neither then we match any key; return true. |
|
242 UnicodeMatcher *m = (key != NULL) ? key : postContext; |
|
243 return (m != NULL) ? m->matchesIndexValue(v) : TRUE; |
|
244 } |
|
245 |
|
246 /** |
|
247 * Return true if this rule masks another rule. If r1 masks r2 then |
|
248 * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
|
249 * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
|
250 * "[c]a>x" masks "[dc]a>y". |
|
251 */ |
|
252 UBool TransliterationRule::masks(const TransliterationRule& r2) const { |
|
253 /* Rule r1 masks rule r2 if the string formed of the |
|
254 * antecontext, key, and postcontext overlaps in the following |
|
255 * way: |
|
256 * |
|
257 * r1: aakkkpppp |
|
258 * r2: aaakkkkkpppp |
|
259 * ^ |
|
260 * |
|
261 * The strings must be aligned at the first character of the |
|
262 * key. The length of r1 to the left of the alignment point |
|
263 * must be <= the length of r2 to the left; ditto for the |
|
264 * right. The characters of r1 must equal (or be a superset |
|
265 * of) the corresponding characters of r2. The superset |
|
266 * operation should be performed to check for UnicodeSet |
|
267 * masking. |
|
268 * |
|
269 * Anchors: Two patterns that differ only in anchors only |
|
270 * mask one another if they are exactly equal, and r2 has |
|
271 * all the anchors r1 has (optionally, plus some). Here Y |
|
272 * means the row masks the column, N means it doesn't. |
|
273 * |
|
274 * ab ^ab ab$ ^ab$ |
|
275 * ab Y Y Y Y |
|
276 * ^ab N Y N Y |
|
277 * ab$ N N Y Y |
|
278 * ^ab$ N N N Y |
|
279 * |
|
280 * Post context: {a}b masks ab, but not vice versa, since {a}b |
|
281 * matches everything ab matches, and {a}b matches {|a|}b but ab |
|
282 * does not. Pre context is different (a{b} does not align with |
|
283 * ab). |
|
284 */ |
|
285 |
|
286 /* LIMITATION of the current mask algorithm: Some rule |
|
287 * maskings are currently not detected. For example, |
|
288 * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO |
|
289 */ |
|
290 |
|
291 int32_t len = pattern.length(); |
|
292 int32_t left = anteContextLength; |
|
293 int32_t left2 = r2.anteContextLength; |
|
294 int32_t right = len - left; |
|
295 int32_t right2 = r2.pattern.length() - left2; |
|
296 int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); |
|
297 |
|
298 // TODO Clean this up -- some logic might be combinable with the |
|
299 // next statement. |
|
300 |
|
301 // Test for anchor masking |
|
302 if (left == left2 && right == right2 && |
|
303 keyLength <= r2.keyLength && |
|
304 0 == cachedCompare) { |
|
305 // The following boolean logic implements the table above |
|
306 return (flags == r2.flags) || |
|
307 (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || |
|
308 ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); |
|
309 } |
|
310 |
|
311 return left <= left2 && |
|
312 (right < right2 || |
|
313 (right == right2 && keyLength <= r2.keyLength)) && |
|
314 (0 == cachedCompare); |
|
315 } |
|
316 |
|
317 static inline int32_t posBefore(const Replaceable& str, int32_t pos) { |
|
318 return (pos > 0) ? |
|
319 pos - U16_LENGTH(str.char32At(pos-1)) : |
|
320 pos - 1; |
|
321 } |
|
322 |
|
323 static inline int32_t posAfter(const Replaceable& str, int32_t pos) { |
|
324 return (pos >= 0 && pos < str.length()) ? |
|
325 pos + U16_LENGTH(str.char32At(pos)) : |
|
326 pos + 1; |
|
327 } |
|
328 |
|
329 /** |
|
330 * Attempt a match and replacement at the given position. Return |
|
331 * the degree of match between this rule and the given text. The |
|
332 * degree of match may be mismatch, a partial match, or a full |
|
333 * match. A mismatch means at least one character of the text |
|
334 * does not match the context or key. A partial match means some |
|
335 * context and key characters match, but the text is not long |
|
336 * enough to match all of them. A full match means all context |
|
337 * and key characters match. |
|
338 * |
|
339 * If a full match is obtained, perform a replacement, update pos, |
|
340 * and return U_MATCH. Otherwise both text and pos are unchanged. |
|
341 * |
|
342 * @param text the text |
|
343 * @param pos the position indices |
|
344 * @param incremental if TRUE, test for partial matches that may |
|
345 * be completed by additional text inserted at pos.limit. |
|
346 * @return one of <code>U_MISMATCH</code>, |
|
347 * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
|
348 * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
|
349 */ |
|
350 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, |
|
351 UTransPosition& pos, |
|
352 UBool incremental) const { |
|
353 // Matching and replacing are done in one method because the |
|
354 // replacement operation needs information obtained during the |
|
355 // match. Another way to do this is to have the match method |
|
356 // create a match result struct with relevant offsets, and to pass |
|
357 // this into the replace method. |
|
358 |
|
359 // ============================ MATCH =========================== |
|
360 |
|
361 // Reset segment match data |
|
362 if (segments != NULL) { |
|
363 for (int32_t i=0; i<segmentsCount; ++i) { |
|
364 ((StringMatcher*) segments[i])->resetMatch(); |
|
365 } |
|
366 } |
|
367 |
|
368 // int32_t lenDelta, keyLimit; |
|
369 int32_t keyLimit; |
|
370 |
|
371 // ------------------------ Ante Context ------------------------ |
|
372 |
|
373 // A mismatch in the ante context, or with the start anchor, |
|
374 // is an outright U_MISMATCH regardless of whether we are |
|
375 // incremental or not. |
|
376 int32_t oText; // offset into 'text' |
|
377 // int32_t newStart = 0; |
|
378 int32_t minOText; |
|
379 |
|
380 // Note (1): We process text in 16-bit code units, rather than |
|
381 // 32-bit code points. This works because stand-ins are |
|
382 // always in the BMP and because we are doing a literal match |
|
383 // operation, which can be done 16-bits at a time. |
|
384 |
|
385 int32_t anteLimit = posBefore(text, pos.contextStart); |
|
386 |
|
387 UMatchDegree match; |
|
388 |
|
389 // Start reverse match at char before pos.start |
|
390 oText = posBefore(text, pos.start); |
|
391 |
|
392 if (anteContext != NULL) { |
|
393 match = anteContext->matches(text, oText, anteLimit, FALSE); |
|
394 if (match != U_MATCH) { |
|
395 return U_MISMATCH; |
|
396 } |
|
397 } |
|
398 |
|
399 minOText = posAfter(text, oText); |
|
400 |
|
401 // ------------------------ Start Anchor ------------------------ |
|
402 |
|
403 if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { |
|
404 return U_MISMATCH; |
|
405 } |
|
406 |
|
407 // -------------------- Key and Post Context -------------------- |
|
408 |
|
409 oText = pos.start; |
|
410 |
|
411 if (key != NULL) { |
|
412 match = key->matches(text, oText, pos.limit, incremental); |
|
413 if (match != U_MATCH) { |
|
414 return match; |
|
415 } |
|
416 } |
|
417 |
|
418 keyLimit = oText; |
|
419 |
|
420 if (postContext != NULL) { |
|
421 if (incremental && keyLimit == pos.limit) { |
|
422 // The key matches just before pos.limit, and there is |
|
423 // a postContext. Since we are in incremental mode, |
|
424 // we must assume more characters may be inserted at |
|
425 // pos.limit -- this is a partial match. |
|
426 return U_PARTIAL_MATCH; |
|
427 } |
|
428 |
|
429 match = postContext->matches(text, oText, pos.contextLimit, incremental); |
|
430 if (match != U_MATCH) { |
|
431 return match; |
|
432 } |
|
433 } |
|
434 |
|
435 // ------------------------- Stop Anchor ------------------------ |
|
436 |
|
437 if (((flags & ANCHOR_END)) != 0) { |
|
438 if (oText != pos.contextLimit) { |
|
439 return U_MISMATCH; |
|
440 } |
|
441 if (incremental) { |
|
442 return U_PARTIAL_MATCH; |
|
443 } |
|
444 } |
|
445 |
|
446 // =========================== REPLACE ========================== |
|
447 |
|
448 // We have a full match. The key is between pos.start and |
|
449 // keyLimit. |
|
450 |
|
451 int32_t newStart; |
|
452 int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); |
|
453 int32_t lenDelta = newLength - (keyLimit - pos.start); |
|
454 |
|
455 oText += lenDelta; |
|
456 pos.limit += lenDelta; |
|
457 pos.contextLimit += lenDelta; |
|
458 // Restrict new value of start to [minOText, min(oText, pos.limit)]. |
|
459 pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); |
|
460 return U_MATCH; |
|
461 } |
|
462 |
|
463 /** |
|
464 * Create a source string that represents this rule. Append it to the |
|
465 * given string. |
|
466 */ |
|
467 UnicodeString& TransliterationRule::toRule(UnicodeString& rule, |
|
468 UBool escapeUnprintable) const { |
|
469 |
|
470 // Accumulate special characters (and non-specials following them) |
|
471 // into quoteBuf. Append quoteBuf, within single quotes, when |
|
472 // a non-quoted element must be inserted. |
|
473 UnicodeString str, quoteBuf; |
|
474 |
|
475 // Do not emit the braces '{' '}' around the pattern if there |
|
476 // is neither anteContext nor postContext. |
|
477 UBool emitBraces = |
|
478 (anteContext != NULL) || (postContext != NULL); |
|
479 |
|
480 // Emit start anchor |
|
481 if ((flags & ANCHOR_START) != 0) { |
|
482 rule.append((UChar)94/*^*/); |
|
483 } |
|
484 |
|
485 // Emit the input pattern |
|
486 ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); |
|
487 |
|
488 if (emitBraces) { |
|
489 ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); |
|
490 } |
|
491 |
|
492 ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); |
|
493 |
|
494 if (emitBraces) { |
|
495 ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); |
|
496 } |
|
497 |
|
498 ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); |
|
499 |
|
500 // Emit end anchor |
|
501 if ((flags & ANCHOR_END) != 0) { |
|
502 rule.append((UChar)36/*$*/); |
|
503 } |
|
504 |
|
505 ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); |
|
506 |
|
507 // Emit the output pattern |
|
508 |
|
509 ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), |
|
510 TRUE, escapeUnprintable, quoteBuf); |
|
511 |
|
512 ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); |
|
513 |
|
514 return rule; |
|
515 } |
|
516 |
|
517 void TransliterationRule::setData(const TransliterationRuleData* d) { |
|
518 data = d; |
|
519 if (anteContext != NULL) anteContext->setData(d); |
|
520 if (postContext != NULL) postContext->setData(d); |
|
521 if (key != NULL) key->setData(d); |
|
522 // assert(output != NULL); |
|
523 output->setData(d); |
|
524 // Don't have to do segments since they are in the context or key |
|
525 } |
|
526 |
|
527 /** |
|
528 * Union the set of all characters that may be modified by this rule |
|
529 * into the given set. |
|
530 */ |
|
531 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { |
|
532 int32_t limit = anteContextLength + keyLength; |
|
533 for (int32_t i=anteContextLength; i<limit; ) { |
|
534 UChar32 ch = pattern.char32At(i); |
|
535 i += U16_LENGTH(ch); |
|
536 const UnicodeMatcher* matcher = data->lookupMatcher(ch); |
|
537 if (matcher == NULL) { |
|
538 toUnionTo.add(ch); |
|
539 } else { |
|
540 matcher->addMatchSetTo(toUnionTo); |
|
541 } |
|
542 } |
|
543 } |
|
544 |
|
545 /** |
|
546 * Union the set of all characters that may be emitted by this rule |
|
547 * into the given set. |
|
548 */ |
|
549 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { |
|
550 output->toReplacer()->addReplacementSetTo(toUnionTo); |
|
551 } |
|
552 |
|
553 U_NAMESPACE_END |
|
554 |
|
555 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
|
556 |
|
557 //eof |