Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2011, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 11/17/99 aliu Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/rep.h" |
michael@0 | 16 | #include "unicode/unifilt.h" |
michael@0 | 17 | #include "unicode/uniset.h" |
michael@0 | 18 | #include "unicode/utf16.h" |
michael@0 | 19 | #include "rbt_rule.h" |
michael@0 | 20 | #include "rbt_data.h" |
michael@0 | 21 | #include "cmemory.h" |
michael@0 | 22 | #include "strmatch.h" |
michael@0 | 23 | #include "strrepl.h" |
michael@0 | 24 | #include "util.h" |
michael@0 | 25 | #include "putilimp.h" |
michael@0 | 26 | |
michael@0 | 27 | static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " |
michael@0 | 28 | |
michael@0 | 29 | U_NAMESPACE_BEGIN |
michael@0 | 30 | |
michael@0 | 31 | /** |
michael@0 | 32 | * Construct a new rule with the given input, output text, and other |
michael@0 | 33 | * attributes. A cursor position may be specified for the output text. |
michael@0 | 34 | * @param input input string, including key and optional ante and |
michael@0 | 35 | * post context |
michael@0 | 36 | * @param anteContextPos offset into input to end of ante context, or -1 if |
michael@0 | 37 | * none. Must be <= input.length() if not -1. |
michael@0 | 38 | * @param postContextPos offset into input to start of post context, or -1 |
michael@0 | 39 | * if none. Must be <= input.length() if not -1, and must be >= |
michael@0 | 40 | * anteContextPos. |
michael@0 | 41 | * @param output output string |
michael@0 | 42 | * @param cursorPosition offset into output at which cursor is located, or -1 if |
michael@0 | 43 | * none. If less than zero, then the cursor is placed after the |
michael@0 | 44 | * <code>output</code>; that is, -1 is equivalent to |
michael@0 | 45 | * <code>output.length()</code>. If greater than |
michael@0 | 46 | * <code>output.length()</code> then an exception is thrown. |
michael@0 | 47 | * @param segs array of UnicodeFunctors corresponding to input pattern |
michael@0 | 48 | * segments, or null if there are none. The array itself is adopted, |
michael@0 | 49 | * but the pointers within it are not. |
michael@0 | 50 | * @param segsCount number of elements in segs[] |
michael@0 | 51 | * @param anchorStart TRUE if the the rule is anchored on the left to |
michael@0 | 52 | * the context start |
michael@0 | 53 | * @param anchorEnd TRUE if the rule is anchored on the right to the |
michael@0 | 54 | * context limit |
michael@0 | 55 | */ |
michael@0 | 56 | TransliterationRule::TransliterationRule(const UnicodeString& input, |
michael@0 | 57 | int32_t anteContextPos, int32_t postContextPos, |
michael@0 | 58 | const UnicodeString& outputStr, |
michael@0 | 59 | int32_t cursorPosition, int32_t cursorOffset, |
michael@0 | 60 | UnicodeFunctor** segs, |
michael@0 | 61 | int32_t segsCount, |
michael@0 | 62 | UBool anchorStart, UBool anchorEnd, |
michael@0 | 63 | const TransliterationRuleData* theData, |
michael@0 | 64 | UErrorCode& status) : |
michael@0 | 65 | UMemory(), |
michael@0 | 66 | segments(0), |
michael@0 | 67 | data(theData) { |
michael@0 | 68 | |
michael@0 | 69 | if (U_FAILURE(status)) { |
michael@0 | 70 | return; |
michael@0 | 71 | } |
michael@0 | 72 | // Do range checks only when warranted to save time |
michael@0 | 73 | if (anteContextPos < 0) { |
michael@0 | 74 | anteContextLength = 0; |
michael@0 | 75 | } else { |
michael@0 | 76 | if (anteContextPos > input.length()) { |
michael@0 | 77 | // throw new IllegalArgumentException("Invalid ante context"); |
michael@0 | 78 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 79 | return; |
michael@0 | 80 | } |
michael@0 | 81 | anteContextLength = anteContextPos; |
michael@0 | 82 | } |
michael@0 | 83 | if (postContextPos < 0) { |
michael@0 | 84 | keyLength = input.length() - anteContextLength; |
michael@0 | 85 | } else { |
michael@0 | 86 | if (postContextPos < anteContextLength || |
michael@0 | 87 | postContextPos > input.length()) { |
michael@0 | 88 | // throw new IllegalArgumentException("Invalid post context"); |
michael@0 | 89 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 90 | return; |
michael@0 | 91 | } |
michael@0 | 92 | keyLength = postContextPos - anteContextLength; |
michael@0 | 93 | } |
michael@0 | 94 | if (cursorPosition < 0) { |
michael@0 | 95 | cursorPosition = outputStr.length(); |
michael@0 | 96 | } else if (cursorPosition > outputStr.length()) { |
michael@0 | 97 | // throw new IllegalArgumentException("Invalid cursor position"); |
michael@0 | 98 | status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 99 | return; |
michael@0 | 100 | } |
michael@0 | 101 | // We don't validate the segments array. The caller must |
michael@0 | 102 | // guarantee that the segments are well-formed (that is, that |
michael@0 | 103 | // all $n references in the output refer to indices of this |
michael@0 | 104 | // array, and that no array elements are null). |
michael@0 | 105 | this->segments = segs; |
michael@0 | 106 | this->segmentsCount = segsCount; |
michael@0 | 107 | |
michael@0 | 108 | pattern = input; |
michael@0 | 109 | flags = 0; |
michael@0 | 110 | if (anchorStart) { |
michael@0 | 111 | flags |= ANCHOR_START; |
michael@0 | 112 | } |
michael@0 | 113 | if (anchorEnd) { |
michael@0 | 114 | flags |= ANCHOR_END; |
michael@0 | 115 | } |
michael@0 | 116 | |
michael@0 | 117 | anteContext = NULL; |
michael@0 | 118 | if (anteContextLength > 0) { |
michael@0 | 119 | anteContext = new StringMatcher(pattern, 0, anteContextLength, |
michael@0 | 120 | FALSE, *data); |
michael@0 | 121 | /* test for NULL */ |
michael@0 | 122 | if (anteContext == 0) { |
michael@0 | 123 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 124 | return; |
michael@0 | 125 | } |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | key = NULL; |
michael@0 | 129 | if (keyLength > 0) { |
michael@0 | 130 | key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, |
michael@0 | 131 | FALSE, *data); |
michael@0 | 132 | /* test for NULL */ |
michael@0 | 133 | if (key == 0) { |
michael@0 | 134 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 135 | return; |
michael@0 | 136 | } |
michael@0 | 137 | } |
michael@0 | 138 | |
michael@0 | 139 | int32_t postContextLength = pattern.length() - keyLength - anteContextLength; |
michael@0 | 140 | postContext = NULL; |
michael@0 | 141 | if (postContextLength > 0) { |
michael@0 | 142 | postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), |
michael@0 | 143 | FALSE, *data); |
michael@0 | 144 | /* test for NULL */ |
michael@0 | 145 | if (postContext == 0) { |
michael@0 | 146 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 147 | return; |
michael@0 | 148 | } |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); |
michael@0 | 152 | /* test for NULL */ |
michael@0 | 153 | if (this->output == 0) { |
michael@0 | 154 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 155 | return; |
michael@0 | 156 | } |
michael@0 | 157 | } |
michael@0 | 158 | |
michael@0 | 159 | /** |
michael@0 | 160 | * Copy constructor. |
michael@0 | 161 | */ |
michael@0 | 162 | TransliterationRule::TransliterationRule(TransliterationRule& other) : |
michael@0 | 163 | UMemory(other), |
michael@0 | 164 | anteContext(NULL), |
michael@0 | 165 | key(NULL), |
michael@0 | 166 | postContext(NULL), |
michael@0 | 167 | pattern(other.pattern), |
michael@0 | 168 | anteContextLength(other.anteContextLength), |
michael@0 | 169 | keyLength(other.keyLength), |
michael@0 | 170 | flags(other.flags), |
michael@0 | 171 | data(other.data) { |
michael@0 | 172 | |
michael@0 | 173 | segments = NULL; |
michael@0 | 174 | segmentsCount = 0; |
michael@0 | 175 | if (other.segmentsCount > 0) { |
michael@0 | 176 | segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); |
michael@0 | 177 | uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); |
michael@0 | 178 | } |
michael@0 | 179 | |
michael@0 | 180 | if (other.anteContext != NULL) { |
michael@0 | 181 | anteContext = (StringMatcher*) other.anteContext->clone(); |
michael@0 | 182 | } |
michael@0 | 183 | if (other.key != NULL) { |
michael@0 | 184 | key = (StringMatcher*) other.key->clone(); |
michael@0 | 185 | } |
michael@0 | 186 | if (other.postContext != NULL) { |
michael@0 | 187 | postContext = (StringMatcher*) other.postContext->clone(); |
michael@0 | 188 | } |
michael@0 | 189 | output = other.output->clone(); |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | TransliterationRule::~TransliterationRule() { |
michael@0 | 193 | uprv_free(segments); |
michael@0 | 194 | delete anteContext; |
michael@0 | 195 | delete key; |
michael@0 | 196 | delete postContext; |
michael@0 | 197 | delete output; |
michael@0 | 198 | } |
michael@0 | 199 | |
michael@0 | 200 | /** |
michael@0 | 201 | * Return the preceding context length. This method is needed to |
michael@0 | 202 | * support the <code>Transliterator</code> method |
michael@0 | 203 | * <code>getMaximumContextLength()</code>. Internally, this is |
michael@0 | 204 | * implemented as the anteContextLength, optionally plus one if |
michael@0 | 205 | * there is a start anchor. The one character anchor gap is |
michael@0 | 206 | * needed to make repeated incremental transliteration with |
michael@0 | 207 | * anchors work. |
michael@0 | 208 | */ |
michael@0 | 209 | int32_t TransliterationRule::getContextLength(void) const { |
michael@0 | 210 | return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); |
michael@0 | 211 | } |
michael@0 | 212 | |
michael@0 | 213 | /** |
michael@0 | 214 | * Internal method. Returns 8-bit index value for this rule. |
michael@0 | 215 | * This is the low byte of the first character of the key, |
michael@0 | 216 | * unless the first character of the key is a set. If it's a |
michael@0 | 217 | * set, or otherwise can match multiple keys, the index value is -1. |
michael@0 | 218 | */ |
michael@0 | 219 | int16_t TransliterationRule::getIndexValue() const { |
michael@0 | 220 | if (anteContextLength == pattern.length()) { |
michael@0 | 221 | // A pattern with just ante context {such as foo)>bar} can |
michael@0 | 222 | // match any key. |
michael@0 | 223 | return -1; |
michael@0 | 224 | } |
michael@0 | 225 | UChar32 c = pattern.char32At(anteContextLength); |
michael@0 | 226 | return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); |
michael@0 | 227 | } |
michael@0 | 228 | |
michael@0 | 229 | /** |
michael@0 | 230 | * Internal method. Returns true if this rule matches the given |
michael@0 | 231 | * index value. The index value is an 8-bit integer, 0..255, |
michael@0 | 232 | * representing the low byte of the first character of the key. |
michael@0 | 233 | * It matches this rule if it matches the first character of the |
michael@0 | 234 | * key, or if the first character of the key is a set, and the set |
michael@0 | 235 | * contains any character with a low byte equal to the index |
michael@0 | 236 | * value. If the rule contains only ante context, as in foo)>bar, |
michael@0 | 237 | * then it will match any key. |
michael@0 | 238 | */ |
michael@0 | 239 | UBool TransliterationRule::matchesIndexValue(uint8_t v) const { |
michael@0 | 240 | // Delegate to the key, or if there is none, to the postContext. |
michael@0 | 241 | // If there is neither then we match any key; return true. |
michael@0 | 242 | UnicodeMatcher *m = (key != NULL) ? key : postContext; |
michael@0 | 243 | return (m != NULL) ? m->matchesIndexValue(v) : TRUE; |
michael@0 | 244 | } |
michael@0 | 245 | |
michael@0 | 246 | /** |
michael@0 | 247 | * Return true if this rule masks another rule. If r1 masks r2 then |
michael@0 | 248 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
michael@0 | 249 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
michael@0 | 250 | * "[c]a>x" masks "[dc]a>y". |
michael@0 | 251 | */ |
michael@0 | 252 | UBool TransliterationRule::masks(const TransliterationRule& r2) const { |
michael@0 | 253 | /* Rule r1 masks rule r2 if the string formed of the |
michael@0 | 254 | * antecontext, key, and postcontext overlaps in the following |
michael@0 | 255 | * way: |
michael@0 | 256 | * |
michael@0 | 257 | * r1: aakkkpppp |
michael@0 | 258 | * r2: aaakkkkkpppp |
michael@0 | 259 | * ^ |
michael@0 | 260 | * |
michael@0 | 261 | * The strings must be aligned at the first character of the |
michael@0 | 262 | * key. The length of r1 to the left of the alignment point |
michael@0 | 263 | * must be <= the length of r2 to the left; ditto for the |
michael@0 | 264 | * right. The characters of r1 must equal (or be a superset |
michael@0 | 265 | * of) the corresponding characters of r2. The superset |
michael@0 | 266 | * operation should be performed to check for UnicodeSet |
michael@0 | 267 | * masking. |
michael@0 | 268 | * |
michael@0 | 269 | * Anchors: Two patterns that differ only in anchors only |
michael@0 | 270 | * mask one another if they are exactly equal, and r2 has |
michael@0 | 271 | * all the anchors r1 has (optionally, plus some). Here Y |
michael@0 | 272 | * means the row masks the column, N means it doesn't. |
michael@0 | 273 | * |
michael@0 | 274 | * ab ^ab ab$ ^ab$ |
michael@0 | 275 | * ab Y Y Y Y |
michael@0 | 276 | * ^ab N Y N Y |
michael@0 | 277 | * ab$ N N Y Y |
michael@0 | 278 | * ^ab$ N N N Y |
michael@0 | 279 | * |
michael@0 | 280 | * Post context: {a}b masks ab, but not vice versa, since {a}b |
michael@0 | 281 | * matches everything ab matches, and {a}b matches {|a|}b but ab |
michael@0 | 282 | * does not. Pre context is different (a{b} does not align with |
michael@0 | 283 | * ab). |
michael@0 | 284 | */ |
michael@0 | 285 | |
michael@0 | 286 | /* LIMITATION of the current mask algorithm: Some rule |
michael@0 | 287 | * maskings are currently not detected. For example, |
michael@0 | 288 | * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO |
michael@0 | 289 | */ |
michael@0 | 290 | |
michael@0 | 291 | int32_t len = pattern.length(); |
michael@0 | 292 | int32_t left = anteContextLength; |
michael@0 | 293 | int32_t left2 = r2.anteContextLength; |
michael@0 | 294 | int32_t right = len - left; |
michael@0 | 295 | int32_t right2 = r2.pattern.length() - left2; |
michael@0 | 296 | int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); |
michael@0 | 297 | |
michael@0 | 298 | // TODO Clean this up -- some logic might be combinable with the |
michael@0 | 299 | // next statement. |
michael@0 | 300 | |
michael@0 | 301 | // Test for anchor masking |
michael@0 | 302 | if (left == left2 && right == right2 && |
michael@0 | 303 | keyLength <= r2.keyLength && |
michael@0 | 304 | 0 == cachedCompare) { |
michael@0 | 305 | // The following boolean logic implements the table above |
michael@0 | 306 | return (flags == r2.flags) || |
michael@0 | 307 | (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || |
michael@0 | 308 | ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); |
michael@0 | 309 | } |
michael@0 | 310 | |
michael@0 | 311 | return left <= left2 && |
michael@0 | 312 | (right < right2 || |
michael@0 | 313 | (right == right2 && keyLength <= r2.keyLength)) && |
michael@0 | 314 | (0 == cachedCompare); |
michael@0 | 315 | } |
michael@0 | 316 | |
michael@0 | 317 | static inline int32_t posBefore(const Replaceable& str, int32_t pos) { |
michael@0 | 318 | return (pos > 0) ? |
michael@0 | 319 | pos - U16_LENGTH(str.char32At(pos-1)) : |
michael@0 | 320 | pos - 1; |
michael@0 | 321 | } |
michael@0 | 322 | |
michael@0 | 323 | static inline int32_t posAfter(const Replaceable& str, int32_t pos) { |
michael@0 | 324 | return (pos >= 0 && pos < str.length()) ? |
michael@0 | 325 | pos + U16_LENGTH(str.char32At(pos)) : |
michael@0 | 326 | pos + 1; |
michael@0 | 327 | } |
michael@0 | 328 | |
michael@0 | 329 | /** |
michael@0 | 330 | * Attempt a match and replacement at the given position. Return |
michael@0 | 331 | * the degree of match between this rule and the given text. The |
michael@0 | 332 | * degree of match may be mismatch, a partial match, or a full |
michael@0 | 333 | * match. A mismatch means at least one character of the text |
michael@0 | 334 | * does not match the context or key. A partial match means some |
michael@0 | 335 | * context and key characters match, but the text is not long |
michael@0 | 336 | * enough to match all of them. A full match means all context |
michael@0 | 337 | * and key characters match. |
michael@0 | 338 | * |
michael@0 | 339 | * If a full match is obtained, perform a replacement, update pos, |
michael@0 | 340 | * and return U_MATCH. Otherwise both text and pos are unchanged. |
michael@0 | 341 | * |
michael@0 | 342 | * @param text the text |
michael@0 | 343 | * @param pos the position indices |
michael@0 | 344 | * @param incremental if TRUE, test for partial matches that may |
michael@0 | 345 | * be completed by additional text inserted at pos.limit. |
michael@0 | 346 | * @return one of <code>U_MISMATCH</code>, |
michael@0 | 347 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
michael@0 | 348 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
michael@0 | 349 | */ |
michael@0 | 350 | UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, |
michael@0 | 351 | UTransPosition& pos, |
michael@0 | 352 | UBool incremental) const { |
michael@0 | 353 | // Matching and replacing are done in one method because the |
michael@0 | 354 | // replacement operation needs information obtained during the |
michael@0 | 355 | // match. Another way to do this is to have the match method |
michael@0 | 356 | // create a match result struct with relevant offsets, and to pass |
michael@0 | 357 | // this into the replace method. |
michael@0 | 358 | |
michael@0 | 359 | // ============================ MATCH =========================== |
michael@0 | 360 | |
michael@0 | 361 | // Reset segment match data |
michael@0 | 362 | if (segments != NULL) { |
michael@0 | 363 | for (int32_t i=0; i<segmentsCount; ++i) { |
michael@0 | 364 | ((StringMatcher*) segments[i])->resetMatch(); |
michael@0 | 365 | } |
michael@0 | 366 | } |
michael@0 | 367 | |
michael@0 | 368 | // int32_t lenDelta, keyLimit; |
michael@0 | 369 | int32_t keyLimit; |
michael@0 | 370 | |
michael@0 | 371 | // ------------------------ Ante Context ------------------------ |
michael@0 | 372 | |
michael@0 | 373 | // A mismatch in the ante context, or with the start anchor, |
michael@0 | 374 | // is an outright U_MISMATCH regardless of whether we are |
michael@0 | 375 | // incremental or not. |
michael@0 | 376 | int32_t oText; // offset into 'text' |
michael@0 | 377 | // int32_t newStart = 0; |
michael@0 | 378 | int32_t minOText; |
michael@0 | 379 | |
michael@0 | 380 | // Note (1): We process text in 16-bit code units, rather than |
michael@0 | 381 | // 32-bit code points. This works because stand-ins are |
michael@0 | 382 | // always in the BMP and because we are doing a literal match |
michael@0 | 383 | // operation, which can be done 16-bits at a time. |
michael@0 | 384 | |
michael@0 | 385 | int32_t anteLimit = posBefore(text, pos.contextStart); |
michael@0 | 386 | |
michael@0 | 387 | UMatchDegree match; |
michael@0 | 388 | |
michael@0 | 389 | // Start reverse match at char before pos.start |
michael@0 | 390 | oText = posBefore(text, pos.start); |
michael@0 | 391 | |
michael@0 | 392 | if (anteContext != NULL) { |
michael@0 | 393 | match = anteContext->matches(text, oText, anteLimit, FALSE); |
michael@0 | 394 | if (match != U_MATCH) { |
michael@0 | 395 | return U_MISMATCH; |
michael@0 | 396 | } |
michael@0 | 397 | } |
michael@0 | 398 | |
michael@0 | 399 | minOText = posAfter(text, oText); |
michael@0 | 400 | |
michael@0 | 401 | // ------------------------ Start Anchor ------------------------ |
michael@0 | 402 | |
michael@0 | 403 | if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { |
michael@0 | 404 | return U_MISMATCH; |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | // -------------------- Key and Post Context -------------------- |
michael@0 | 408 | |
michael@0 | 409 | oText = pos.start; |
michael@0 | 410 | |
michael@0 | 411 | if (key != NULL) { |
michael@0 | 412 | match = key->matches(text, oText, pos.limit, incremental); |
michael@0 | 413 | if (match != U_MATCH) { |
michael@0 | 414 | return match; |
michael@0 | 415 | } |
michael@0 | 416 | } |
michael@0 | 417 | |
michael@0 | 418 | keyLimit = oText; |
michael@0 | 419 | |
michael@0 | 420 | if (postContext != NULL) { |
michael@0 | 421 | if (incremental && keyLimit == pos.limit) { |
michael@0 | 422 | // The key matches just before pos.limit, and there is |
michael@0 | 423 | // a postContext. Since we are in incremental mode, |
michael@0 | 424 | // we must assume more characters may be inserted at |
michael@0 | 425 | // pos.limit -- this is a partial match. |
michael@0 | 426 | return U_PARTIAL_MATCH; |
michael@0 | 427 | } |
michael@0 | 428 | |
michael@0 | 429 | match = postContext->matches(text, oText, pos.contextLimit, incremental); |
michael@0 | 430 | if (match != U_MATCH) { |
michael@0 | 431 | return match; |
michael@0 | 432 | } |
michael@0 | 433 | } |
michael@0 | 434 | |
michael@0 | 435 | // ------------------------- Stop Anchor ------------------------ |
michael@0 | 436 | |
michael@0 | 437 | if (((flags & ANCHOR_END)) != 0) { |
michael@0 | 438 | if (oText != pos.contextLimit) { |
michael@0 | 439 | return U_MISMATCH; |
michael@0 | 440 | } |
michael@0 | 441 | if (incremental) { |
michael@0 | 442 | return U_PARTIAL_MATCH; |
michael@0 | 443 | } |
michael@0 | 444 | } |
michael@0 | 445 | |
michael@0 | 446 | // =========================== REPLACE ========================== |
michael@0 | 447 | |
michael@0 | 448 | // We have a full match. The key is between pos.start and |
michael@0 | 449 | // keyLimit. |
michael@0 | 450 | |
michael@0 | 451 | int32_t newStart; |
michael@0 | 452 | int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); |
michael@0 | 453 | int32_t lenDelta = newLength - (keyLimit - pos.start); |
michael@0 | 454 | |
michael@0 | 455 | oText += lenDelta; |
michael@0 | 456 | pos.limit += lenDelta; |
michael@0 | 457 | pos.contextLimit += lenDelta; |
michael@0 | 458 | // Restrict new value of start to [minOText, min(oText, pos.limit)]. |
michael@0 | 459 | pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); |
michael@0 | 460 | return U_MATCH; |
michael@0 | 461 | } |
michael@0 | 462 | |
michael@0 | 463 | /** |
michael@0 | 464 | * Create a source string that represents this rule. Append it to the |
michael@0 | 465 | * given string. |
michael@0 | 466 | */ |
michael@0 | 467 | UnicodeString& TransliterationRule::toRule(UnicodeString& rule, |
michael@0 | 468 | UBool escapeUnprintable) const { |
michael@0 | 469 | |
michael@0 | 470 | // Accumulate special characters (and non-specials following them) |
michael@0 | 471 | // into quoteBuf. Append quoteBuf, within single quotes, when |
michael@0 | 472 | // a non-quoted element must be inserted. |
michael@0 | 473 | UnicodeString str, quoteBuf; |
michael@0 | 474 | |
michael@0 | 475 | // Do not emit the braces '{' '}' around the pattern if there |
michael@0 | 476 | // is neither anteContext nor postContext. |
michael@0 | 477 | UBool emitBraces = |
michael@0 | 478 | (anteContext != NULL) || (postContext != NULL); |
michael@0 | 479 | |
michael@0 | 480 | // Emit start anchor |
michael@0 | 481 | if ((flags & ANCHOR_START) != 0) { |
michael@0 | 482 | rule.append((UChar)94/*^*/); |
michael@0 | 483 | } |
michael@0 | 484 | |
michael@0 | 485 | // Emit the input pattern |
michael@0 | 486 | ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); |
michael@0 | 487 | |
michael@0 | 488 | if (emitBraces) { |
michael@0 | 489 | ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 490 | } |
michael@0 | 491 | |
michael@0 | 492 | ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); |
michael@0 | 493 | |
michael@0 | 494 | if (emitBraces) { |
michael@0 | 495 | ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 496 | } |
michael@0 | 497 | |
michael@0 | 498 | ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); |
michael@0 | 499 | |
michael@0 | 500 | // Emit end anchor |
michael@0 | 501 | if ((flags & ANCHOR_END) != 0) { |
michael@0 | 502 | rule.append((UChar)36/*$*/); |
michael@0 | 503 | } |
michael@0 | 504 | |
michael@0 | 505 | ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 506 | |
michael@0 | 507 | // Emit the output pattern |
michael@0 | 508 | |
michael@0 | 509 | ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), |
michael@0 | 510 | TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 511 | |
michael@0 | 512 | ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 513 | |
michael@0 | 514 | return rule; |
michael@0 | 515 | } |
michael@0 | 516 | |
michael@0 | 517 | void TransliterationRule::setData(const TransliterationRuleData* d) { |
michael@0 | 518 | data = d; |
michael@0 | 519 | if (anteContext != NULL) anteContext->setData(d); |
michael@0 | 520 | if (postContext != NULL) postContext->setData(d); |
michael@0 | 521 | if (key != NULL) key->setData(d); |
michael@0 | 522 | // assert(output != NULL); |
michael@0 | 523 | output->setData(d); |
michael@0 | 524 | // Don't have to do segments since they are in the context or key |
michael@0 | 525 | } |
michael@0 | 526 | |
michael@0 | 527 | /** |
michael@0 | 528 | * Union the set of all characters that may be modified by this rule |
michael@0 | 529 | * into the given set. |
michael@0 | 530 | */ |
michael@0 | 531 | void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { |
michael@0 | 532 | int32_t limit = anteContextLength + keyLength; |
michael@0 | 533 | for (int32_t i=anteContextLength; i<limit; ) { |
michael@0 | 534 | UChar32 ch = pattern.char32At(i); |
michael@0 | 535 | i += U16_LENGTH(ch); |
michael@0 | 536 | const UnicodeMatcher* matcher = data->lookupMatcher(ch); |
michael@0 | 537 | if (matcher == NULL) { |
michael@0 | 538 | toUnionTo.add(ch); |
michael@0 | 539 | } else { |
michael@0 | 540 | matcher->addMatchSetTo(toUnionTo); |
michael@0 | 541 | } |
michael@0 | 542 | } |
michael@0 | 543 | } |
michael@0 | 544 | |
michael@0 | 545 | /** |
michael@0 | 546 | * Union the set of all characters that may be emitted by this rule |
michael@0 | 547 | * into the given set. |
michael@0 | 548 | */ |
michael@0 | 549 | void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { |
michael@0 | 550 | output->toReplacer()->addReplacementSetTo(toUnionTo); |
michael@0 | 551 | } |
michael@0 | 552 | |
michael@0 | 553 | U_NAMESPACE_END |
michael@0 | 554 | |
michael@0 | 555 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 556 | |
michael@0 | 557 | //eof |