1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/rbt_rule.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,557 @@ 1.4 +/* 1.5 + ********************************************************************** 1.6 + * Copyright (C) 1999-2011, International Business Machines 1.7 + * Corporation and others. All Rights Reserved. 1.8 + ********************************************************************** 1.9 + * Date Name Description 1.10 + * 11/17/99 aliu Creation. 1.11 + ********************************************************************** 1.12 + */ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/rep.h" 1.19 +#include "unicode/unifilt.h" 1.20 +#include "unicode/uniset.h" 1.21 +#include "unicode/utf16.h" 1.22 +#include "rbt_rule.h" 1.23 +#include "rbt_data.h" 1.24 +#include "cmemory.h" 1.25 +#include "strmatch.h" 1.26 +#include "strrepl.h" 1.27 +#include "util.h" 1.28 +#include "putilimp.h" 1.29 + 1.30 +static const UChar FORWARD_OP[] = {32,62,32,0}; // " > " 1.31 + 1.32 +U_NAMESPACE_BEGIN 1.33 + 1.34 +/** 1.35 + * Construct a new rule with the given input, output text, and other 1.36 + * attributes. A cursor position may be specified for the output text. 1.37 + * @param input input string, including key and optional ante and 1.38 + * post context 1.39 + * @param anteContextPos offset into input to end of ante context, or -1 if 1.40 + * none. Must be <= input.length() if not -1. 1.41 + * @param postContextPos offset into input to start of post context, or -1 1.42 + * if none. Must be <= input.length() if not -1, and must be >= 1.43 + * anteContextPos. 1.44 + * @param output output string 1.45 + * @param cursorPosition offset into output at which cursor is located, or -1 if 1.46 + * none. If less than zero, then the cursor is placed after the 1.47 + * <code>output</code>; that is, -1 is equivalent to 1.48 + * <code>output.length()</code>. If greater than 1.49 + * <code>output.length()</code> then an exception is thrown. 1.50 + * @param segs array of UnicodeFunctors corresponding to input pattern 1.51 + * segments, or null if there are none. The array itself is adopted, 1.52 + * but the pointers within it are not. 1.53 + * @param segsCount number of elements in segs[] 1.54 + * @param anchorStart TRUE if the the rule is anchored on the left to 1.55 + * the context start 1.56 + * @param anchorEnd TRUE if the rule is anchored on the right to the 1.57 + * context limit 1.58 + */ 1.59 +TransliterationRule::TransliterationRule(const UnicodeString& input, 1.60 + int32_t anteContextPos, int32_t postContextPos, 1.61 + const UnicodeString& outputStr, 1.62 + int32_t cursorPosition, int32_t cursorOffset, 1.63 + UnicodeFunctor** segs, 1.64 + int32_t segsCount, 1.65 + UBool anchorStart, UBool anchorEnd, 1.66 + const TransliterationRuleData* theData, 1.67 + UErrorCode& status) : 1.68 + UMemory(), 1.69 + segments(0), 1.70 + data(theData) { 1.71 + 1.72 + if (U_FAILURE(status)) { 1.73 + return; 1.74 + } 1.75 + // Do range checks only when warranted to save time 1.76 + if (anteContextPos < 0) { 1.77 + anteContextLength = 0; 1.78 + } else { 1.79 + if (anteContextPos > input.length()) { 1.80 + // throw new IllegalArgumentException("Invalid ante context"); 1.81 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.82 + return; 1.83 + } 1.84 + anteContextLength = anteContextPos; 1.85 + } 1.86 + if (postContextPos < 0) { 1.87 + keyLength = input.length() - anteContextLength; 1.88 + } else { 1.89 + if (postContextPos < anteContextLength || 1.90 + postContextPos > input.length()) { 1.91 + // throw new IllegalArgumentException("Invalid post context"); 1.92 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.93 + return; 1.94 + } 1.95 + keyLength = postContextPos - anteContextLength; 1.96 + } 1.97 + if (cursorPosition < 0) { 1.98 + cursorPosition = outputStr.length(); 1.99 + } else if (cursorPosition > outputStr.length()) { 1.100 + // throw new IllegalArgumentException("Invalid cursor position"); 1.101 + status = U_ILLEGAL_ARGUMENT_ERROR; 1.102 + return; 1.103 + } 1.104 + // We don't validate the segments array. The caller must 1.105 + // guarantee that the segments are well-formed (that is, that 1.106 + // all $n references in the output refer to indices of this 1.107 + // array, and that no array elements are null). 1.108 + this->segments = segs; 1.109 + this->segmentsCount = segsCount; 1.110 + 1.111 + pattern = input; 1.112 + flags = 0; 1.113 + if (anchorStart) { 1.114 + flags |= ANCHOR_START; 1.115 + } 1.116 + if (anchorEnd) { 1.117 + flags |= ANCHOR_END; 1.118 + } 1.119 + 1.120 + anteContext = NULL; 1.121 + if (anteContextLength > 0) { 1.122 + anteContext = new StringMatcher(pattern, 0, anteContextLength, 1.123 + FALSE, *data); 1.124 + /* test for NULL */ 1.125 + if (anteContext == 0) { 1.126 + status = U_MEMORY_ALLOCATION_ERROR; 1.127 + return; 1.128 + } 1.129 + } 1.130 + 1.131 + key = NULL; 1.132 + if (keyLength > 0) { 1.133 + key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength, 1.134 + FALSE, *data); 1.135 + /* test for NULL */ 1.136 + if (key == 0) { 1.137 + status = U_MEMORY_ALLOCATION_ERROR; 1.138 + return; 1.139 + } 1.140 + } 1.141 + 1.142 + int32_t postContextLength = pattern.length() - keyLength - anteContextLength; 1.143 + postContext = NULL; 1.144 + if (postContextLength > 0) { 1.145 + postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(), 1.146 + FALSE, *data); 1.147 + /* test for NULL */ 1.148 + if (postContext == 0) { 1.149 + status = U_MEMORY_ALLOCATION_ERROR; 1.150 + return; 1.151 + } 1.152 + } 1.153 + 1.154 + this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data); 1.155 + /* test for NULL */ 1.156 + if (this->output == 0) { 1.157 + status = U_MEMORY_ALLOCATION_ERROR; 1.158 + return; 1.159 + } 1.160 +} 1.161 + 1.162 +/** 1.163 + * Copy constructor. 1.164 + */ 1.165 +TransliterationRule::TransliterationRule(TransliterationRule& other) : 1.166 + UMemory(other), 1.167 + anteContext(NULL), 1.168 + key(NULL), 1.169 + postContext(NULL), 1.170 + pattern(other.pattern), 1.171 + anteContextLength(other.anteContextLength), 1.172 + keyLength(other.keyLength), 1.173 + flags(other.flags), 1.174 + data(other.data) { 1.175 + 1.176 + segments = NULL; 1.177 + segmentsCount = 0; 1.178 + if (other.segmentsCount > 0) { 1.179 + segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *)); 1.180 + uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0])); 1.181 + } 1.182 + 1.183 + if (other.anteContext != NULL) { 1.184 + anteContext = (StringMatcher*) other.anteContext->clone(); 1.185 + } 1.186 + if (other.key != NULL) { 1.187 + key = (StringMatcher*) other.key->clone(); 1.188 + } 1.189 + if (other.postContext != NULL) { 1.190 + postContext = (StringMatcher*) other.postContext->clone(); 1.191 + } 1.192 + output = other.output->clone(); 1.193 +} 1.194 + 1.195 +TransliterationRule::~TransliterationRule() { 1.196 + uprv_free(segments); 1.197 + delete anteContext; 1.198 + delete key; 1.199 + delete postContext; 1.200 + delete output; 1.201 +} 1.202 + 1.203 +/** 1.204 + * Return the preceding context length. This method is needed to 1.205 + * support the <code>Transliterator</code> method 1.206 + * <code>getMaximumContextLength()</code>. Internally, this is 1.207 + * implemented as the anteContextLength, optionally plus one if 1.208 + * there is a start anchor. The one character anchor gap is 1.209 + * needed to make repeated incremental transliteration with 1.210 + * anchors work. 1.211 + */ 1.212 +int32_t TransliterationRule::getContextLength(void) const { 1.213 + return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0); 1.214 +} 1.215 + 1.216 +/** 1.217 + * Internal method. Returns 8-bit index value for this rule. 1.218 + * This is the low byte of the first character of the key, 1.219 + * unless the first character of the key is a set. If it's a 1.220 + * set, or otherwise can match multiple keys, the index value is -1. 1.221 + */ 1.222 +int16_t TransliterationRule::getIndexValue() const { 1.223 + if (anteContextLength == pattern.length()) { 1.224 + // A pattern with just ante context {such as foo)>bar} can 1.225 + // match any key. 1.226 + return -1; 1.227 + } 1.228 + UChar32 c = pattern.char32At(anteContextLength); 1.229 + return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1); 1.230 +} 1.231 + 1.232 +/** 1.233 + * Internal method. Returns true if this rule matches the given 1.234 + * index value. The index value is an 8-bit integer, 0..255, 1.235 + * representing the low byte of the first character of the key. 1.236 + * It matches this rule if it matches the first character of the 1.237 + * key, or if the first character of the key is a set, and the set 1.238 + * contains any character with a low byte equal to the index 1.239 + * value. If the rule contains only ante context, as in foo)>bar, 1.240 + * then it will match any key. 1.241 + */ 1.242 +UBool TransliterationRule::matchesIndexValue(uint8_t v) const { 1.243 + // Delegate to the key, or if there is none, to the postContext. 1.244 + // If there is neither then we match any key; return true. 1.245 + UnicodeMatcher *m = (key != NULL) ? key : postContext; 1.246 + return (m != NULL) ? m->matchesIndexValue(v) : TRUE; 1.247 +} 1.248 + 1.249 +/** 1.250 + * Return true if this rule masks another rule. If r1 masks r2 then 1.251 + * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 1.252 + * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 1.253 + * "[c]a>x" masks "[dc]a>y". 1.254 + */ 1.255 +UBool TransliterationRule::masks(const TransliterationRule& r2) const { 1.256 + /* Rule r1 masks rule r2 if the string formed of the 1.257 + * antecontext, key, and postcontext overlaps in the following 1.258 + * way: 1.259 + * 1.260 + * r1: aakkkpppp 1.261 + * r2: aaakkkkkpppp 1.262 + * ^ 1.263 + * 1.264 + * The strings must be aligned at the first character of the 1.265 + * key. The length of r1 to the left of the alignment point 1.266 + * must be <= the length of r2 to the left; ditto for the 1.267 + * right. The characters of r1 must equal (or be a superset 1.268 + * of) the corresponding characters of r2. The superset 1.269 + * operation should be performed to check for UnicodeSet 1.270 + * masking. 1.271 + * 1.272 + * Anchors: Two patterns that differ only in anchors only 1.273 + * mask one another if they are exactly equal, and r2 has 1.274 + * all the anchors r1 has (optionally, plus some). Here Y 1.275 + * means the row masks the column, N means it doesn't. 1.276 + * 1.277 + * ab ^ab ab$ ^ab$ 1.278 + * ab Y Y Y Y 1.279 + * ^ab N Y N Y 1.280 + * ab$ N N Y Y 1.281 + * ^ab$ N N N Y 1.282 + * 1.283 + * Post context: {a}b masks ab, but not vice versa, since {a}b 1.284 + * matches everything ab matches, and {a}b matches {|a|}b but ab 1.285 + * does not. Pre context is different (a{b} does not align with 1.286 + * ab). 1.287 + */ 1.288 + 1.289 + /* LIMITATION of the current mask algorithm: Some rule 1.290 + * maskings are currently not detected. For example, 1.291 + * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO 1.292 + */ 1.293 + 1.294 + int32_t len = pattern.length(); 1.295 + int32_t left = anteContextLength; 1.296 + int32_t left2 = r2.anteContextLength; 1.297 + int32_t right = len - left; 1.298 + int32_t right2 = r2.pattern.length() - left2; 1.299 + int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern); 1.300 + 1.301 + // TODO Clean this up -- some logic might be combinable with the 1.302 + // next statement. 1.303 + 1.304 + // Test for anchor masking 1.305 + if (left == left2 && right == right2 && 1.306 + keyLength <= r2.keyLength && 1.307 + 0 == cachedCompare) { 1.308 + // The following boolean logic implements the table above 1.309 + return (flags == r2.flags) || 1.310 + (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) || 1.311 + ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END)); 1.312 + } 1.313 + 1.314 + return left <= left2 && 1.315 + (right < right2 || 1.316 + (right == right2 && keyLength <= r2.keyLength)) && 1.317 + (0 == cachedCompare); 1.318 +} 1.319 + 1.320 +static inline int32_t posBefore(const Replaceable& str, int32_t pos) { 1.321 + return (pos > 0) ? 1.322 + pos - U16_LENGTH(str.char32At(pos-1)) : 1.323 + pos - 1; 1.324 +} 1.325 + 1.326 +static inline int32_t posAfter(const Replaceable& str, int32_t pos) { 1.327 + return (pos >= 0 && pos < str.length()) ? 1.328 + pos + U16_LENGTH(str.char32At(pos)) : 1.329 + pos + 1; 1.330 +} 1.331 + 1.332 +/** 1.333 + * Attempt a match and replacement at the given position. Return 1.334 + * the degree of match between this rule and the given text. The 1.335 + * degree of match may be mismatch, a partial match, or a full 1.336 + * match. A mismatch means at least one character of the text 1.337 + * does not match the context or key. A partial match means some 1.338 + * context and key characters match, but the text is not long 1.339 + * enough to match all of them. A full match means all context 1.340 + * and key characters match. 1.341 + * 1.342 + * If a full match is obtained, perform a replacement, update pos, 1.343 + * and return U_MATCH. Otherwise both text and pos are unchanged. 1.344 + * 1.345 + * @param text the text 1.346 + * @param pos the position indices 1.347 + * @param incremental if TRUE, test for partial matches that may 1.348 + * be completed by additional text inserted at pos.limit. 1.349 + * @return one of <code>U_MISMATCH</code>, 1.350 + * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 1.351 + * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 1.352 + */ 1.353 +UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text, 1.354 + UTransPosition& pos, 1.355 + UBool incremental) const { 1.356 + // Matching and replacing are done in one method because the 1.357 + // replacement operation needs information obtained during the 1.358 + // match. Another way to do this is to have the match method 1.359 + // create a match result struct with relevant offsets, and to pass 1.360 + // this into the replace method. 1.361 + 1.362 + // ============================ MATCH =========================== 1.363 + 1.364 + // Reset segment match data 1.365 + if (segments != NULL) { 1.366 + for (int32_t i=0; i<segmentsCount; ++i) { 1.367 + ((StringMatcher*) segments[i])->resetMatch(); 1.368 + } 1.369 + } 1.370 + 1.371 +// int32_t lenDelta, keyLimit; 1.372 + int32_t keyLimit; 1.373 + 1.374 + // ------------------------ Ante Context ------------------------ 1.375 + 1.376 + // A mismatch in the ante context, or with the start anchor, 1.377 + // is an outright U_MISMATCH regardless of whether we are 1.378 + // incremental or not. 1.379 + int32_t oText; // offset into 'text' 1.380 +// int32_t newStart = 0; 1.381 + int32_t minOText; 1.382 + 1.383 + // Note (1): We process text in 16-bit code units, rather than 1.384 + // 32-bit code points. This works because stand-ins are 1.385 + // always in the BMP and because we are doing a literal match 1.386 + // operation, which can be done 16-bits at a time. 1.387 + 1.388 + int32_t anteLimit = posBefore(text, pos.contextStart); 1.389 + 1.390 + UMatchDegree match; 1.391 + 1.392 + // Start reverse match at char before pos.start 1.393 + oText = posBefore(text, pos.start); 1.394 + 1.395 + if (anteContext != NULL) { 1.396 + match = anteContext->matches(text, oText, anteLimit, FALSE); 1.397 + if (match != U_MATCH) { 1.398 + return U_MISMATCH; 1.399 + } 1.400 + } 1.401 + 1.402 + minOText = posAfter(text, oText); 1.403 + 1.404 + // ------------------------ Start Anchor ------------------------ 1.405 + 1.406 + if (((flags & ANCHOR_START) != 0) && oText != anteLimit) { 1.407 + return U_MISMATCH; 1.408 + } 1.409 + 1.410 + // -------------------- Key and Post Context -------------------- 1.411 + 1.412 + oText = pos.start; 1.413 + 1.414 + if (key != NULL) { 1.415 + match = key->matches(text, oText, pos.limit, incremental); 1.416 + if (match != U_MATCH) { 1.417 + return match; 1.418 + } 1.419 + } 1.420 + 1.421 + keyLimit = oText; 1.422 + 1.423 + if (postContext != NULL) { 1.424 + if (incremental && keyLimit == pos.limit) { 1.425 + // The key matches just before pos.limit, and there is 1.426 + // a postContext. Since we are in incremental mode, 1.427 + // we must assume more characters may be inserted at 1.428 + // pos.limit -- this is a partial match. 1.429 + return U_PARTIAL_MATCH; 1.430 + } 1.431 + 1.432 + match = postContext->matches(text, oText, pos.contextLimit, incremental); 1.433 + if (match != U_MATCH) { 1.434 + return match; 1.435 + } 1.436 + } 1.437 + 1.438 + // ------------------------- Stop Anchor ------------------------ 1.439 + 1.440 + if (((flags & ANCHOR_END)) != 0) { 1.441 + if (oText != pos.contextLimit) { 1.442 + return U_MISMATCH; 1.443 + } 1.444 + if (incremental) { 1.445 + return U_PARTIAL_MATCH; 1.446 + } 1.447 + } 1.448 + 1.449 + // =========================== REPLACE ========================== 1.450 + 1.451 + // We have a full match. The key is between pos.start and 1.452 + // keyLimit. 1.453 + 1.454 + int32_t newStart; 1.455 + int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart); 1.456 + int32_t lenDelta = newLength - (keyLimit - pos.start); 1.457 + 1.458 + oText += lenDelta; 1.459 + pos.limit += lenDelta; 1.460 + pos.contextLimit += lenDelta; 1.461 + // Restrict new value of start to [minOText, min(oText, pos.limit)]. 1.462 + pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart)); 1.463 + return U_MATCH; 1.464 +} 1.465 + 1.466 +/** 1.467 + * Create a source string that represents this rule. Append it to the 1.468 + * given string. 1.469 + */ 1.470 +UnicodeString& TransliterationRule::toRule(UnicodeString& rule, 1.471 + UBool escapeUnprintable) const { 1.472 + 1.473 + // Accumulate special characters (and non-specials following them) 1.474 + // into quoteBuf. Append quoteBuf, within single quotes, when 1.475 + // a non-quoted element must be inserted. 1.476 + UnicodeString str, quoteBuf; 1.477 + 1.478 + // Do not emit the braces '{' '}' around the pattern if there 1.479 + // is neither anteContext nor postContext. 1.480 + UBool emitBraces = 1.481 + (anteContext != NULL) || (postContext != NULL); 1.482 + 1.483 + // Emit start anchor 1.484 + if ((flags & ANCHOR_START) != 0) { 1.485 + rule.append((UChar)94/*^*/); 1.486 + } 1.487 + 1.488 + // Emit the input pattern 1.489 + ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf); 1.490 + 1.491 + if (emitBraces) { 1.492 + ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf); 1.493 + } 1.494 + 1.495 + ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf); 1.496 + 1.497 + if (emitBraces) { 1.498 + ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf); 1.499 + } 1.500 + 1.501 + ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf); 1.502 + 1.503 + // Emit end anchor 1.504 + if ((flags & ANCHOR_END) != 0) { 1.505 + rule.append((UChar)36/*$*/); 1.506 + } 1.507 + 1.508 + ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf); 1.509 + 1.510 + // Emit the output pattern 1.511 + 1.512 + ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable), 1.513 + TRUE, escapeUnprintable, quoteBuf); 1.514 + 1.515 + ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf); 1.516 + 1.517 + return rule; 1.518 +} 1.519 + 1.520 +void TransliterationRule::setData(const TransliterationRuleData* d) { 1.521 + data = d; 1.522 + if (anteContext != NULL) anteContext->setData(d); 1.523 + if (postContext != NULL) postContext->setData(d); 1.524 + if (key != NULL) key->setData(d); 1.525 + // assert(output != NULL); 1.526 + output->setData(d); 1.527 + // Don't have to do segments since they are in the context or key 1.528 +} 1.529 + 1.530 +/** 1.531 + * Union the set of all characters that may be modified by this rule 1.532 + * into the given set. 1.533 + */ 1.534 +void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const { 1.535 + int32_t limit = anteContextLength + keyLength; 1.536 + for (int32_t i=anteContextLength; i<limit; ) { 1.537 + UChar32 ch = pattern.char32At(i); 1.538 + i += U16_LENGTH(ch); 1.539 + const UnicodeMatcher* matcher = data->lookupMatcher(ch); 1.540 + if (matcher == NULL) { 1.541 + toUnionTo.add(ch); 1.542 + } else { 1.543 + matcher->addMatchSetTo(toUnionTo); 1.544 + } 1.545 + } 1.546 +} 1.547 + 1.548 +/** 1.549 + * Union the set of all characters that may be emitted by this rule 1.550 + * into the given set. 1.551 + */ 1.552 +void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const { 1.553 + output->toReplacer()->addReplacementSetTo(toUnionTo); 1.554 +} 1.555 + 1.556 +U_NAMESPACE_END 1.557 + 1.558 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.559 + 1.560 +//eof