intl/icu/source/i18n/rbt_rule.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rbt_rule.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,557 @@
     1.4 +/*
     1.5 + **********************************************************************
     1.6 + *   Copyright (C) 1999-2011, International Business Machines
     1.7 + *   Corporation and others.  All Rights Reserved.
     1.8 + **********************************************************************
     1.9 + *   Date        Name        Description
    1.10 + *   11/17/99    aliu        Creation.
    1.11 + **********************************************************************
    1.12 + */
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/rep.h"
    1.19 +#include "unicode/unifilt.h"
    1.20 +#include "unicode/uniset.h"
    1.21 +#include "unicode/utf16.h"
    1.22 +#include "rbt_rule.h"
    1.23 +#include "rbt_data.h"
    1.24 +#include "cmemory.h"
    1.25 +#include "strmatch.h"
    1.26 +#include "strrepl.h"
    1.27 +#include "util.h"
    1.28 +#include "putilimp.h"
    1.29 +
    1.30 +static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "
    1.31 +
    1.32 +U_NAMESPACE_BEGIN
    1.33 +
    1.34 +/**
    1.35 + * Construct a new rule with the given input, output text, and other
    1.36 + * attributes.  A cursor position may be specified for the output text.
    1.37 + * @param input input string, including key and optional ante and
    1.38 + * post context
    1.39 + * @param anteContextPos offset into input to end of ante context, or -1 if
    1.40 + * none.  Must be <= input.length() if not -1.
    1.41 + * @param postContextPos offset into input to start of post context, or -1
    1.42 + * if none.  Must be <= input.length() if not -1, and must be >=
    1.43 + * anteContextPos.
    1.44 + * @param output output string
    1.45 + * @param cursorPosition offset into output at which cursor is located, or -1 if
    1.46 + * none.  If less than zero, then the cursor is placed after the
    1.47 + * <code>output</code>; that is, -1 is equivalent to
    1.48 + * <code>output.length()</code>.  If greater than
    1.49 + * <code>output.length()</code> then an exception is thrown.
    1.50 + * @param segs array of UnicodeFunctors corresponding to input pattern
    1.51 + * segments, or null if there are none.  The array itself is adopted,
    1.52 + * but the pointers within it are not.
    1.53 + * @param segsCount number of elements in segs[]
    1.54 + * @param anchorStart TRUE if the the rule is anchored on the left to
    1.55 + * the context start
    1.56 + * @param anchorEnd TRUE if the rule is anchored on the right to the
    1.57 + * context limit
    1.58 + */
    1.59 +TransliterationRule::TransliterationRule(const UnicodeString& input,
    1.60 +                                         int32_t anteContextPos, int32_t postContextPos,
    1.61 +                                         const UnicodeString& outputStr,
    1.62 +                                         int32_t cursorPosition, int32_t cursorOffset,
    1.63 +                                         UnicodeFunctor** segs,
    1.64 +                                         int32_t segsCount,
    1.65 +                                         UBool anchorStart, UBool anchorEnd,
    1.66 +                                         const TransliterationRuleData* theData,
    1.67 +                                         UErrorCode& status) :
    1.68 +    UMemory(),
    1.69 +    segments(0),
    1.70 +    data(theData) {
    1.71 +
    1.72 +    if (U_FAILURE(status)) {
    1.73 +        return;
    1.74 +    }
    1.75 +    // Do range checks only when warranted to save time
    1.76 +    if (anteContextPos < 0) {
    1.77 +        anteContextLength = 0;
    1.78 +    } else {
    1.79 +        if (anteContextPos > input.length()) {
    1.80 +            // throw new IllegalArgumentException("Invalid ante context");
    1.81 +            status = U_ILLEGAL_ARGUMENT_ERROR;
    1.82 +            return;
    1.83 +        }
    1.84 +        anteContextLength = anteContextPos;
    1.85 +    }
    1.86 +    if (postContextPos < 0) {
    1.87 +        keyLength = input.length() - anteContextLength;
    1.88 +    } else {
    1.89 +        if (postContextPos < anteContextLength ||
    1.90 +            postContextPos > input.length()) {
    1.91 +            // throw new IllegalArgumentException("Invalid post context");
    1.92 +            status = U_ILLEGAL_ARGUMENT_ERROR;
    1.93 +            return;
    1.94 +        }
    1.95 +        keyLength = postContextPos - anteContextLength;
    1.96 +    }
    1.97 +    if (cursorPosition < 0) {
    1.98 +        cursorPosition = outputStr.length();
    1.99 +    } else if (cursorPosition > outputStr.length()) {
   1.100 +        // throw new IllegalArgumentException("Invalid cursor position");
   1.101 +        status = U_ILLEGAL_ARGUMENT_ERROR;
   1.102 +        return;
   1.103 +    }
   1.104 +    // We don't validate the segments array.  The caller must
   1.105 +    // guarantee that the segments are well-formed (that is, that
   1.106 +    // all $n references in the output refer to indices of this
   1.107 +    // array, and that no array elements are null).
   1.108 +    this->segments = segs;
   1.109 +    this->segmentsCount = segsCount;
   1.110 +
   1.111 +    pattern = input;
   1.112 +    flags = 0;
   1.113 +    if (anchorStart) {
   1.114 +        flags |= ANCHOR_START;
   1.115 +    }
   1.116 +    if (anchorEnd) {
   1.117 +        flags |= ANCHOR_END;
   1.118 +    }
   1.119 +
   1.120 +    anteContext = NULL;
   1.121 +    if (anteContextLength > 0) {
   1.122 +        anteContext = new StringMatcher(pattern, 0, anteContextLength,
   1.123 +                                        FALSE, *data);
   1.124 +        /* test for NULL */
   1.125 +        if (anteContext == 0) {
   1.126 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.127 +            return;
   1.128 +        }
   1.129 +    }
   1.130 +    
   1.131 +    key = NULL;
   1.132 +    if (keyLength > 0) {
   1.133 +        key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,
   1.134 +                                FALSE, *data);
   1.135 +        /* test for NULL */
   1.136 +        if (key == 0) {
   1.137 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.138 +            return;
   1.139 +        }
   1.140 +    }
   1.141 +    
   1.142 +    int32_t postContextLength = pattern.length() - keyLength - anteContextLength;
   1.143 +    postContext = NULL;
   1.144 +    if (postContextLength > 0) {
   1.145 +        postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),
   1.146 +                                        FALSE, *data);
   1.147 +        /* test for NULL */
   1.148 +        if (postContext == 0) {
   1.149 +            status = U_MEMORY_ALLOCATION_ERROR;
   1.150 +            return;
   1.151 +        }
   1.152 +    }
   1.153 +
   1.154 +    this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);
   1.155 +    /* test for NULL */
   1.156 +    if (this->output == 0) {
   1.157 +        status = U_MEMORY_ALLOCATION_ERROR;
   1.158 +        return;
   1.159 +    }
   1.160 +}
   1.161 +
   1.162 +/**
   1.163 + * Copy constructor.
   1.164 + */
   1.165 +TransliterationRule::TransliterationRule(TransliterationRule& other) :
   1.166 +    UMemory(other),
   1.167 +    anteContext(NULL),
   1.168 +    key(NULL),
   1.169 +    postContext(NULL),
   1.170 +    pattern(other.pattern),
   1.171 +    anteContextLength(other.anteContextLength),
   1.172 +    keyLength(other.keyLength),
   1.173 +    flags(other.flags),
   1.174 +    data(other.data) {
   1.175 +
   1.176 +    segments = NULL;
   1.177 +    segmentsCount = 0;
   1.178 +    if (other.segmentsCount > 0) {
   1.179 +        segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));
   1.180 +        uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
   1.181 +    }
   1.182 +
   1.183 +    if (other.anteContext != NULL) {
   1.184 +        anteContext = (StringMatcher*) other.anteContext->clone();
   1.185 +    }
   1.186 +    if (other.key != NULL) {
   1.187 +        key = (StringMatcher*) other.key->clone();
   1.188 +    }
   1.189 +    if (other.postContext != NULL) {
   1.190 +        postContext = (StringMatcher*) other.postContext->clone();
   1.191 +    }
   1.192 +    output = other.output->clone();
   1.193 +}
   1.194 +
   1.195 +TransliterationRule::~TransliterationRule() {
   1.196 +    uprv_free(segments);
   1.197 +    delete anteContext;
   1.198 +    delete key;
   1.199 +    delete postContext;
   1.200 +    delete output;
   1.201 +}
   1.202 +
   1.203 +/**
   1.204 + * Return the preceding context length.  This method is needed to
   1.205 + * support the <code>Transliterator</code> method
   1.206 + * <code>getMaximumContextLength()</code>.  Internally, this is
   1.207 + * implemented as the anteContextLength, optionally plus one if
   1.208 + * there is a start anchor.  The one character anchor gap is
   1.209 + * needed to make repeated incremental transliteration with
   1.210 + * anchors work.
   1.211 + */
   1.212 +int32_t TransliterationRule::getContextLength(void) const {
   1.213 +    return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
   1.214 +}
   1.215 +
   1.216 +/**
   1.217 + * Internal method.  Returns 8-bit index value for this rule.
   1.218 + * This is the low byte of the first character of the key,
   1.219 + * unless the first character of the key is a set.  If it's a
   1.220 + * set, or otherwise can match multiple keys, the index value is -1.
   1.221 + */
   1.222 +int16_t TransliterationRule::getIndexValue() const {
   1.223 +    if (anteContextLength == pattern.length()) {
   1.224 +        // A pattern with just ante context {such as foo)>bar} can
   1.225 +        // match any key.
   1.226 +        return -1;
   1.227 +    }
   1.228 +    UChar32 c = pattern.char32At(anteContextLength);
   1.229 +    return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);
   1.230 +}
   1.231 +
   1.232 +/**
   1.233 + * Internal method.  Returns true if this rule matches the given
   1.234 + * index value.  The index value is an 8-bit integer, 0..255,
   1.235 + * representing the low byte of the first character of the key.
   1.236 + * It matches this rule if it matches the first character of the
   1.237 + * key, or if the first character of the key is a set, and the set
   1.238 + * contains any character with a low byte equal to the index
   1.239 + * value.  If the rule contains only ante context, as in foo)>bar,
   1.240 + * then it will match any key.
   1.241 + */
   1.242 +UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
   1.243 +    // Delegate to the key, or if there is none, to the postContext.
   1.244 +    // If there is neither then we match any key; return true.
   1.245 +    UnicodeMatcher *m = (key != NULL) ? key : postContext;
   1.246 +    return (m != NULL) ? m->matchesIndexValue(v) : TRUE;
   1.247 +}
   1.248 +
   1.249 +/**
   1.250 + * Return true if this rule masks another rule.  If r1 masks r2 then
   1.251 + * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
   1.252 + * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
   1.253 + * "[c]a>x" masks "[dc]a>y".
   1.254 + */
   1.255 +UBool TransliterationRule::masks(const TransliterationRule& r2) const {
   1.256 +    /* Rule r1 masks rule r2 if the string formed of the
   1.257 +     * antecontext, key, and postcontext overlaps in the following
   1.258 +     * way:
   1.259 +     *
   1.260 +     * r1:      aakkkpppp
   1.261 +     * r2:     aaakkkkkpppp
   1.262 +     *            ^
   1.263 +     * 
   1.264 +     * The strings must be aligned at the first character of the
   1.265 +     * key.  The length of r1 to the left of the alignment point
   1.266 +     * must be <= the length of r2 to the left; ditto for the
   1.267 +     * right.  The characters of r1 must equal (or be a superset
   1.268 +     * of) the corresponding characters of r2.  The superset
   1.269 +     * operation should be performed to check for UnicodeSet
   1.270 +     * masking.
   1.271 +     *
   1.272 +     * Anchors:  Two patterns that differ only in anchors only
   1.273 +     * mask one another if they are exactly equal, and r2 has
   1.274 +     * all the anchors r1 has (optionally, plus some).  Here Y
   1.275 +     * means the row masks the column, N means it doesn't.
   1.276 +     *
   1.277 +     *         ab   ^ab    ab$  ^ab$
   1.278 +     *   ab    Y     Y     Y     Y
   1.279 +     *  ^ab    N     Y     N     Y
   1.280 +     *   ab$   N     N     Y     Y
   1.281 +     *  ^ab$   N     N     N     Y
   1.282 +     *
   1.283 +     * Post context: {a}b masks ab, but not vice versa, since {a}b
   1.284 +     * matches everything ab matches, and {a}b matches {|a|}b but ab
   1.285 +     * does not.  Pre context is different (a{b} does not align with
   1.286 +     * ab).
   1.287 +     */
   1.288 +
   1.289 +    /* LIMITATION of the current mask algorithm: Some rule
   1.290 +     * maskings are currently not detected.  For example,
   1.291 +     * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO
   1.292 +     */
   1.293 +
   1.294 +    int32_t len = pattern.length();
   1.295 +    int32_t left = anteContextLength;
   1.296 +    int32_t left2 = r2.anteContextLength;
   1.297 +    int32_t right = len - left;
   1.298 +    int32_t right2 = r2.pattern.length() - left2;
   1.299 +    int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);
   1.300 +
   1.301 +    // TODO Clean this up -- some logic might be combinable with the
   1.302 +    // next statement.
   1.303 +
   1.304 +    // Test for anchor masking
   1.305 +    if (left == left2 && right == right2 &&
   1.306 +        keyLength <= r2.keyLength &&
   1.307 +        0 == cachedCompare) {
   1.308 +        // The following boolean logic implements the table above
   1.309 +        return (flags == r2.flags) ||
   1.310 +            (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
   1.311 +            ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
   1.312 +    }
   1.313 +
   1.314 +    return left <= left2 &&
   1.315 +        (right < right2 ||
   1.316 +         (right == right2 && keyLength <= r2.keyLength)) &&
   1.317 +         (0 == cachedCompare);
   1.318 +}
   1.319 +
   1.320 +static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
   1.321 +    return (pos > 0) ?
   1.322 +        pos - U16_LENGTH(str.char32At(pos-1)) :
   1.323 +        pos - 1;
   1.324 +}
   1.325 +
   1.326 +static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
   1.327 +    return (pos >= 0 && pos < str.length()) ?
   1.328 +        pos + U16_LENGTH(str.char32At(pos)) :
   1.329 +        pos + 1;
   1.330 +}
   1.331 +
   1.332 +/**
   1.333 + * Attempt a match and replacement at the given position.  Return
   1.334 + * the degree of match between this rule and the given text.  The
   1.335 + * degree of match may be mismatch, a partial match, or a full
   1.336 + * match.  A mismatch means at least one character of the text
   1.337 + * does not match the context or key.  A partial match means some
   1.338 + * context and key characters match, but the text is not long
   1.339 + * enough to match all of them.  A full match means all context
   1.340 + * and key characters match.
   1.341 + * 
   1.342 + * If a full match is obtained, perform a replacement, update pos,
   1.343 + * and return U_MATCH.  Otherwise both text and pos are unchanged.
   1.344 + * 
   1.345 + * @param text the text
   1.346 + * @param pos the position indices
   1.347 + * @param incremental if TRUE, test for partial matches that may
   1.348 + * be completed by additional text inserted at pos.limit.
   1.349 + * @return one of <code>U_MISMATCH</code>,
   1.350 + * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
   1.351 + * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
   1.352 + */
   1.353 +UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
   1.354 +                                                  UTransPosition& pos,
   1.355 +                                                  UBool incremental) const {
   1.356 +    // Matching and replacing are done in one method because the
   1.357 +    // replacement operation needs information obtained during the
   1.358 +    // match.  Another way to do this is to have the match method
   1.359 +    // create a match result struct with relevant offsets, and to pass
   1.360 +    // this into the replace method.
   1.361 +
   1.362 +    // ============================ MATCH ===========================
   1.363 +
   1.364 +    // Reset segment match data
   1.365 +    if (segments != NULL) {
   1.366 +        for (int32_t i=0; i<segmentsCount; ++i) {
   1.367 +            ((StringMatcher*) segments[i])->resetMatch();
   1.368 +        }
   1.369 +    }
   1.370 +
   1.371 +//    int32_t lenDelta, keyLimit;
   1.372 +    int32_t keyLimit;
   1.373 +
   1.374 +    // ------------------------ Ante Context ------------------------
   1.375 +
   1.376 +    // A mismatch in the ante context, or with the start anchor,
   1.377 +    // is an outright U_MISMATCH regardless of whether we are
   1.378 +    // incremental or not.
   1.379 +    int32_t oText; // offset into 'text'
   1.380 +//    int32_t newStart = 0;
   1.381 +    int32_t minOText;
   1.382 +
   1.383 +    // Note (1): We process text in 16-bit code units, rather than
   1.384 +    // 32-bit code points.  This works because stand-ins are
   1.385 +    // always in the BMP and because we are doing a literal match
   1.386 +    // operation, which can be done 16-bits at a time.
   1.387 +    
   1.388 +    int32_t anteLimit = posBefore(text, pos.contextStart);
   1.389 +
   1.390 +    UMatchDegree match;
   1.391 +
   1.392 +    // Start reverse match at char before pos.start
   1.393 +    oText = posBefore(text, pos.start);
   1.394 +
   1.395 +    if (anteContext != NULL) {
   1.396 +        match = anteContext->matches(text, oText, anteLimit, FALSE);
   1.397 +        if (match != U_MATCH) {
   1.398 +            return U_MISMATCH;
   1.399 +        }
   1.400 +    }
   1.401 +
   1.402 +    minOText = posAfter(text, oText);
   1.403 +
   1.404 +    // ------------------------ Start Anchor ------------------------
   1.405 +    
   1.406 +    if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
   1.407 +        return U_MISMATCH;
   1.408 +    }
   1.409 +
   1.410 +    // -------------------- Key and Post Context --------------------
   1.411 +    
   1.412 +    oText = pos.start;
   1.413 +
   1.414 +    if (key != NULL) {
   1.415 +        match = key->matches(text, oText, pos.limit, incremental);
   1.416 +        if (match != U_MATCH) {
   1.417 +            return match;
   1.418 +        }
   1.419 +    }
   1.420 +
   1.421 +    keyLimit = oText;
   1.422 +
   1.423 +    if (postContext != NULL) {
   1.424 +        if (incremental && keyLimit == pos.limit) {
   1.425 +            // The key matches just before pos.limit, and there is
   1.426 +            // a postContext.  Since we are in incremental mode,
   1.427 +            // we must assume more characters may be inserted at
   1.428 +            // pos.limit -- this is a partial match.
   1.429 +            return U_PARTIAL_MATCH;
   1.430 +        }
   1.431 +
   1.432 +        match = postContext->matches(text, oText, pos.contextLimit, incremental);
   1.433 +        if (match != U_MATCH) {
   1.434 +            return match;
   1.435 +        }
   1.436 +    }
   1.437 +    
   1.438 +    // ------------------------- Stop Anchor ------------------------
   1.439 +    
   1.440 +    if (((flags & ANCHOR_END)) != 0) {
   1.441 +        if (oText != pos.contextLimit) {
   1.442 +            return U_MISMATCH;
   1.443 +        }
   1.444 +        if (incremental) {
   1.445 +            return U_PARTIAL_MATCH;
   1.446 +        }
   1.447 +    }
   1.448 +    
   1.449 +    // =========================== REPLACE ==========================
   1.450 +
   1.451 +    // We have a full match.  The key is between pos.start and
   1.452 +    // keyLimit.
   1.453 +
   1.454 +    int32_t newStart;
   1.455 +    int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);
   1.456 +    int32_t lenDelta = newLength - (keyLimit - pos.start);
   1.457 +
   1.458 +    oText += lenDelta;
   1.459 +    pos.limit += lenDelta;
   1.460 +    pos.contextLimit += lenDelta;
   1.461 +    // Restrict new value of start to [minOText, min(oText, pos.limit)].
   1.462 +    pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
   1.463 +    return U_MATCH;
   1.464 +}
   1.465 +
   1.466 +/**
   1.467 + * Create a source string that represents this rule.  Append it to the
   1.468 + * given string.
   1.469 + */
   1.470 +UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
   1.471 +                                           UBool escapeUnprintable) const {
   1.472 +
   1.473 +    // Accumulate special characters (and non-specials following them)
   1.474 +    // into quoteBuf.  Append quoteBuf, within single quotes, when
   1.475 +    // a non-quoted element must be inserted.
   1.476 +    UnicodeString str, quoteBuf;
   1.477 +
   1.478 +    // Do not emit the braces '{' '}' around the pattern if there
   1.479 +    // is neither anteContext nor postContext.
   1.480 +    UBool emitBraces =
   1.481 +        (anteContext != NULL) || (postContext != NULL);
   1.482 +
   1.483 +    // Emit start anchor
   1.484 +    if ((flags & ANCHOR_START) != 0) {
   1.485 +        rule.append((UChar)94/*^*/);
   1.486 +    }
   1.487 +
   1.488 +    // Emit the input pattern
   1.489 +    ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
   1.490 +
   1.491 +    if (emitBraces) {
   1.492 +        ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
   1.493 +    }
   1.494 +
   1.495 +    ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);
   1.496 +
   1.497 +    if (emitBraces) {
   1.498 +        ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
   1.499 +    }
   1.500 +
   1.501 +    ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
   1.502 +
   1.503 +    // Emit end anchor
   1.504 +    if ((flags & ANCHOR_END) != 0) {
   1.505 +        rule.append((UChar)36/*$*/);
   1.506 +    }
   1.507 +
   1.508 +    ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf);
   1.509 +
   1.510 +    // Emit the output pattern
   1.511 +
   1.512 +    ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),
   1.513 +                              TRUE, escapeUnprintable, quoteBuf);
   1.514 +
   1.515 +    ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
   1.516 +
   1.517 +    return rule;
   1.518 +}
   1.519 +
   1.520 +void TransliterationRule::setData(const TransliterationRuleData* d) {
   1.521 +    data = d;
   1.522 +    if (anteContext != NULL) anteContext->setData(d);
   1.523 +    if (postContext != NULL) postContext->setData(d);
   1.524 +    if (key != NULL) key->setData(d);
   1.525 +    // assert(output != NULL);
   1.526 +    output->setData(d);
   1.527 +    // Don't have to do segments since they are in the context or key
   1.528 +}
   1.529 +
   1.530 +/**
   1.531 + * Union the set of all characters that may be modified by this rule
   1.532 + * into the given set.
   1.533 + */
   1.534 +void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
   1.535 +    int32_t limit = anteContextLength + keyLength;
   1.536 +    for (int32_t i=anteContextLength; i<limit; ) {
   1.537 +        UChar32 ch = pattern.char32At(i);
   1.538 +        i += U16_LENGTH(ch);
   1.539 +        const UnicodeMatcher* matcher = data->lookupMatcher(ch);
   1.540 +        if (matcher == NULL) {
   1.541 +            toUnionTo.add(ch);
   1.542 +        } else {
   1.543 +            matcher->addMatchSetTo(toUnionTo);
   1.544 +        }
   1.545 +    }
   1.546 +}
   1.547 +
   1.548 +/**
   1.549 + * Union the set of all characters that may be emitted by this rule
   1.550 + * into the given set.
   1.551 + */
   1.552 +void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
   1.553 +    output->toReplacer()->addReplacementSetTo(toUnionTo);
   1.554 +}
   1.555 +
   1.556 +U_NAMESPACE_END
   1.557 +
   1.558 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.559 +
   1.560 +//eof

mercurial