intl/icu/source/i18n/strrepl.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/strrepl.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,327 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (c) 2002-2012, International Business Machines Corporation
     1.7 +*   and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   01/21/2002  aliu        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/uniset.h"
    1.19 +#include "unicode/utf16.h"
    1.20 +#include "strrepl.h"
    1.21 +#include "rbt_data.h"
    1.22 +#include "util.h"
    1.23 +
    1.24 +U_NAMESPACE_BEGIN
    1.25 +
    1.26 +UnicodeReplacer::~UnicodeReplacer() {}
    1.27 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
    1.28 +
    1.29 +/**
    1.30 + * Construct a StringReplacer that sets the emits the given output
    1.31 + * text and sets the cursor to the given position.
    1.32 + * @param theOutput text that will replace input text when the
    1.33 + * replace() method is called.  May contain stand-in characters
    1.34 + * that represent nested replacers.
    1.35 + * @param theCursorPos cursor position that will be returned by
    1.36 + * the replace() method
    1.37 + * @param theData transliterator context object that translates
    1.38 + * stand-in characters to UnicodeReplacer objects
    1.39 + */
    1.40 +StringReplacer::StringReplacer(const UnicodeString& theOutput,
    1.41 +                               int32_t theCursorPos,
    1.42 +                               const TransliterationRuleData* theData) {
    1.43 +    output = theOutput;
    1.44 +    cursorPos = theCursorPos;
    1.45 +    hasCursor = TRUE;
    1.46 +    data = theData;
    1.47 +    isComplex = TRUE;
    1.48 +}
    1.49 +
    1.50 +/**
    1.51 + * Construct a StringReplacer that sets the emits the given output
    1.52 + * text and does not modify the cursor.
    1.53 + * @param theOutput text that will replace input text when the
    1.54 + * replace() method is called.  May contain stand-in characters
    1.55 + * that represent nested replacers.
    1.56 + * @param theData transliterator context object that translates
    1.57 + * stand-in characters to UnicodeReplacer objects
    1.58 + */
    1.59 +StringReplacer::StringReplacer(const UnicodeString& theOutput,
    1.60 +                               const TransliterationRuleData* theData) {
    1.61 +    output = theOutput;
    1.62 +    cursorPos = 0;
    1.63 +    hasCursor = FALSE;
    1.64 +    data = theData;
    1.65 +    isComplex = TRUE;
    1.66 +}
    1.67 +
    1.68 +/**
    1.69 + * Copy constructor.
    1.70 + */
    1.71 +StringReplacer::StringReplacer(const StringReplacer& other) :
    1.72 +    UnicodeFunctor(other),
    1.73 +    UnicodeReplacer(other)
    1.74 +{
    1.75 +    output = other.output;
    1.76 +    cursorPos = other.cursorPos;
    1.77 +    hasCursor = other.hasCursor;
    1.78 +    data = other.data;
    1.79 +    isComplex = other.isComplex;
    1.80 +}
    1.81 +
    1.82 +/**
    1.83 + * Destructor
    1.84 + */
    1.85 +StringReplacer::~StringReplacer() {
    1.86 +}
    1.87 +
    1.88 +/**
    1.89 + * Implement UnicodeFunctor
    1.90 + */
    1.91 +UnicodeFunctor* StringReplacer::clone() const {
    1.92 +    return new StringReplacer(*this);
    1.93 +}
    1.94 +
    1.95 +/**
    1.96 + * Implement UnicodeFunctor
    1.97 + */
    1.98 +UnicodeReplacer* StringReplacer::toReplacer() const {
    1.99 +  return const_cast<StringReplacer *>(this);
   1.100 +}
   1.101 +
   1.102 +/**
   1.103 + * UnicodeReplacer API
   1.104 + */
   1.105 +int32_t StringReplacer::replace(Replaceable& text,
   1.106 +                                int32_t start,
   1.107 +                                int32_t limit,
   1.108 +                                int32_t& cursor) {
   1.109 +    int32_t outLen;
   1.110 +    int32_t newStart = 0;
   1.111 +
   1.112 +    // NOTE: It should be possible to _always_ run the complex
   1.113 +    // processing code; just slower.  If not, then there is a bug
   1.114 +    // in the complex processing code.
   1.115 +
   1.116 +    // Simple (no nested replacers) Processing Code :
   1.117 +    if (!isComplex) {
   1.118 +        text.handleReplaceBetween(start, limit, output);
   1.119 +        outLen = output.length();
   1.120 +
   1.121 +        // Setup default cursor position (for cursorPos within output)
   1.122 +        newStart = cursorPos;
   1.123 +    }
   1.124 +
   1.125 +    // Complex (nested replacers) Processing Code :
   1.126 +    else {
   1.127 +        /* When there are segments to be copied, use the Replaceable.copy()
   1.128 +         * API in order to retain out-of-band data.  Copy everything to the
   1.129 +         * end of the string, then copy them back over the key.  This preserves
   1.130 +         * the integrity of indices into the key and surrounding context while
   1.131 +         * generating the output text.
   1.132 +         */
   1.133 +        UnicodeString buf;
   1.134 +        int32_t oOutput; // offset into 'output'
   1.135 +        isComplex = FALSE;
   1.136 +
   1.137 +        // The temporary buffer starts at tempStart, and extends
   1.138 +        // to destLimit.  The start of the buffer has a single
   1.139 +        // character from before the key.  This provides style
   1.140 +        // data when addition characters are filled into the
   1.141 +        // temporary buffer.  If there is nothing to the left, use
   1.142 +        // the non-character U+FFFF, which Replaceable subclasses
   1.143 +        // should treat specially as a "no-style character."
   1.144 +        // destStart points to the point after the style context
   1.145 +        // character, so it is tempStart+1 or tempStart+2.
   1.146 +        int32_t tempStart = text.length(); // start of temp buffer
   1.147 +        int32_t destStart = tempStart; // copy new text to here
   1.148 +        if (start > 0) {
   1.149 +            int32_t len = U16_LENGTH(text.char32At(start-1));
   1.150 +            text.copy(start-len, start, tempStart);
   1.151 +            destStart += len;
   1.152 +        } else {
   1.153 +            UnicodeString str((UChar) 0xFFFF);
   1.154 +            text.handleReplaceBetween(tempStart, tempStart, str);
   1.155 +            destStart++;
   1.156 +        }
   1.157 +        int32_t destLimit = destStart;
   1.158 +
   1.159 +        for (oOutput=0; oOutput<output.length(); ) {
   1.160 +            if (oOutput == cursorPos) {
   1.161 +                // Record the position of the cursor
   1.162 +                newStart = destLimit - destStart; // relative to start
   1.163 +            }
   1.164 +            UChar32 c = output.char32At(oOutput);
   1.165 +            UnicodeReplacer* r = data->lookupReplacer(c);
   1.166 +            if (r == NULL) {
   1.167 +                // Accumulate straight (non-segment) text.
   1.168 +                buf.append(c);
   1.169 +            } else {
   1.170 +                isComplex = TRUE;
   1.171 +
   1.172 +                // Insert any accumulated straight text.
   1.173 +                if (buf.length() > 0) {
   1.174 +                    text.handleReplaceBetween(destLimit, destLimit, buf);
   1.175 +                    destLimit += buf.length();
   1.176 +                    buf.truncate(0);
   1.177 +                }
   1.178 +
   1.179 +                // Delegate output generation to replacer object
   1.180 +                int32_t len = r->replace(text, destLimit, destLimit, cursor);
   1.181 +                destLimit += len;
   1.182 +            }
   1.183 +            oOutput += U16_LENGTH(c);
   1.184 +        }
   1.185 +        // Insert any accumulated straight text.
   1.186 +        if (buf.length() > 0) {
   1.187 +            text.handleReplaceBetween(destLimit, destLimit, buf);
   1.188 +            destLimit += buf.length();
   1.189 +        }
   1.190 +        if (oOutput == cursorPos) {
   1.191 +            // Record the position of the cursor
   1.192 +            newStart = destLimit - destStart; // relative to start
   1.193 +        }
   1.194 +
   1.195 +        outLen = destLimit - destStart;
   1.196 +
   1.197 +        // Copy new text to start, and delete it
   1.198 +        text.copy(destStart, destLimit, start);
   1.199 +        text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
   1.200 +
   1.201 +        // Delete the old text (the key)
   1.202 +        text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
   1.203 +    }        
   1.204 +
   1.205 +    if (hasCursor) {
   1.206 +        // Adjust the cursor for positions outside the key.  These
   1.207 +        // refer to code points rather than code units.  If cursorPos
   1.208 +        // is within the output string, then use newStart, which has
   1.209 +        // already been set above.
   1.210 +        if (cursorPos < 0) {
   1.211 +            newStart = start;
   1.212 +            int32_t n = cursorPos;
   1.213 +            // Outside the output string, cursorPos counts code points
   1.214 +            while (n < 0 && newStart > 0) {
   1.215 +                newStart -= U16_LENGTH(text.char32At(newStart-1));
   1.216 +                ++n;
   1.217 +            }
   1.218 +            newStart += n;
   1.219 +        } else if (cursorPos > output.length()) {
   1.220 +            newStart = start + outLen;
   1.221 +            int32_t n = cursorPos - output.length();
   1.222 +            // Outside the output string, cursorPos counts code points
   1.223 +            while (n > 0 && newStart < text.length()) {
   1.224 +                newStart += U16_LENGTH(text.char32At(newStart));
   1.225 +                --n;
   1.226 +            }
   1.227 +            newStart += n;
   1.228 +        } else {
   1.229 +            // Cursor is within output string.  It has been set up above
   1.230 +            // to be relative to start.
   1.231 +            newStart += start;
   1.232 +        }
   1.233 +
   1.234 +        cursor = newStart;
   1.235 +    }
   1.236 +
   1.237 +    return outLen;
   1.238 +}
   1.239 +
   1.240 +/**
   1.241 + * UnicodeReplacer API
   1.242 + */
   1.243 +UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
   1.244 +                                                 UBool escapeUnprintable) const {
   1.245 +    rule.truncate(0);
   1.246 +    UnicodeString quoteBuf;
   1.247 +
   1.248 +    int32_t cursor = cursorPos;
   1.249 +
   1.250 +    // Handle a cursor preceding the output
   1.251 +    if (hasCursor && cursor < 0) {
   1.252 +        while (cursor++ < 0) {
   1.253 +            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
   1.254 +        }
   1.255 +        // Fall through and append '|' below
   1.256 +    }
   1.257 +
   1.258 +    for (int32_t i=0; i<output.length(); ++i) {
   1.259 +        if (hasCursor && i == cursor) {
   1.260 +            ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
   1.261 +        }
   1.262 +        UChar c = output.charAt(i); // Ok to use 16-bits here
   1.263 +
   1.264 +        UnicodeReplacer* r = data->lookupReplacer(c);
   1.265 +        if (r == NULL) {
   1.266 +            ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
   1.267 +        } else {
   1.268 +            UnicodeString buf;
   1.269 +            r->toReplacerPattern(buf, escapeUnprintable);
   1.270 +            buf.insert(0, (UChar)0x20);
   1.271 +            buf.append((UChar)0x20);
   1.272 +            ICU_Utility::appendToRule(rule, buf,
   1.273 +                                      TRUE, escapeUnprintable, quoteBuf);
   1.274 +        }
   1.275 +    }
   1.276 +
   1.277 +    // Handle a cursor after the output.  Use > rather than >= because
   1.278 +    // if cursor == output.length() it is at the end of the output,
   1.279 +    // which is the default position, so we need not emit it.
   1.280 +    if (hasCursor && cursor > output.length()) {
   1.281 +        cursor -= output.length();
   1.282 +        while (cursor-- > 0) {
   1.283 +            ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
   1.284 +        }
   1.285 +        ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
   1.286 +    }
   1.287 +    // Flush quoteBuf out to result
   1.288 +    ICU_Utility::appendToRule(rule, -1,
   1.289 +                              TRUE, escapeUnprintable, quoteBuf);
   1.290 +
   1.291 +    return rule;
   1.292 +}
   1.293 +
   1.294 +/**
   1.295 + * Implement UnicodeReplacer
   1.296 + */
   1.297 +void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
   1.298 +    UChar32 ch;
   1.299 +    for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
   1.300 +    ch = output.char32At(i);
   1.301 +    UnicodeReplacer* r = data->lookupReplacer(ch);
   1.302 +    if (r == NULL) {
   1.303 +        toUnionTo.add(ch);
   1.304 +    } else {
   1.305 +        r->addReplacementSetTo(toUnionTo);
   1.306 +    }
   1.307 +    }
   1.308 +}
   1.309 +
   1.310 +/**
   1.311 + * UnicodeFunctor API
   1.312 + */
   1.313 +void StringReplacer::setData(const TransliterationRuleData* d) {
   1.314 +    data = d;
   1.315 +    int32_t i = 0;
   1.316 +    while (i<output.length()) {
   1.317 +        UChar32 c = output.char32At(i);
   1.318 +        UnicodeFunctor* f = data->lookup(c);
   1.319 +        if (f != NULL) {
   1.320 +            f->setData(data);
   1.321 +        }
   1.322 +        i += U16_LENGTH(c);
   1.323 +    }
   1.324 +}
   1.325 +
   1.326 +U_NAMESPACE_END
   1.327 +
   1.328 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.329 +
   1.330 +//eof

mercurial