1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/strrepl.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,327 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (c) 2002-2012, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 01/21/2002 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/uniset.h" 1.19 +#include "unicode/utf16.h" 1.20 +#include "strrepl.h" 1.21 +#include "rbt_data.h" 1.22 +#include "util.h" 1.23 + 1.24 +U_NAMESPACE_BEGIN 1.25 + 1.26 +UnicodeReplacer::~UnicodeReplacer() {} 1.27 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) 1.28 + 1.29 +/** 1.30 + * Construct a StringReplacer that sets the emits the given output 1.31 + * text and sets the cursor to the given position. 1.32 + * @param theOutput text that will replace input text when the 1.33 + * replace() method is called. May contain stand-in characters 1.34 + * that represent nested replacers. 1.35 + * @param theCursorPos cursor position that will be returned by 1.36 + * the replace() method 1.37 + * @param theData transliterator context object that translates 1.38 + * stand-in characters to UnicodeReplacer objects 1.39 + */ 1.40 +StringReplacer::StringReplacer(const UnicodeString& theOutput, 1.41 + int32_t theCursorPos, 1.42 + const TransliterationRuleData* theData) { 1.43 + output = theOutput; 1.44 + cursorPos = theCursorPos; 1.45 + hasCursor = TRUE; 1.46 + data = theData; 1.47 + isComplex = TRUE; 1.48 +} 1.49 + 1.50 +/** 1.51 + * Construct a StringReplacer that sets the emits the given output 1.52 + * text and does not modify the cursor. 1.53 + * @param theOutput text that will replace input text when the 1.54 + * replace() method is called. May contain stand-in characters 1.55 + * that represent nested replacers. 1.56 + * @param theData transliterator context object that translates 1.57 + * stand-in characters to UnicodeReplacer objects 1.58 + */ 1.59 +StringReplacer::StringReplacer(const UnicodeString& theOutput, 1.60 + const TransliterationRuleData* theData) { 1.61 + output = theOutput; 1.62 + cursorPos = 0; 1.63 + hasCursor = FALSE; 1.64 + data = theData; 1.65 + isComplex = TRUE; 1.66 +} 1.67 + 1.68 +/** 1.69 + * Copy constructor. 1.70 + */ 1.71 +StringReplacer::StringReplacer(const StringReplacer& other) : 1.72 + UnicodeFunctor(other), 1.73 + UnicodeReplacer(other) 1.74 +{ 1.75 + output = other.output; 1.76 + cursorPos = other.cursorPos; 1.77 + hasCursor = other.hasCursor; 1.78 + data = other.data; 1.79 + isComplex = other.isComplex; 1.80 +} 1.81 + 1.82 +/** 1.83 + * Destructor 1.84 + */ 1.85 +StringReplacer::~StringReplacer() { 1.86 +} 1.87 + 1.88 +/** 1.89 + * Implement UnicodeFunctor 1.90 + */ 1.91 +UnicodeFunctor* StringReplacer::clone() const { 1.92 + return new StringReplacer(*this); 1.93 +} 1.94 + 1.95 +/** 1.96 + * Implement UnicodeFunctor 1.97 + */ 1.98 +UnicodeReplacer* StringReplacer::toReplacer() const { 1.99 + return const_cast<StringReplacer *>(this); 1.100 +} 1.101 + 1.102 +/** 1.103 + * UnicodeReplacer API 1.104 + */ 1.105 +int32_t StringReplacer::replace(Replaceable& text, 1.106 + int32_t start, 1.107 + int32_t limit, 1.108 + int32_t& cursor) { 1.109 + int32_t outLen; 1.110 + int32_t newStart = 0; 1.111 + 1.112 + // NOTE: It should be possible to _always_ run the complex 1.113 + // processing code; just slower. If not, then there is a bug 1.114 + // in the complex processing code. 1.115 + 1.116 + // Simple (no nested replacers) Processing Code : 1.117 + if (!isComplex) { 1.118 + text.handleReplaceBetween(start, limit, output); 1.119 + outLen = output.length(); 1.120 + 1.121 + // Setup default cursor position (for cursorPos within output) 1.122 + newStart = cursorPos; 1.123 + } 1.124 + 1.125 + // Complex (nested replacers) Processing Code : 1.126 + else { 1.127 + /* When there are segments to be copied, use the Replaceable.copy() 1.128 + * API in order to retain out-of-band data. Copy everything to the 1.129 + * end of the string, then copy them back over the key. This preserves 1.130 + * the integrity of indices into the key and surrounding context while 1.131 + * generating the output text. 1.132 + */ 1.133 + UnicodeString buf; 1.134 + int32_t oOutput; // offset into 'output' 1.135 + isComplex = FALSE; 1.136 + 1.137 + // The temporary buffer starts at tempStart, and extends 1.138 + // to destLimit. The start of the buffer has a single 1.139 + // character from before the key. This provides style 1.140 + // data when addition characters are filled into the 1.141 + // temporary buffer. If there is nothing to the left, use 1.142 + // the non-character U+FFFF, which Replaceable subclasses 1.143 + // should treat specially as a "no-style character." 1.144 + // destStart points to the point after the style context 1.145 + // character, so it is tempStart+1 or tempStart+2. 1.146 + int32_t tempStart = text.length(); // start of temp buffer 1.147 + int32_t destStart = tempStart; // copy new text to here 1.148 + if (start > 0) { 1.149 + int32_t len = U16_LENGTH(text.char32At(start-1)); 1.150 + text.copy(start-len, start, tempStart); 1.151 + destStart += len; 1.152 + } else { 1.153 + UnicodeString str((UChar) 0xFFFF); 1.154 + text.handleReplaceBetween(tempStart, tempStart, str); 1.155 + destStart++; 1.156 + } 1.157 + int32_t destLimit = destStart; 1.158 + 1.159 + for (oOutput=0; oOutput<output.length(); ) { 1.160 + if (oOutput == cursorPos) { 1.161 + // Record the position of the cursor 1.162 + newStart = destLimit - destStart; // relative to start 1.163 + } 1.164 + UChar32 c = output.char32At(oOutput); 1.165 + UnicodeReplacer* r = data->lookupReplacer(c); 1.166 + if (r == NULL) { 1.167 + // Accumulate straight (non-segment) text. 1.168 + buf.append(c); 1.169 + } else { 1.170 + isComplex = TRUE; 1.171 + 1.172 + // Insert any accumulated straight text. 1.173 + if (buf.length() > 0) { 1.174 + text.handleReplaceBetween(destLimit, destLimit, buf); 1.175 + destLimit += buf.length(); 1.176 + buf.truncate(0); 1.177 + } 1.178 + 1.179 + // Delegate output generation to replacer object 1.180 + int32_t len = r->replace(text, destLimit, destLimit, cursor); 1.181 + destLimit += len; 1.182 + } 1.183 + oOutput += U16_LENGTH(c); 1.184 + } 1.185 + // Insert any accumulated straight text. 1.186 + if (buf.length() > 0) { 1.187 + text.handleReplaceBetween(destLimit, destLimit, buf); 1.188 + destLimit += buf.length(); 1.189 + } 1.190 + if (oOutput == cursorPos) { 1.191 + // Record the position of the cursor 1.192 + newStart = destLimit - destStart; // relative to start 1.193 + } 1.194 + 1.195 + outLen = destLimit - destStart; 1.196 + 1.197 + // Copy new text to start, and delete it 1.198 + text.copy(destStart, destLimit, start); 1.199 + text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); 1.200 + 1.201 + // Delete the old text (the key) 1.202 + text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); 1.203 + } 1.204 + 1.205 + if (hasCursor) { 1.206 + // Adjust the cursor for positions outside the key. These 1.207 + // refer to code points rather than code units. If cursorPos 1.208 + // is within the output string, then use newStart, which has 1.209 + // already been set above. 1.210 + if (cursorPos < 0) { 1.211 + newStart = start; 1.212 + int32_t n = cursorPos; 1.213 + // Outside the output string, cursorPos counts code points 1.214 + while (n < 0 && newStart > 0) { 1.215 + newStart -= U16_LENGTH(text.char32At(newStart-1)); 1.216 + ++n; 1.217 + } 1.218 + newStart += n; 1.219 + } else if (cursorPos > output.length()) { 1.220 + newStart = start + outLen; 1.221 + int32_t n = cursorPos - output.length(); 1.222 + // Outside the output string, cursorPos counts code points 1.223 + while (n > 0 && newStart < text.length()) { 1.224 + newStart += U16_LENGTH(text.char32At(newStart)); 1.225 + --n; 1.226 + } 1.227 + newStart += n; 1.228 + } else { 1.229 + // Cursor is within output string. It has been set up above 1.230 + // to be relative to start. 1.231 + newStart += start; 1.232 + } 1.233 + 1.234 + cursor = newStart; 1.235 + } 1.236 + 1.237 + return outLen; 1.238 +} 1.239 + 1.240 +/** 1.241 + * UnicodeReplacer API 1.242 + */ 1.243 +UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, 1.244 + UBool escapeUnprintable) const { 1.245 + rule.truncate(0); 1.246 + UnicodeString quoteBuf; 1.247 + 1.248 + int32_t cursor = cursorPos; 1.249 + 1.250 + // Handle a cursor preceding the output 1.251 + if (hasCursor && cursor < 0) { 1.252 + while (cursor++ < 0) { 1.253 + ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 1.254 + } 1.255 + // Fall through and append '|' below 1.256 + } 1.257 + 1.258 + for (int32_t i=0; i<output.length(); ++i) { 1.259 + if (hasCursor && i == cursor) { 1.260 + ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 1.261 + } 1.262 + UChar c = output.charAt(i); // Ok to use 16-bits here 1.263 + 1.264 + UnicodeReplacer* r = data->lookupReplacer(c); 1.265 + if (r == NULL) { 1.266 + ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); 1.267 + } else { 1.268 + UnicodeString buf; 1.269 + r->toReplacerPattern(buf, escapeUnprintable); 1.270 + buf.insert(0, (UChar)0x20); 1.271 + buf.append((UChar)0x20); 1.272 + ICU_Utility::appendToRule(rule, buf, 1.273 + TRUE, escapeUnprintable, quoteBuf); 1.274 + } 1.275 + } 1.276 + 1.277 + // Handle a cursor after the output. Use > rather than >= because 1.278 + // if cursor == output.length() it is at the end of the output, 1.279 + // which is the default position, so we need not emit it. 1.280 + if (hasCursor && cursor > output.length()) { 1.281 + cursor -= output.length(); 1.282 + while (cursor-- > 0) { 1.283 + ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); 1.284 + } 1.285 + ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); 1.286 + } 1.287 + // Flush quoteBuf out to result 1.288 + ICU_Utility::appendToRule(rule, -1, 1.289 + TRUE, escapeUnprintable, quoteBuf); 1.290 + 1.291 + return rule; 1.292 +} 1.293 + 1.294 +/** 1.295 + * Implement UnicodeReplacer 1.296 + */ 1.297 +void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { 1.298 + UChar32 ch; 1.299 + for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { 1.300 + ch = output.char32At(i); 1.301 + UnicodeReplacer* r = data->lookupReplacer(ch); 1.302 + if (r == NULL) { 1.303 + toUnionTo.add(ch); 1.304 + } else { 1.305 + r->addReplacementSetTo(toUnionTo); 1.306 + } 1.307 + } 1.308 +} 1.309 + 1.310 +/** 1.311 + * UnicodeFunctor API 1.312 + */ 1.313 +void StringReplacer::setData(const TransliterationRuleData* d) { 1.314 + data = d; 1.315 + int32_t i = 0; 1.316 + while (i<output.length()) { 1.317 + UChar32 c = output.char32At(i); 1.318 + UnicodeFunctor* f = data->lookup(c); 1.319 + if (f != NULL) { 1.320 + f->setData(data); 1.321 + } 1.322 + i += U16_LENGTH(c); 1.323 + } 1.324 +} 1.325 + 1.326 +U_NAMESPACE_END 1.327 + 1.328 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.329 + 1.330 +//eof