intl/icu/source/i18n/strrepl.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (c) 2002-2012, International Business Machines Corporation
michael@0 4 * and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 01/21/2002 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "unicode/uniset.h"
michael@0 16 #include "unicode/utf16.h"
michael@0 17 #include "strrepl.h"
michael@0 18 #include "rbt_data.h"
michael@0 19 #include "util.h"
michael@0 20
michael@0 21 U_NAMESPACE_BEGIN
michael@0 22
michael@0 23 UnicodeReplacer::~UnicodeReplacer() {}
michael@0 24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
michael@0 25
michael@0 26 /**
michael@0 27 * Construct a StringReplacer that sets the emits the given output
michael@0 28 * text and sets the cursor to the given position.
michael@0 29 * @param theOutput text that will replace input text when the
michael@0 30 * replace() method is called. May contain stand-in characters
michael@0 31 * that represent nested replacers.
michael@0 32 * @param theCursorPos cursor position that will be returned by
michael@0 33 * the replace() method
michael@0 34 * @param theData transliterator context object that translates
michael@0 35 * stand-in characters to UnicodeReplacer objects
michael@0 36 */
michael@0 37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
michael@0 38 int32_t theCursorPos,
michael@0 39 const TransliterationRuleData* theData) {
michael@0 40 output = theOutput;
michael@0 41 cursorPos = theCursorPos;
michael@0 42 hasCursor = TRUE;
michael@0 43 data = theData;
michael@0 44 isComplex = TRUE;
michael@0 45 }
michael@0 46
michael@0 47 /**
michael@0 48 * Construct a StringReplacer that sets the emits the given output
michael@0 49 * text and does not modify the cursor.
michael@0 50 * @param theOutput text that will replace input text when the
michael@0 51 * replace() method is called. May contain stand-in characters
michael@0 52 * that represent nested replacers.
michael@0 53 * @param theData transliterator context object that translates
michael@0 54 * stand-in characters to UnicodeReplacer objects
michael@0 55 */
michael@0 56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
michael@0 57 const TransliterationRuleData* theData) {
michael@0 58 output = theOutput;
michael@0 59 cursorPos = 0;
michael@0 60 hasCursor = FALSE;
michael@0 61 data = theData;
michael@0 62 isComplex = TRUE;
michael@0 63 }
michael@0 64
michael@0 65 /**
michael@0 66 * Copy constructor.
michael@0 67 */
michael@0 68 StringReplacer::StringReplacer(const StringReplacer& other) :
michael@0 69 UnicodeFunctor(other),
michael@0 70 UnicodeReplacer(other)
michael@0 71 {
michael@0 72 output = other.output;
michael@0 73 cursorPos = other.cursorPos;
michael@0 74 hasCursor = other.hasCursor;
michael@0 75 data = other.data;
michael@0 76 isComplex = other.isComplex;
michael@0 77 }
michael@0 78
michael@0 79 /**
michael@0 80 * Destructor
michael@0 81 */
michael@0 82 StringReplacer::~StringReplacer() {
michael@0 83 }
michael@0 84
michael@0 85 /**
michael@0 86 * Implement UnicodeFunctor
michael@0 87 */
michael@0 88 UnicodeFunctor* StringReplacer::clone() const {
michael@0 89 return new StringReplacer(*this);
michael@0 90 }
michael@0 91
michael@0 92 /**
michael@0 93 * Implement UnicodeFunctor
michael@0 94 */
michael@0 95 UnicodeReplacer* StringReplacer::toReplacer() const {
michael@0 96 return const_cast<StringReplacer *>(this);
michael@0 97 }
michael@0 98
michael@0 99 /**
michael@0 100 * UnicodeReplacer API
michael@0 101 */
michael@0 102 int32_t StringReplacer::replace(Replaceable& text,
michael@0 103 int32_t start,
michael@0 104 int32_t limit,
michael@0 105 int32_t& cursor) {
michael@0 106 int32_t outLen;
michael@0 107 int32_t newStart = 0;
michael@0 108
michael@0 109 // NOTE: It should be possible to _always_ run the complex
michael@0 110 // processing code; just slower. If not, then there is a bug
michael@0 111 // in the complex processing code.
michael@0 112
michael@0 113 // Simple (no nested replacers) Processing Code :
michael@0 114 if (!isComplex) {
michael@0 115 text.handleReplaceBetween(start, limit, output);
michael@0 116 outLen = output.length();
michael@0 117
michael@0 118 // Setup default cursor position (for cursorPos within output)
michael@0 119 newStart = cursorPos;
michael@0 120 }
michael@0 121
michael@0 122 // Complex (nested replacers) Processing Code :
michael@0 123 else {
michael@0 124 /* When there are segments to be copied, use the Replaceable.copy()
michael@0 125 * API in order to retain out-of-band data. Copy everything to the
michael@0 126 * end of the string, then copy them back over the key. This preserves
michael@0 127 * the integrity of indices into the key and surrounding context while
michael@0 128 * generating the output text.
michael@0 129 */
michael@0 130 UnicodeString buf;
michael@0 131 int32_t oOutput; // offset into 'output'
michael@0 132 isComplex = FALSE;
michael@0 133
michael@0 134 // The temporary buffer starts at tempStart, and extends
michael@0 135 // to destLimit. The start of the buffer has a single
michael@0 136 // character from before the key. This provides style
michael@0 137 // data when addition characters are filled into the
michael@0 138 // temporary buffer. If there is nothing to the left, use
michael@0 139 // the non-character U+FFFF, which Replaceable subclasses
michael@0 140 // should treat specially as a "no-style character."
michael@0 141 // destStart points to the point after the style context
michael@0 142 // character, so it is tempStart+1 or tempStart+2.
michael@0 143 int32_t tempStart = text.length(); // start of temp buffer
michael@0 144 int32_t destStart = tempStart; // copy new text to here
michael@0 145 if (start > 0) {
michael@0 146 int32_t len = U16_LENGTH(text.char32At(start-1));
michael@0 147 text.copy(start-len, start, tempStart);
michael@0 148 destStart += len;
michael@0 149 } else {
michael@0 150 UnicodeString str((UChar) 0xFFFF);
michael@0 151 text.handleReplaceBetween(tempStart, tempStart, str);
michael@0 152 destStart++;
michael@0 153 }
michael@0 154 int32_t destLimit = destStart;
michael@0 155
michael@0 156 for (oOutput=0; oOutput<output.length(); ) {
michael@0 157 if (oOutput == cursorPos) {
michael@0 158 // Record the position of the cursor
michael@0 159 newStart = destLimit - destStart; // relative to start
michael@0 160 }
michael@0 161 UChar32 c = output.char32At(oOutput);
michael@0 162 UnicodeReplacer* r = data->lookupReplacer(c);
michael@0 163 if (r == NULL) {
michael@0 164 // Accumulate straight (non-segment) text.
michael@0 165 buf.append(c);
michael@0 166 } else {
michael@0 167 isComplex = TRUE;
michael@0 168
michael@0 169 // Insert any accumulated straight text.
michael@0 170 if (buf.length() > 0) {
michael@0 171 text.handleReplaceBetween(destLimit, destLimit, buf);
michael@0 172 destLimit += buf.length();
michael@0 173 buf.truncate(0);
michael@0 174 }
michael@0 175
michael@0 176 // Delegate output generation to replacer object
michael@0 177 int32_t len = r->replace(text, destLimit, destLimit, cursor);
michael@0 178 destLimit += len;
michael@0 179 }
michael@0 180 oOutput += U16_LENGTH(c);
michael@0 181 }
michael@0 182 // Insert any accumulated straight text.
michael@0 183 if (buf.length() > 0) {
michael@0 184 text.handleReplaceBetween(destLimit, destLimit, buf);
michael@0 185 destLimit += buf.length();
michael@0 186 }
michael@0 187 if (oOutput == cursorPos) {
michael@0 188 // Record the position of the cursor
michael@0 189 newStart = destLimit - destStart; // relative to start
michael@0 190 }
michael@0 191
michael@0 192 outLen = destLimit - destStart;
michael@0 193
michael@0 194 // Copy new text to start, and delete it
michael@0 195 text.copy(destStart, destLimit, start);
michael@0 196 text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
michael@0 197
michael@0 198 // Delete the old text (the key)
michael@0 199 text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
michael@0 200 }
michael@0 201
michael@0 202 if (hasCursor) {
michael@0 203 // Adjust the cursor for positions outside the key. These
michael@0 204 // refer to code points rather than code units. If cursorPos
michael@0 205 // is within the output string, then use newStart, which has
michael@0 206 // already been set above.
michael@0 207 if (cursorPos < 0) {
michael@0 208 newStart = start;
michael@0 209 int32_t n = cursorPos;
michael@0 210 // Outside the output string, cursorPos counts code points
michael@0 211 while (n < 0 && newStart > 0) {
michael@0 212 newStart -= U16_LENGTH(text.char32At(newStart-1));
michael@0 213 ++n;
michael@0 214 }
michael@0 215 newStart += n;
michael@0 216 } else if (cursorPos > output.length()) {
michael@0 217 newStart = start + outLen;
michael@0 218 int32_t n = cursorPos - output.length();
michael@0 219 // Outside the output string, cursorPos counts code points
michael@0 220 while (n > 0 && newStart < text.length()) {
michael@0 221 newStart += U16_LENGTH(text.char32At(newStart));
michael@0 222 --n;
michael@0 223 }
michael@0 224 newStart += n;
michael@0 225 } else {
michael@0 226 // Cursor is within output string. It has been set up above
michael@0 227 // to be relative to start.
michael@0 228 newStart += start;
michael@0 229 }
michael@0 230
michael@0 231 cursor = newStart;
michael@0 232 }
michael@0 233
michael@0 234 return outLen;
michael@0 235 }
michael@0 236
michael@0 237 /**
michael@0 238 * UnicodeReplacer API
michael@0 239 */
michael@0 240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
michael@0 241 UBool escapeUnprintable) const {
michael@0 242 rule.truncate(0);
michael@0 243 UnicodeString quoteBuf;
michael@0 244
michael@0 245 int32_t cursor = cursorPos;
michael@0 246
michael@0 247 // Handle a cursor preceding the output
michael@0 248 if (hasCursor && cursor < 0) {
michael@0 249 while (cursor++ < 0) {
michael@0 250 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
michael@0 251 }
michael@0 252 // Fall through and append '|' below
michael@0 253 }
michael@0 254
michael@0 255 for (int32_t i=0; i<output.length(); ++i) {
michael@0 256 if (hasCursor && i == cursor) {
michael@0 257 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
michael@0 258 }
michael@0 259 UChar c = output.charAt(i); // Ok to use 16-bits here
michael@0 260
michael@0 261 UnicodeReplacer* r = data->lookupReplacer(c);
michael@0 262 if (r == NULL) {
michael@0 263 ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
michael@0 264 } else {
michael@0 265 UnicodeString buf;
michael@0 266 r->toReplacerPattern(buf, escapeUnprintable);
michael@0 267 buf.insert(0, (UChar)0x20);
michael@0 268 buf.append((UChar)0x20);
michael@0 269 ICU_Utility::appendToRule(rule, buf,
michael@0 270 TRUE, escapeUnprintable, quoteBuf);
michael@0 271 }
michael@0 272 }
michael@0 273
michael@0 274 // Handle a cursor after the output. Use > rather than >= because
michael@0 275 // if cursor == output.length() it is at the end of the output,
michael@0 276 // which is the default position, so we need not emit it.
michael@0 277 if (hasCursor && cursor > output.length()) {
michael@0 278 cursor -= output.length();
michael@0 279 while (cursor-- > 0) {
michael@0 280 ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
michael@0 281 }
michael@0 282 ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
michael@0 283 }
michael@0 284 // Flush quoteBuf out to result
michael@0 285 ICU_Utility::appendToRule(rule, -1,
michael@0 286 TRUE, escapeUnprintable, quoteBuf);
michael@0 287
michael@0 288 return rule;
michael@0 289 }
michael@0 290
michael@0 291 /**
michael@0 292 * Implement UnicodeReplacer
michael@0 293 */
michael@0 294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
michael@0 295 UChar32 ch;
michael@0 296 for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
michael@0 297 ch = output.char32At(i);
michael@0 298 UnicodeReplacer* r = data->lookupReplacer(ch);
michael@0 299 if (r == NULL) {
michael@0 300 toUnionTo.add(ch);
michael@0 301 } else {
michael@0 302 r->addReplacementSetTo(toUnionTo);
michael@0 303 }
michael@0 304 }
michael@0 305 }
michael@0 306
michael@0 307 /**
michael@0 308 * UnicodeFunctor API
michael@0 309 */
michael@0 310 void StringReplacer::setData(const TransliterationRuleData* d) {
michael@0 311 data = d;
michael@0 312 int32_t i = 0;
michael@0 313 while (i<output.length()) {
michael@0 314 UChar32 c = output.char32At(i);
michael@0 315 UnicodeFunctor* f = data->lookup(c);
michael@0 316 if (f != NULL) {
michael@0 317 f->setData(data);
michael@0 318 }
michael@0 319 i += U16_LENGTH(c);
michael@0 320 }
michael@0 321 }
michael@0 322
michael@0 323 U_NAMESPACE_END
michael@0 324
michael@0 325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 326
michael@0 327 //eof

mercurial