Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (c) 2002-2012, International Business Machines Corporation |
michael@0 | 4 | * and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 01/21/2002 aliu Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/uniset.h" |
michael@0 | 16 | #include "unicode/utf16.h" |
michael@0 | 17 | #include "strrepl.h" |
michael@0 | 18 | #include "rbt_data.h" |
michael@0 | 19 | #include "util.h" |
michael@0 | 20 | |
michael@0 | 21 | U_NAMESPACE_BEGIN |
michael@0 | 22 | |
michael@0 | 23 | UnicodeReplacer::~UnicodeReplacer() {} |
michael@0 | 24 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) |
michael@0 | 25 | |
michael@0 | 26 | /** |
michael@0 | 27 | * Construct a StringReplacer that sets the emits the given output |
michael@0 | 28 | * text and sets the cursor to the given position. |
michael@0 | 29 | * @param theOutput text that will replace input text when the |
michael@0 | 30 | * replace() method is called. May contain stand-in characters |
michael@0 | 31 | * that represent nested replacers. |
michael@0 | 32 | * @param theCursorPos cursor position that will be returned by |
michael@0 | 33 | * the replace() method |
michael@0 | 34 | * @param theData transliterator context object that translates |
michael@0 | 35 | * stand-in characters to UnicodeReplacer objects |
michael@0 | 36 | */ |
michael@0 | 37 | StringReplacer::StringReplacer(const UnicodeString& theOutput, |
michael@0 | 38 | int32_t theCursorPos, |
michael@0 | 39 | const TransliterationRuleData* theData) { |
michael@0 | 40 | output = theOutput; |
michael@0 | 41 | cursorPos = theCursorPos; |
michael@0 | 42 | hasCursor = TRUE; |
michael@0 | 43 | data = theData; |
michael@0 | 44 | isComplex = TRUE; |
michael@0 | 45 | } |
michael@0 | 46 | |
michael@0 | 47 | /** |
michael@0 | 48 | * Construct a StringReplacer that sets the emits the given output |
michael@0 | 49 | * text and does not modify the cursor. |
michael@0 | 50 | * @param theOutput text that will replace input text when the |
michael@0 | 51 | * replace() method is called. May contain stand-in characters |
michael@0 | 52 | * that represent nested replacers. |
michael@0 | 53 | * @param theData transliterator context object that translates |
michael@0 | 54 | * stand-in characters to UnicodeReplacer objects |
michael@0 | 55 | */ |
michael@0 | 56 | StringReplacer::StringReplacer(const UnicodeString& theOutput, |
michael@0 | 57 | const TransliterationRuleData* theData) { |
michael@0 | 58 | output = theOutput; |
michael@0 | 59 | cursorPos = 0; |
michael@0 | 60 | hasCursor = FALSE; |
michael@0 | 61 | data = theData; |
michael@0 | 62 | isComplex = TRUE; |
michael@0 | 63 | } |
michael@0 | 64 | |
michael@0 | 65 | /** |
michael@0 | 66 | * Copy constructor. |
michael@0 | 67 | */ |
michael@0 | 68 | StringReplacer::StringReplacer(const StringReplacer& other) : |
michael@0 | 69 | UnicodeFunctor(other), |
michael@0 | 70 | UnicodeReplacer(other) |
michael@0 | 71 | { |
michael@0 | 72 | output = other.output; |
michael@0 | 73 | cursorPos = other.cursorPos; |
michael@0 | 74 | hasCursor = other.hasCursor; |
michael@0 | 75 | data = other.data; |
michael@0 | 76 | isComplex = other.isComplex; |
michael@0 | 77 | } |
michael@0 | 78 | |
michael@0 | 79 | /** |
michael@0 | 80 | * Destructor |
michael@0 | 81 | */ |
michael@0 | 82 | StringReplacer::~StringReplacer() { |
michael@0 | 83 | } |
michael@0 | 84 | |
michael@0 | 85 | /** |
michael@0 | 86 | * Implement UnicodeFunctor |
michael@0 | 87 | */ |
michael@0 | 88 | UnicodeFunctor* StringReplacer::clone() const { |
michael@0 | 89 | return new StringReplacer(*this); |
michael@0 | 90 | } |
michael@0 | 91 | |
michael@0 | 92 | /** |
michael@0 | 93 | * Implement UnicodeFunctor |
michael@0 | 94 | */ |
michael@0 | 95 | UnicodeReplacer* StringReplacer::toReplacer() const { |
michael@0 | 96 | return const_cast<StringReplacer *>(this); |
michael@0 | 97 | } |
michael@0 | 98 | |
michael@0 | 99 | /** |
michael@0 | 100 | * UnicodeReplacer API |
michael@0 | 101 | */ |
michael@0 | 102 | int32_t StringReplacer::replace(Replaceable& text, |
michael@0 | 103 | int32_t start, |
michael@0 | 104 | int32_t limit, |
michael@0 | 105 | int32_t& cursor) { |
michael@0 | 106 | int32_t outLen; |
michael@0 | 107 | int32_t newStart = 0; |
michael@0 | 108 | |
michael@0 | 109 | // NOTE: It should be possible to _always_ run the complex |
michael@0 | 110 | // processing code; just slower. If not, then there is a bug |
michael@0 | 111 | // in the complex processing code. |
michael@0 | 112 | |
michael@0 | 113 | // Simple (no nested replacers) Processing Code : |
michael@0 | 114 | if (!isComplex) { |
michael@0 | 115 | text.handleReplaceBetween(start, limit, output); |
michael@0 | 116 | outLen = output.length(); |
michael@0 | 117 | |
michael@0 | 118 | // Setup default cursor position (for cursorPos within output) |
michael@0 | 119 | newStart = cursorPos; |
michael@0 | 120 | } |
michael@0 | 121 | |
michael@0 | 122 | // Complex (nested replacers) Processing Code : |
michael@0 | 123 | else { |
michael@0 | 124 | /* When there are segments to be copied, use the Replaceable.copy() |
michael@0 | 125 | * API in order to retain out-of-band data. Copy everything to the |
michael@0 | 126 | * end of the string, then copy them back over the key. This preserves |
michael@0 | 127 | * the integrity of indices into the key and surrounding context while |
michael@0 | 128 | * generating the output text. |
michael@0 | 129 | */ |
michael@0 | 130 | UnicodeString buf; |
michael@0 | 131 | int32_t oOutput; // offset into 'output' |
michael@0 | 132 | isComplex = FALSE; |
michael@0 | 133 | |
michael@0 | 134 | // The temporary buffer starts at tempStart, and extends |
michael@0 | 135 | // to destLimit. The start of the buffer has a single |
michael@0 | 136 | // character from before the key. This provides style |
michael@0 | 137 | // data when addition characters are filled into the |
michael@0 | 138 | // temporary buffer. If there is nothing to the left, use |
michael@0 | 139 | // the non-character U+FFFF, which Replaceable subclasses |
michael@0 | 140 | // should treat specially as a "no-style character." |
michael@0 | 141 | // destStart points to the point after the style context |
michael@0 | 142 | // character, so it is tempStart+1 or tempStart+2. |
michael@0 | 143 | int32_t tempStart = text.length(); // start of temp buffer |
michael@0 | 144 | int32_t destStart = tempStart; // copy new text to here |
michael@0 | 145 | if (start > 0) { |
michael@0 | 146 | int32_t len = U16_LENGTH(text.char32At(start-1)); |
michael@0 | 147 | text.copy(start-len, start, tempStart); |
michael@0 | 148 | destStart += len; |
michael@0 | 149 | } else { |
michael@0 | 150 | UnicodeString str((UChar) 0xFFFF); |
michael@0 | 151 | text.handleReplaceBetween(tempStart, tempStart, str); |
michael@0 | 152 | destStart++; |
michael@0 | 153 | } |
michael@0 | 154 | int32_t destLimit = destStart; |
michael@0 | 155 | |
michael@0 | 156 | for (oOutput=0; oOutput<output.length(); ) { |
michael@0 | 157 | if (oOutput == cursorPos) { |
michael@0 | 158 | // Record the position of the cursor |
michael@0 | 159 | newStart = destLimit - destStart; // relative to start |
michael@0 | 160 | } |
michael@0 | 161 | UChar32 c = output.char32At(oOutput); |
michael@0 | 162 | UnicodeReplacer* r = data->lookupReplacer(c); |
michael@0 | 163 | if (r == NULL) { |
michael@0 | 164 | // Accumulate straight (non-segment) text. |
michael@0 | 165 | buf.append(c); |
michael@0 | 166 | } else { |
michael@0 | 167 | isComplex = TRUE; |
michael@0 | 168 | |
michael@0 | 169 | // Insert any accumulated straight text. |
michael@0 | 170 | if (buf.length() > 0) { |
michael@0 | 171 | text.handleReplaceBetween(destLimit, destLimit, buf); |
michael@0 | 172 | destLimit += buf.length(); |
michael@0 | 173 | buf.truncate(0); |
michael@0 | 174 | } |
michael@0 | 175 | |
michael@0 | 176 | // Delegate output generation to replacer object |
michael@0 | 177 | int32_t len = r->replace(text, destLimit, destLimit, cursor); |
michael@0 | 178 | destLimit += len; |
michael@0 | 179 | } |
michael@0 | 180 | oOutput += U16_LENGTH(c); |
michael@0 | 181 | } |
michael@0 | 182 | // Insert any accumulated straight text. |
michael@0 | 183 | if (buf.length() > 0) { |
michael@0 | 184 | text.handleReplaceBetween(destLimit, destLimit, buf); |
michael@0 | 185 | destLimit += buf.length(); |
michael@0 | 186 | } |
michael@0 | 187 | if (oOutput == cursorPos) { |
michael@0 | 188 | // Record the position of the cursor |
michael@0 | 189 | newStart = destLimit - destStart; // relative to start |
michael@0 | 190 | } |
michael@0 | 191 | |
michael@0 | 192 | outLen = destLimit - destStart; |
michael@0 | 193 | |
michael@0 | 194 | // Copy new text to start, and delete it |
michael@0 | 195 | text.copy(destStart, destLimit, start); |
michael@0 | 196 | text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); |
michael@0 | 197 | |
michael@0 | 198 | // Delete the old text (the key) |
michael@0 | 199 | text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); |
michael@0 | 200 | } |
michael@0 | 201 | |
michael@0 | 202 | if (hasCursor) { |
michael@0 | 203 | // Adjust the cursor for positions outside the key. These |
michael@0 | 204 | // refer to code points rather than code units. If cursorPos |
michael@0 | 205 | // is within the output string, then use newStart, which has |
michael@0 | 206 | // already been set above. |
michael@0 | 207 | if (cursorPos < 0) { |
michael@0 | 208 | newStart = start; |
michael@0 | 209 | int32_t n = cursorPos; |
michael@0 | 210 | // Outside the output string, cursorPos counts code points |
michael@0 | 211 | while (n < 0 && newStart > 0) { |
michael@0 | 212 | newStart -= U16_LENGTH(text.char32At(newStart-1)); |
michael@0 | 213 | ++n; |
michael@0 | 214 | } |
michael@0 | 215 | newStart += n; |
michael@0 | 216 | } else if (cursorPos > output.length()) { |
michael@0 | 217 | newStart = start + outLen; |
michael@0 | 218 | int32_t n = cursorPos - output.length(); |
michael@0 | 219 | // Outside the output string, cursorPos counts code points |
michael@0 | 220 | while (n > 0 && newStart < text.length()) { |
michael@0 | 221 | newStart += U16_LENGTH(text.char32At(newStart)); |
michael@0 | 222 | --n; |
michael@0 | 223 | } |
michael@0 | 224 | newStart += n; |
michael@0 | 225 | } else { |
michael@0 | 226 | // Cursor is within output string. It has been set up above |
michael@0 | 227 | // to be relative to start. |
michael@0 | 228 | newStart += start; |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | cursor = newStart; |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | return outLen; |
michael@0 | 235 | } |
michael@0 | 236 | |
michael@0 | 237 | /** |
michael@0 | 238 | * UnicodeReplacer API |
michael@0 | 239 | */ |
michael@0 | 240 | UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, |
michael@0 | 241 | UBool escapeUnprintable) const { |
michael@0 | 242 | rule.truncate(0); |
michael@0 | 243 | UnicodeString quoteBuf; |
michael@0 | 244 | |
michael@0 | 245 | int32_t cursor = cursorPos; |
michael@0 | 246 | |
michael@0 | 247 | // Handle a cursor preceding the output |
michael@0 | 248 | if (hasCursor && cursor < 0) { |
michael@0 | 249 | while (cursor++ < 0) { |
michael@0 | 250 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 251 | } |
michael@0 | 252 | // Fall through and append '|' below |
michael@0 | 253 | } |
michael@0 | 254 | |
michael@0 | 255 | for (int32_t i=0; i<output.length(); ++i) { |
michael@0 | 256 | if (hasCursor && i == cursor) { |
michael@0 | 257 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 258 | } |
michael@0 | 259 | UChar c = output.charAt(i); // Ok to use 16-bits here |
michael@0 | 260 | |
michael@0 | 261 | UnicodeReplacer* r = data->lookupReplacer(c); |
michael@0 | 262 | if (r == NULL) { |
michael@0 | 263 | ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); |
michael@0 | 264 | } else { |
michael@0 | 265 | UnicodeString buf; |
michael@0 | 266 | r->toReplacerPattern(buf, escapeUnprintable); |
michael@0 | 267 | buf.insert(0, (UChar)0x20); |
michael@0 | 268 | buf.append((UChar)0x20); |
michael@0 | 269 | ICU_Utility::appendToRule(rule, buf, |
michael@0 | 270 | TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 271 | } |
michael@0 | 272 | } |
michael@0 | 273 | |
michael@0 | 274 | // Handle a cursor after the output. Use > rather than >= because |
michael@0 | 275 | // if cursor == output.length() it is at the end of the output, |
michael@0 | 276 | // which is the default position, so we need not emit it. |
michael@0 | 277 | if (hasCursor && cursor > output.length()) { |
michael@0 | 278 | cursor -= output.length(); |
michael@0 | 279 | while (cursor-- > 0) { |
michael@0 | 280 | ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 281 | } |
michael@0 | 282 | ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 283 | } |
michael@0 | 284 | // Flush quoteBuf out to result |
michael@0 | 285 | ICU_Utility::appendToRule(rule, -1, |
michael@0 | 286 | TRUE, escapeUnprintable, quoteBuf); |
michael@0 | 287 | |
michael@0 | 288 | return rule; |
michael@0 | 289 | } |
michael@0 | 290 | |
michael@0 | 291 | /** |
michael@0 | 292 | * Implement UnicodeReplacer |
michael@0 | 293 | */ |
michael@0 | 294 | void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { |
michael@0 | 295 | UChar32 ch; |
michael@0 | 296 | for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) { |
michael@0 | 297 | ch = output.char32At(i); |
michael@0 | 298 | UnicodeReplacer* r = data->lookupReplacer(ch); |
michael@0 | 299 | if (r == NULL) { |
michael@0 | 300 | toUnionTo.add(ch); |
michael@0 | 301 | } else { |
michael@0 | 302 | r->addReplacementSetTo(toUnionTo); |
michael@0 | 303 | } |
michael@0 | 304 | } |
michael@0 | 305 | } |
michael@0 | 306 | |
michael@0 | 307 | /** |
michael@0 | 308 | * UnicodeFunctor API |
michael@0 | 309 | */ |
michael@0 | 310 | void StringReplacer::setData(const TransliterationRuleData* d) { |
michael@0 | 311 | data = d; |
michael@0 | 312 | int32_t i = 0; |
michael@0 | 313 | while (i<output.length()) { |
michael@0 | 314 | UChar32 c = output.char32At(i); |
michael@0 | 315 | UnicodeFunctor* f = data->lookup(c); |
michael@0 | 316 | if (f != NULL) { |
michael@0 | 317 | f->setData(data); |
michael@0 | 318 | } |
michael@0 | 319 | i += U16_LENGTH(c); |
michael@0 | 320 | } |
michael@0 | 321 | } |
michael@0 | 322 | |
michael@0 | 323 | U_NAMESPACE_END |
michael@0 | 324 | |
michael@0 | 325 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 326 | |
michael@0 | 327 | //eof |