intl/icu/source/i18n/strrepl.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 **********************************************************************
     3 *   Copyright (c) 2002-2012, International Business Machines Corporation
     4 *   and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   Date        Name        Description
     7 *   01/21/2002  aliu        Creation.
     8 **********************************************************************
     9 */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/uniset.h"
    16 #include "unicode/utf16.h"
    17 #include "strrepl.h"
    18 #include "rbt_data.h"
    19 #include "util.h"
    21 U_NAMESPACE_BEGIN
    23 UnicodeReplacer::~UnicodeReplacer() {}
    24 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer)
    26 /**
    27  * Construct a StringReplacer that sets the emits the given output
    28  * text and sets the cursor to the given position.
    29  * @param theOutput text that will replace input text when the
    30  * replace() method is called.  May contain stand-in characters
    31  * that represent nested replacers.
    32  * @param theCursorPos cursor position that will be returned by
    33  * the replace() method
    34  * @param theData transliterator context object that translates
    35  * stand-in characters to UnicodeReplacer objects
    36  */
    37 StringReplacer::StringReplacer(const UnicodeString& theOutput,
    38                                int32_t theCursorPos,
    39                                const TransliterationRuleData* theData) {
    40     output = theOutput;
    41     cursorPos = theCursorPos;
    42     hasCursor = TRUE;
    43     data = theData;
    44     isComplex = TRUE;
    45 }
    47 /**
    48  * Construct a StringReplacer that sets the emits the given output
    49  * text and does not modify the cursor.
    50  * @param theOutput text that will replace input text when the
    51  * replace() method is called.  May contain stand-in characters
    52  * that represent nested replacers.
    53  * @param theData transliterator context object that translates
    54  * stand-in characters to UnicodeReplacer objects
    55  */
    56 StringReplacer::StringReplacer(const UnicodeString& theOutput,
    57                                const TransliterationRuleData* theData) {
    58     output = theOutput;
    59     cursorPos = 0;
    60     hasCursor = FALSE;
    61     data = theData;
    62     isComplex = TRUE;
    63 }
    65 /**
    66  * Copy constructor.
    67  */
    68 StringReplacer::StringReplacer(const StringReplacer& other) :
    69     UnicodeFunctor(other),
    70     UnicodeReplacer(other)
    71 {
    72     output = other.output;
    73     cursorPos = other.cursorPos;
    74     hasCursor = other.hasCursor;
    75     data = other.data;
    76     isComplex = other.isComplex;
    77 }
    79 /**
    80  * Destructor
    81  */
    82 StringReplacer::~StringReplacer() {
    83 }
    85 /**
    86  * Implement UnicodeFunctor
    87  */
    88 UnicodeFunctor* StringReplacer::clone() const {
    89     return new StringReplacer(*this);
    90 }
    92 /**
    93  * Implement UnicodeFunctor
    94  */
    95 UnicodeReplacer* StringReplacer::toReplacer() const {
    96   return const_cast<StringReplacer *>(this);
    97 }
    99 /**
   100  * UnicodeReplacer API
   101  */
   102 int32_t StringReplacer::replace(Replaceable& text,
   103                                 int32_t start,
   104                                 int32_t limit,
   105                                 int32_t& cursor) {
   106     int32_t outLen;
   107     int32_t newStart = 0;
   109     // NOTE: It should be possible to _always_ run the complex
   110     // processing code; just slower.  If not, then there is a bug
   111     // in the complex processing code.
   113     // Simple (no nested replacers) Processing Code :
   114     if (!isComplex) {
   115         text.handleReplaceBetween(start, limit, output);
   116         outLen = output.length();
   118         // Setup default cursor position (for cursorPos within output)
   119         newStart = cursorPos;
   120     }
   122     // Complex (nested replacers) Processing Code :
   123     else {
   124         /* When there are segments to be copied, use the Replaceable.copy()
   125          * API in order to retain out-of-band data.  Copy everything to the
   126          * end of the string, then copy them back over the key.  This preserves
   127          * the integrity of indices into the key and surrounding context while
   128          * generating the output text.
   129          */
   130         UnicodeString buf;
   131         int32_t oOutput; // offset into 'output'
   132         isComplex = FALSE;
   134         // The temporary buffer starts at tempStart, and extends
   135         // to destLimit.  The start of the buffer has a single
   136         // character from before the key.  This provides style
   137         // data when addition characters are filled into the
   138         // temporary buffer.  If there is nothing to the left, use
   139         // the non-character U+FFFF, which Replaceable subclasses
   140         // should treat specially as a "no-style character."
   141         // destStart points to the point after the style context
   142         // character, so it is tempStart+1 or tempStart+2.
   143         int32_t tempStart = text.length(); // start of temp buffer
   144         int32_t destStart = tempStart; // copy new text to here
   145         if (start > 0) {
   146             int32_t len = U16_LENGTH(text.char32At(start-1));
   147             text.copy(start-len, start, tempStart);
   148             destStart += len;
   149         } else {
   150             UnicodeString str((UChar) 0xFFFF);
   151             text.handleReplaceBetween(tempStart, tempStart, str);
   152             destStart++;
   153         }
   154         int32_t destLimit = destStart;
   156         for (oOutput=0; oOutput<output.length(); ) {
   157             if (oOutput == cursorPos) {
   158                 // Record the position of the cursor
   159                 newStart = destLimit - destStart; // relative to start
   160             }
   161             UChar32 c = output.char32At(oOutput);
   162             UnicodeReplacer* r = data->lookupReplacer(c);
   163             if (r == NULL) {
   164                 // Accumulate straight (non-segment) text.
   165                 buf.append(c);
   166             } else {
   167                 isComplex = TRUE;
   169                 // Insert any accumulated straight text.
   170                 if (buf.length() > 0) {
   171                     text.handleReplaceBetween(destLimit, destLimit, buf);
   172                     destLimit += buf.length();
   173                     buf.truncate(0);
   174                 }
   176                 // Delegate output generation to replacer object
   177                 int32_t len = r->replace(text, destLimit, destLimit, cursor);
   178                 destLimit += len;
   179             }
   180             oOutput += U16_LENGTH(c);
   181         }
   182         // Insert any accumulated straight text.
   183         if (buf.length() > 0) {
   184             text.handleReplaceBetween(destLimit, destLimit, buf);
   185             destLimit += buf.length();
   186         }
   187         if (oOutput == cursorPos) {
   188             // Record the position of the cursor
   189             newStart = destLimit - destStart; // relative to start
   190         }
   192         outLen = destLimit - destStart;
   194         // Copy new text to start, and delete it
   195         text.copy(destStart, destLimit, start);
   196         text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString());
   198         // Delete the old text (the key)
   199         text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString());
   200     }        
   202     if (hasCursor) {
   203         // Adjust the cursor for positions outside the key.  These
   204         // refer to code points rather than code units.  If cursorPos
   205         // is within the output string, then use newStart, which has
   206         // already been set above.
   207         if (cursorPos < 0) {
   208             newStart = start;
   209             int32_t n = cursorPos;
   210             // Outside the output string, cursorPos counts code points
   211             while (n < 0 && newStart > 0) {
   212                 newStart -= U16_LENGTH(text.char32At(newStart-1));
   213                 ++n;
   214             }
   215             newStart += n;
   216         } else if (cursorPos > output.length()) {
   217             newStart = start + outLen;
   218             int32_t n = cursorPos - output.length();
   219             // Outside the output string, cursorPos counts code points
   220             while (n > 0 && newStart < text.length()) {
   221                 newStart += U16_LENGTH(text.char32At(newStart));
   222                 --n;
   223             }
   224             newStart += n;
   225         } else {
   226             // Cursor is within output string.  It has been set up above
   227             // to be relative to start.
   228             newStart += start;
   229         }
   231         cursor = newStart;
   232     }
   234     return outLen;
   235 }
   237 /**
   238  * UnicodeReplacer API
   239  */
   240 UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule,
   241                                                  UBool escapeUnprintable) const {
   242     rule.truncate(0);
   243     UnicodeString quoteBuf;
   245     int32_t cursor = cursorPos;
   247     // Handle a cursor preceding the output
   248     if (hasCursor && cursor < 0) {
   249         while (cursor++ < 0) {
   250             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
   251         }
   252         // Fall through and append '|' below
   253     }
   255     for (int32_t i=0; i<output.length(); ++i) {
   256         if (hasCursor && i == cursor) {
   257             ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
   258         }
   259         UChar c = output.charAt(i); // Ok to use 16-bits here
   261         UnicodeReplacer* r = data->lookupReplacer(c);
   262         if (r == NULL) {
   263             ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf);
   264         } else {
   265             UnicodeString buf;
   266             r->toReplacerPattern(buf, escapeUnprintable);
   267             buf.insert(0, (UChar)0x20);
   268             buf.append((UChar)0x20);
   269             ICU_Utility::appendToRule(rule, buf,
   270                                       TRUE, escapeUnprintable, quoteBuf);
   271         }
   272     }
   274     // Handle a cursor after the output.  Use > rather than >= because
   275     // if cursor == output.length() it is at the end of the output,
   276     // which is the default position, so we need not emit it.
   277     if (hasCursor && cursor > output.length()) {
   278         cursor -= output.length();
   279         while (cursor-- > 0) {
   280             ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf);
   281         }
   282         ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf);
   283     }
   284     // Flush quoteBuf out to result
   285     ICU_Utility::appendToRule(rule, -1,
   286                               TRUE, escapeUnprintable, quoteBuf);
   288     return rule;
   289 }
   291 /**
   292  * Implement UnicodeReplacer
   293  */
   294 void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const {
   295     UChar32 ch;
   296     for (int32_t i=0; i<output.length(); i+=U16_LENGTH(ch)) {
   297     ch = output.char32At(i);
   298     UnicodeReplacer* r = data->lookupReplacer(ch);
   299     if (r == NULL) {
   300         toUnionTo.add(ch);
   301     } else {
   302         r->addReplacementSetTo(toUnionTo);
   303     }
   304     }
   305 }
   307 /**
   308  * UnicodeFunctor API
   309  */
   310 void StringReplacer::setData(const TransliterationRuleData* d) {
   311     data = d;
   312     int32_t i = 0;
   313     while (i<output.length()) {
   314         UChar32 c = output.char32At(i);
   315         UnicodeFunctor* f = data->lookup(c);
   316         if (f != NULL) {
   317             f->setData(data);
   318         }
   319         i += U16_LENGTH(c);
   320     }
   321 }
   323 U_NAMESPACE_END
   325 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   327 //eof

mercurial