michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (c) 2002-2012, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 01/21/2002 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf16.h" michael@0: #include "strrepl.h" michael@0: #include "rbt_data.h" michael@0: #include "util.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UnicodeReplacer::~UnicodeReplacer() {} michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringReplacer) michael@0: michael@0: /** michael@0: * Construct a StringReplacer that sets the emits the given output michael@0: * text and sets the cursor to the given position. michael@0: * @param theOutput text that will replace input text when the michael@0: * replace() method is called. May contain stand-in characters michael@0: * that represent nested replacers. michael@0: * @param theCursorPos cursor position that will be returned by michael@0: * the replace() method michael@0: * @param theData transliterator context object that translates michael@0: * stand-in characters to UnicodeReplacer objects michael@0: */ michael@0: StringReplacer::StringReplacer(const UnicodeString& theOutput, michael@0: int32_t theCursorPos, michael@0: const TransliterationRuleData* theData) { michael@0: output = theOutput; michael@0: cursorPos = theCursorPos; michael@0: hasCursor = TRUE; michael@0: data = theData; michael@0: isComplex = TRUE; michael@0: } michael@0: michael@0: /** michael@0: * Construct a StringReplacer that sets the emits the given output michael@0: * text and does not modify the cursor. michael@0: * @param theOutput text that will replace input text when the michael@0: * replace() method is called. May contain stand-in characters michael@0: * that represent nested replacers. michael@0: * @param theData transliterator context object that translates michael@0: * stand-in characters to UnicodeReplacer objects michael@0: */ michael@0: StringReplacer::StringReplacer(const UnicodeString& theOutput, michael@0: const TransliterationRuleData* theData) { michael@0: output = theOutput; michael@0: cursorPos = 0; michael@0: hasCursor = FALSE; michael@0: data = theData; michael@0: isComplex = TRUE; michael@0: } michael@0: michael@0: /** michael@0: * Copy constructor. michael@0: */ michael@0: StringReplacer::StringReplacer(const StringReplacer& other) : michael@0: UnicodeFunctor(other), michael@0: UnicodeReplacer(other) michael@0: { michael@0: output = other.output; michael@0: cursorPos = other.cursorPos; michael@0: hasCursor = other.hasCursor; michael@0: data = other.data; michael@0: isComplex = other.isComplex; michael@0: } michael@0: michael@0: /** michael@0: * Destructor michael@0: */ michael@0: StringReplacer::~StringReplacer() { michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: */ michael@0: UnicodeFunctor* StringReplacer::clone() const { michael@0: return new StringReplacer(*this); michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: */ michael@0: UnicodeReplacer* StringReplacer::toReplacer() const { michael@0: return const_cast(this); michael@0: } michael@0: michael@0: /** michael@0: * UnicodeReplacer API michael@0: */ michael@0: int32_t StringReplacer::replace(Replaceable& text, michael@0: int32_t start, michael@0: int32_t limit, michael@0: int32_t& cursor) { michael@0: int32_t outLen; michael@0: int32_t newStart = 0; michael@0: michael@0: // NOTE: It should be possible to _always_ run the complex michael@0: // processing code; just slower. If not, then there is a bug michael@0: // in the complex processing code. michael@0: michael@0: // Simple (no nested replacers) Processing Code : michael@0: if (!isComplex) { michael@0: text.handleReplaceBetween(start, limit, output); michael@0: outLen = output.length(); michael@0: michael@0: // Setup default cursor position (for cursorPos within output) michael@0: newStart = cursorPos; michael@0: } michael@0: michael@0: // Complex (nested replacers) Processing Code : michael@0: else { michael@0: /* When there are segments to be copied, use the Replaceable.copy() michael@0: * API in order to retain out-of-band data. Copy everything to the michael@0: * end of the string, then copy them back over the key. This preserves michael@0: * the integrity of indices into the key and surrounding context while michael@0: * generating the output text. michael@0: */ michael@0: UnicodeString buf; michael@0: int32_t oOutput; // offset into 'output' michael@0: isComplex = FALSE; michael@0: michael@0: // The temporary buffer starts at tempStart, and extends michael@0: // to destLimit. The start of the buffer has a single michael@0: // character from before the key. This provides style michael@0: // data when addition characters are filled into the michael@0: // temporary buffer. If there is nothing to the left, use michael@0: // the non-character U+FFFF, which Replaceable subclasses michael@0: // should treat specially as a "no-style character." michael@0: // destStart points to the point after the style context michael@0: // character, so it is tempStart+1 or tempStart+2. michael@0: int32_t tempStart = text.length(); // start of temp buffer michael@0: int32_t destStart = tempStart; // copy new text to here michael@0: if (start > 0) { michael@0: int32_t len = U16_LENGTH(text.char32At(start-1)); michael@0: text.copy(start-len, start, tempStart); michael@0: destStart += len; michael@0: } else { michael@0: UnicodeString str((UChar) 0xFFFF); michael@0: text.handleReplaceBetween(tempStart, tempStart, str); michael@0: destStart++; michael@0: } michael@0: int32_t destLimit = destStart; michael@0: michael@0: for (oOutput=0; oOutputlookupReplacer(c); michael@0: if (r == NULL) { michael@0: // Accumulate straight (non-segment) text. michael@0: buf.append(c); michael@0: } else { michael@0: isComplex = TRUE; michael@0: michael@0: // Insert any accumulated straight text. michael@0: if (buf.length() > 0) { michael@0: text.handleReplaceBetween(destLimit, destLimit, buf); michael@0: destLimit += buf.length(); michael@0: buf.truncate(0); michael@0: } michael@0: michael@0: // Delegate output generation to replacer object michael@0: int32_t len = r->replace(text, destLimit, destLimit, cursor); michael@0: destLimit += len; michael@0: } michael@0: oOutput += U16_LENGTH(c); michael@0: } michael@0: // Insert any accumulated straight text. michael@0: if (buf.length() > 0) { michael@0: text.handleReplaceBetween(destLimit, destLimit, buf); michael@0: destLimit += buf.length(); michael@0: } michael@0: if (oOutput == cursorPos) { michael@0: // Record the position of the cursor michael@0: newStart = destLimit - destStart; // relative to start michael@0: } michael@0: michael@0: outLen = destLimit - destStart; michael@0: michael@0: // Copy new text to start, and delete it michael@0: text.copy(destStart, destLimit, start); michael@0: text.handleReplaceBetween(tempStart + outLen, destLimit + outLen, UnicodeString()); michael@0: michael@0: // Delete the old text (the key) michael@0: text.handleReplaceBetween(start + outLen, limit + outLen, UnicodeString()); michael@0: } michael@0: michael@0: if (hasCursor) { michael@0: // Adjust the cursor for positions outside the key. These michael@0: // refer to code points rather than code units. If cursorPos michael@0: // is within the output string, then use newStart, which has michael@0: // already been set above. michael@0: if (cursorPos < 0) { michael@0: newStart = start; michael@0: int32_t n = cursorPos; michael@0: // Outside the output string, cursorPos counts code points michael@0: while (n < 0 && newStart > 0) { michael@0: newStart -= U16_LENGTH(text.char32At(newStart-1)); michael@0: ++n; michael@0: } michael@0: newStart += n; michael@0: } else if (cursorPos > output.length()) { michael@0: newStart = start + outLen; michael@0: int32_t n = cursorPos - output.length(); michael@0: // Outside the output string, cursorPos counts code points michael@0: while (n > 0 && newStart < text.length()) { michael@0: newStart += U16_LENGTH(text.char32At(newStart)); michael@0: --n; michael@0: } michael@0: newStart += n; michael@0: } else { michael@0: // Cursor is within output string. It has been set up above michael@0: // to be relative to start. michael@0: newStart += start; michael@0: } michael@0: michael@0: cursor = newStart; michael@0: } michael@0: michael@0: return outLen; michael@0: } michael@0: michael@0: /** michael@0: * UnicodeReplacer API michael@0: */ michael@0: UnicodeString& StringReplacer::toReplacerPattern(UnicodeString& rule, michael@0: UBool escapeUnprintable) const { michael@0: rule.truncate(0); michael@0: UnicodeString quoteBuf; michael@0: michael@0: int32_t cursor = cursorPos; michael@0: michael@0: // Handle a cursor preceding the output michael@0: if (hasCursor && cursor < 0) { michael@0: while (cursor++ < 0) { michael@0: ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); michael@0: } michael@0: // Fall through and append '|' below michael@0: } michael@0: michael@0: for (int32_t i=0; ilookupReplacer(c); michael@0: if (r == NULL) { michael@0: ICU_Utility::appendToRule(rule, c, FALSE, escapeUnprintable, quoteBuf); michael@0: } else { michael@0: UnicodeString buf; michael@0: r->toReplacerPattern(buf, escapeUnprintable); michael@0: buf.insert(0, (UChar)0x20); michael@0: buf.append((UChar)0x20); michael@0: ICU_Utility::appendToRule(rule, buf, michael@0: TRUE, escapeUnprintable, quoteBuf); michael@0: } michael@0: } michael@0: michael@0: // Handle a cursor after the output. Use > rather than >= because michael@0: // if cursor == output.length() it is at the end of the output, michael@0: // which is the default position, so we need not emit it. michael@0: if (hasCursor && cursor > output.length()) { michael@0: cursor -= output.length(); michael@0: while (cursor-- > 0) { michael@0: ICU_Utility::appendToRule(rule, (UChar)0x0040 /*@*/, TRUE, escapeUnprintable, quoteBuf); michael@0: } michael@0: ICU_Utility::appendToRule(rule, (UChar)0x007C /*|*/, TRUE, escapeUnprintable, quoteBuf); michael@0: } michael@0: // Flush quoteBuf out to result michael@0: ICU_Utility::appendToRule(rule, -1, michael@0: TRUE, escapeUnprintable, quoteBuf); michael@0: michael@0: return rule; michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeReplacer michael@0: */ michael@0: void StringReplacer::addReplacementSetTo(UnicodeSet& toUnionTo) const { michael@0: UChar32 ch; michael@0: for (int32_t i=0; ilookupReplacer(ch); michael@0: if (r == NULL) { michael@0: toUnionTo.add(ch); michael@0: } else { michael@0: r->addReplacementSetTo(toUnionTo); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * UnicodeFunctor API michael@0: */ michael@0: void StringReplacer::setData(const TransliterationRuleData* d) { michael@0: data = d; michael@0: int32_t i = 0; michael@0: while (ilookup(c); michael@0: if (f != NULL) { michael@0: f->setData(data); michael@0: } michael@0: i += U16_LENGTH(c); michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: //eof