michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (c) 2001-2012, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 07/23/01 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "strmatch.h" michael@0: #include "rbt_data.h" michael@0: #include "util.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf16.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) michael@0: michael@0: StringMatcher::StringMatcher(const UnicodeString& theString, michael@0: int32_t start, michael@0: int32_t limit, michael@0: int32_t segmentNum, michael@0: const TransliterationRuleData& theData) : michael@0: data(&theData), michael@0: segmentNumber(segmentNum), michael@0: matchStart(-1), michael@0: matchLimit(-1) michael@0: { michael@0: theString.extractBetween(start, limit, pattern); michael@0: } michael@0: michael@0: StringMatcher::StringMatcher(const StringMatcher& o) : michael@0: UnicodeFunctor(o), michael@0: UnicodeMatcher(o), michael@0: UnicodeReplacer(o), michael@0: pattern(o.pattern), michael@0: data(o.data), michael@0: segmentNumber(o.segmentNumber), michael@0: matchStart(o.matchStart), michael@0: matchLimit(o.matchLimit) michael@0: { michael@0: } michael@0: michael@0: /** michael@0: * Destructor michael@0: */ michael@0: StringMatcher::~StringMatcher() { michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: */ michael@0: UnicodeFunctor* StringMatcher::clone() const { michael@0: return new StringMatcher(*this); michael@0: } michael@0: michael@0: /** michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer michael@0: * and return the pointer. michael@0: */ michael@0: UnicodeMatcher* StringMatcher::toMatcher() const { michael@0: StringMatcher *nonconst_this = const_cast(this); michael@0: UnicodeMatcher *nonconst_base = static_cast(nonconst_this); michael@0: michael@0: return nonconst_base; michael@0: } michael@0: michael@0: /** michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer michael@0: * and return the pointer. michael@0: */ michael@0: UnicodeReplacer* StringMatcher::toReplacer() const { michael@0: StringMatcher *nonconst_this = const_cast(this); michael@0: UnicodeReplacer *nonconst_base = static_cast(nonconst_this); michael@0: michael@0: return nonconst_base; michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: */ michael@0: UMatchDegree StringMatcher::matches(const Replaceable& text, michael@0: int32_t& offset, michael@0: int32_t limit, michael@0: UBool incremental) { michael@0: int32_t i; michael@0: int32_t cursor = offset; michael@0: if (limit < cursor) { michael@0: // Match in the reverse direction michael@0: for (i=pattern.length()-1; i>=0; --i) { michael@0: UChar keyChar = pattern.charAt(i); michael@0: UnicodeMatcher* subm = data->lookupMatcher(keyChar); michael@0: if (subm == 0) { michael@0: if (cursor > limit && michael@0: keyChar == text.charAt(cursor)) { michael@0: --cursor; michael@0: } else { michael@0: return U_MISMATCH; michael@0: } michael@0: } else { michael@0: UMatchDegree m = michael@0: subm->matches(text, cursor, limit, incremental); michael@0: if (m != U_MATCH) { michael@0: return m; michael@0: } michael@0: } michael@0: } michael@0: // Record the match position, but adjust for a normal michael@0: // forward start, limit, and only if a prior match does not michael@0: // exist -- we want the rightmost match. michael@0: if (matchStart < 0) { michael@0: matchStart = cursor+1; michael@0: matchLimit = offset+1; michael@0: } michael@0: } else { michael@0: for (i=0; ilookupMatcher(keyChar); michael@0: if (subm == 0) { michael@0: // Don't need the cursor < limit check if michael@0: // incremental is TRUE (because it's done above); do need michael@0: // it otherwise. michael@0: if (cursor < limit && michael@0: keyChar == text.charAt(cursor)) { michael@0: ++cursor; michael@0: } else { michael@0: return U_MISMATCH; michael@0: } michael@0: } else { michael@0: UMatchDegree m = michael@0: subm->matches(text, cursor, limit, incremental); michael@0: if (m != U_MATCH) { michael@0: return m; michael@0: } michael@0: } michael@0: } michael@0: // Record the match position michael@0: matchStart = offset; michael@0: matchLimit = cursor; michael@0: } michael@0: michael@0: offset = cursor; michael@0: return U_MATCH; michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: */ michael@0: UnicodeString& StringMatcher::toPattern(UnicodeString& result, michael@0: UBool escapeUnprintable) const michael@0: { michael@0: result.truncate(0); michael@0: UnicodeString str, quoteBuf; michael@0: if (segmentNumber > 0) { michael@0: result.append((UChar)40); /*(*/ michael@0: } michael@0: for (int32_t i=0; ilookupMatcher(keyChar); michael@0: if (m == 0) { michael@0: ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); michael@0: } else { michael@0: ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), michael@0: TRUE, escapeUnprintable, quoteBuf); michael@0: } michael@0: } michael@0: if (segmentNumber > 0) { michael@0: result.append((UChar)41); /*)*/ michael@0: } michael@0: // Flush quoteBuf out to result michael@0: ICU_Utility::appendToRule(result, -1, michael@0: TRUE, escapeUnprintable, quoteBuf); michael@0: return result; michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: */ michael@0: UBool StringMatcher::matchesIndexValue(uint8_t v) const { michael@0: if (pattern.length() == 0) { michael@0: return TRUE; michael@0: } michael@0: UChar32 c = pattern.char32At(0); michael@0: const UnicodeMatcher *m = data->lookupMatcher(c); michael@0: return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: */ michael@0: void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { michael@0: UChar32 ch; michael@0: for (int32_t i=0; ilookupMatcher(ch); michael@0: if (matcher == NULL) { michael@0: toUnionTo.add(ch); michael@0: } else { michael@0: matcher->addMatchSetTo(toUnionTo); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /** michael@0: * UnicodeReplacer API michael@0: */ michael@0: int32_t StringMatcher::replace(Replaceable& text, michael@0: int32_t start, michael@0: int32_t limit, michael@0: int32_t& /*cursor*/) { michael@0: michael@0: int32_t outLen = 0; michael@0: michael@0: // Copy segment with out-of-band data michael@0: int32_t dest = limit; michael@0: // If there was no match, that means that a quantifier michael@0: // matched zero-length. E.g., x (a)* y matched "xy". michael@0: if (matchStart >= 0) { michael@0: if (matchStart != matchLimit) { michael@0: text.copy(matchStart, matchLimit, dest); michael@0: outLen = matchLimit - matchStart; michael@0: } michael@0: } michael@0: michael@0: text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text michael@0: michael@0: return outLen; michael@0: } michael@0: michael@0: /** michael@0: * UnicodeReplacer API michael@0: */ michael@0: UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, michael@0: UBool /*escapeUnprintable*/) const { michael@0: // assert(segmentNumber > 0); michael@0: rule.truncate(0); michael@0: rule.append((UChar)0x0024 /*$*/); michael@0: ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); michael@0: return rule; michael@0: } michael@0: michael@0: /** michael@0: * Remove any match info. This must be called before performing a michael@0: * set of matches with this segment. michael@0: */ michael@0: void StringMatcher::resetMatch() { michael@0: matchStart = matchLimit = -1; michael@0: } michael@0: michael@0: /** michael@0: * Union the set of all characters that may output by this object michael@0: * into the given set. michael@0: * @param toUnionTo the set into which to union the output characters michael@0: */ michael@0: void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { michael@0: // The output of this replacer varies; it is the source text between michael@0: // matchStart and matchLimit. Since this varies depending on the michael@0: // input text, we can't compute it here. We can either do nothing michael@0: // or we can add ALL characters to the set. It's probably more useful michael@0: // to do nothing. michael@0: } michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: */ michael@0: void StringMatcher::setData(const TransliterationRuleData* d) { michael@0: data = d; michael@0: int32_t i = 0; michael@0: while (ilookup(c); michael@0: if (f != NULL) { michael@0: f->setData(data); michael@0: } michael@0: i += U16_LENGTH(c); michael@0: } michael@0: } michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: //eof