1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/strmatch.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,294 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (c) 2001-2012, International Business Machines Corporation 1.7 +* and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 07/23/01 aliu Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "strmatch.h" 1.19 +#include "rbt_data.h" 1.20 +#include "util.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/utf16.h" 1.23 + 1.24 +U_NAMESPACE_BEGIN 1.25 + 1.26 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher) 1.27 + 1.28 +StringMatcher::StringMatcher(const UnicodeString& theString, 1.29 + int32_t start, 1.30 + int32_t limit, 1.31 + int32_t segmentNum, 1.32 + const TransliterationRuleData& theData) : 1.33 + data(&theData), 1.34 + segmentNumber(segmentNum), 1.35 + matchStart(-1), 1.36 + matchLimit(-1) 1.37 +{ 1.38 + theString.extractBetween(start, limit, pattern); 1.39 +} 1.40 + 1.41 +StringMatcher::StringMatcher(const StringMatcher& o) : 1.42 + UnicodeFunctor(o), 1.43 + UnicodeMatcher(o), 1.44 + UnicodeReplacer(o), 1.45 + pattern(o.pattern), 1.46 + data(o.data), 1.47 + segmentNumber(o.segmentNumber), 1.48 + matchStart(o.matchStart), 1.49 + matchLimit(o.matchLimit) 1.50 +{ 1.51 +} 1.52 + 1.53 +/** 1.54 + * Destructor 1.55 + */ 1.56 +StringMatcher::~StringMatcher() { 1.57 +} 1.58 + 1.59 +/** 1.60 + * Implement UnicodeFunctor 1.61 + */ 1.62 +UnicodeFunctor* StringMatcher::clone() const { 1.63 + return new StringMatcher(*this); 1.64 +} 1.65 + 1.66 +/** 1.67 + * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 1.68 + * and return the pointer. 1.69 + */ 1.70 +UnicodeMatcher* StringMatcher::toMatcher() const { 1.71 + StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 1.72 + UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this); 1.73 + 1.74 + return nonconst_base; 1.75 +} 1.76 + 1.77 +/** 1.78 + * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 1.79 + * and return the pointer. 1.80 + */ 1.81 +UnicodeReplacer* StringMatcher::toReplacer() const { 1.82 + StringMatcher *nonconst_this = const_cast<StringMatcher *>(this); 1.83 + UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this); 1.84 + 1.85 + return nonconst_base; 1.86 +} 1.87 + 1.88 +/** 1.89 + * Implement UnicodeMatcher 1.90 + */ 1.91 +UMatchDegree StringMatcher::matches(const Replaceable& text, 1.92 + int32_t& offset, 1.93 + int32_t limit, 1.94 + UBool incremental) { 1.95 + int32_t i; 1.96 + int32_t cursor = offset; 1.97 + if (limit < cursor) { 1.98 + // Match in the reverse direction 1.99 + for (i=pattern.length()-1; i>=0; --i) { 1.100 + UChar keyChar = pattern.charAt(i); 1.101 + UnicodeMatcher* subm = data->lookupMatcher(keyChar); 1.102 + if (subm == 0) { 1.103 + if (cursor > limit && 1.104 + keyChar == text.charAt(cursor)) { 1.105 + --cursor; 1.106 + } else { 1.107 + return U_MISMATCH; 1.108 + } 1.109 + } else { 1.110 + UMatchDegree m = 1.111 + subm->matches(text, cursor, limit, incremental); 1.112 + if (m != U_MATCH) { 1.113 + return m; 1.114 + } 1.115 + } 1.116 + } 1.117 + // Record the match position, but adjust for a normal 1.118 + // forward start, limit, and only if a prior match does not 1.119 + // exist -- we want the rightmost match. 1.120 + if (matchStart < 0) { 1.121 + matchStart = cursor+1; 1.122 + matchLimit = offset+1; 1.123 + } 1.124 + } else { 1.125 + for (i=0; i<pattern.length(); ++i) { 1.126 + if (incremental && cursor == limit) { 1.127 + // We've reached the context limit without a mismatch and 1.128 + // without completing our match. 1.129 + return U_PARTIAL_MATCH; 1.130 + } 1.131 + UChar keyChar = pattern.charAt(i); 1.132 + UnicodeMatcher* subm = data->lookupMatcher(keyChar); 1.133 + if (subm == 0) { 1.134 + // Don't need the cursor < limit check if 1.135 + // incremental is TRUE (because it's done above); do need 1.136 + // it otherwise. 1.137 + if (cursor < limit && 1.138 + keyChar == text.charAt(cursor)) { 1.139 + ++cursor; 1.140 + } else { 1.141 + return U_MISMATCH; 1.142 + } 1.143 + } else { 1.144 + UMatchDegree m = 1.145 + subm->matches(text, cursor, limit, incremental); 1.146 + if (m != U_MATCH) { 1.147 + return m; 1.148 + } 1.149 + } 1.150 + } 1.151 + // Record the match position 1.152 + matchStart = offset; 1.153 + matchLimit = cursor; 1.154 + } 1.155 + 1.156 + offset = cursor; 1.157 + return U_MATCH; 1.158 +} 1.159 + 1.160 +/** 1.161 + * Implement UnicodeMatcher 1.162 + */ 1.163 +UnicodeString& StringMatcher::toPattern(UnicodeString& result, 1.164 + UBool escapeUnprintable) const 1.165 +{ 1.166 + result.truncate(0); 1.167 + UnicodeString str, quoteBuf; 1.168 + if (segmentNumber > 0) { 1.169 + result.append((UChar)40); /*(*/ 1.170 + } 1.171 + for (int32_t i=0; i<pattern.length(); ++i) { 1.172 + UChar keyChar = pattern.charAt(i); 1.173 + const UnicodeMatcher* m = data->lookupMatcher(keyChar); 1.174 + if (m == 0) { 1.175 + ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf); 1.176 + } else { 1.177 + ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable), 1.178 + TRUE, escapeUnprintable, quoteBuf); 1.179 + } 1.180 + } 1.181 + if (segmentNumber > 0) { 1.182 + result.append((UChar)41); /*)*/ 1.183 + } 1.184 + // Flush quoteBuf out to result 1.185 + ICU_Utility::appendToRule(result, -1, 1.186 + TRUE, escapeUnprintable, quoteBuf); 1.187 + return result; 1.188 +} 1.189 + 1.190 +/** 1.191 + * Implement UnicodeMatcher 1.192 + */ 1.193 +UBool StringMatcher::matchesIndexValue(uint8_t v) const { 1.194 + if (pattern.length() == 0) { 1.195 + return TRUE; 1.196 + } 1.197 + UChar32 c = pattern.char32At(0); 1.198 + const UnicodeMatcher *m = data->lookupMatcher(c); 1.199 + return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v); 1.200 +} 1.201 + 1.202 +/** 1.203 + * Implement UnicodeMatcher 1.204 + */ 1.205 +void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const { 1.206 + UChar32 ch; 1.207 + for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) { 1.208 + ch = pattern.char32At(i); 1.209 + const UnicodeMatcher* matcher = data->lookupMatcher(ch); 1.210 + if (matcher == NULL) { 1.211 + toUnionTo.add(ch); 1.212 + } else { 1.213 + matcher->addMatchSetTo(toUnionTo); 1.214 + } 1.215 + } 1.216 +} 1.217 + 1.218 +/** 1.219 + * UnicodeReplacer API 1.220 + */ 1.221 +int32_t StringMatcher::replace(Replaceable& text, 1.222 + int32_t start, 1.223 + int32_t limit, 1.224 + int32_t& /*cursor*/) { 1.225 + 1.226 + int32_t outLen = 0; 1.227 + 1.228 + // Copy segment with out-of-band data 1.229 + int32_t dest = limit; 1.230 + // If there was no match, that means that a quantifier 1.231 + // matched zero-length. E.g., x (a)* y matched "xy". 1.232 + if (matchStart >= 0) { 1.233 + if (matchStart != matchLimit) { 1.234 + text.copy(matchStart, matchLimit, dest); 1.235 + outLen = matchLimit - matchStart; 1.236 + } 1.237 + } 1.238 + 1.239 + text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text 1.240 + 1.241 + return outLen; 1.242 +} 1.243 + 1.244 +/** 1.245 + * UnicodeReplacer API 1.246 + */ 1.247 +UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule, 1.248 + UBool /*escapeUnprintable*/) const { 1.249 + // assert(segmentNumber > 0); 1.250 + rule.truncate(0); 1.251 + rule.append((UChar)0x0024 /*$*/); 1.252 + ICU_Utility::appendNumber(rule, segmentNumber, 10, 1); 1.253 + return rule; 1.254 +} 1.255 + 1.256 +/** 1.257 + * Remove any match info. This must be called before performing a 1.258 + * set of matches with this segment. 1.259 + */ 1.260 + void StringMatcher::resetMatch() { 1.261 + matchStart = matchLimit = -1; 1.262 +} 1.263 + 1.264 +/** 1.265 + * Union the set of all characters that may output by this object 1.266 + * into the given set. 1.267 + * @param toUnionTo the set into which to union the output characters 1.268 + */ 1.269 +void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const { 1.270 + // The output of this replacer varies; it is the source text between 1.271 + // matchStart and matchLimit. Since this varies depending on the 1.272 + // input text, we can't compute it here. We can either do nothing 1.273 + // or we can add ALL characters to the set. It's probably more useful 1.274 + // to do nothing. 1.275 +} 1.276 + 1.277 +/** 1.278 + * Implement UnicodeFunctor 1.279 + */ 1.280 +void StringMatcher::setData(const TransliterationRuleData* d) { 1.281 + data = d; 1.282 + int32_t i = 0; 1.283 + while (i<pattern.length()) { 1.284 + UChar32 c = pattern.char32At(i); 1.285 + UnicodeFunctor* f = data->lookup(c); 1.286 + if (f != NULL) { 1.287 + f->setData(data); 1.288 + } 1.289 + i += U16_LENGTH(c); 1.290 + } 1.291 +} 1.292 + 1.293 +U_NAMESPACE_END 1.294 + 1.295 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.296 + 1.297 +//eof