intl/icu/source/i18n/strmatch.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (c) 2001-2012, International Business Machines Corporation
michael@0 4 * and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 07/23/01 aliu Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12
michael@0 13 #if !UCONFIG_NO_TRANSLITERATION
michael@0 14
michael@0 15 #include "strmatch.h"
michael@0 16 #include "rbt_data.h"
michael@0 17 #include "util.h"
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/utf16.h"
michael@0 20
michael@0 21 U_NAMESPACE_BEGIN
michael@0 22
michael@0 23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
michael@0 24
michael@0 25 StringMatcher::StringMatcher(const UnicodeString& theString,
michael@0 26 int32_t start,
michael@0 27 int32_t limit,
michael@0 28 int32_t segmentNum,
michael@0 29 const TransliterationRuleData& theData) :
michael@0 30 data(&theData),
michael@0 31 segmentNumber(segmentNum),
michael@0 32 matchStart(-1),
michael@0 33 matchLimit(-1)
michael@0 34 {
michael@0 35 theString.extractBetween(start, limit, pattern);
michael@0 36 }
michael@0 37
michael@0 38 StringMatcher::StringMatcher(const StringMatcher& o) :
michael@0 39 UnicodeFunctor(o),
michael@0 40 UnicodeMatcher(o),
michael@0 41 UnicodeReplacer(o),
michael@0 42 pattern(o.pattern),
michael@0 43 data(o.data),
michael@0 44 segmentNumber(o.segmentNumber),
michael@0 45 matchStart(o.matchStart),
michael@0 46 matchLimit(o.matchLimit)
michael@0 47 {
michael@0 48 }
michael@0 49
michael@0 50 /**
michael@0 51 * Destructor
michael@0 52 */
michael@0 53 StringMatcher::~StringMatcher() {
michael@0 54 }
michael@0 55
michael@0 56 /**
michael@0 57 * Implement UnicodeFunctor
michael@0 58 */
michael@0 59 UnicodeFunctor* StringMatcher::clone() const {
michael@0 60 return new StringMatcher(*this);
michael@0 61 }
michael@0 62
michael@0 63 /**
michael@0 64 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
michael@0 65 * and return the pointer.
michael@0 66 */
michael@0 67 UnicodeMatcher* StringMatcher::toMatcher() const {
michael@0 68 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
michael@0 69 UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
michael@0 70
michael@0 71 return nonconst_base;
michael@0 72 }
michael@0 73
michael@0 74 /**
michael@0 75 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
michael@0 76 * and return the pointer.
michael@0 77 */
michael@0 78 UnicodeReplacer* StringMatcher::toReplacer() const {
michael@0 79 StringMatcher *nonconst_this = const_cast<StringMatcher *>(this);
michael@0 80 UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
michael@0 81
michael@0 82 return nonconst_base;
michael@0 83 }
michael@0 84
michael@0 85 /**
michael@0 86 * Implement UnicodeMatcher
michael@0 87 */
michael@0 88 UMatchDegree StringMatcher::matches(const Replaceable& text,
michael@0 89 int32_t& offset,
michael@0 90 int32_t limit,
michael@0 91 UBool incremental) {
michael@0 92 int32_t i;
michael@0 93 int32_t cursor = offset;
michael@0 94 if (limit < cursor) {
michael@0 95 // Match in the reverse direction
michael@0 96 for (i=pattern.length()-1; i>=0; --i) {
michael@0 97 UChar keyChar = pattern.charAt(i);
michael@0 98 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
michael@0 99 if (subm == 0) {
michael@0 100 if (cursor > limit &&
michael@0 101 keyChar == text.charAt(cursor)) {
michael@0 102 --cursor;
michael@0 103 } else {
michael@0 104 return U_MISMATCH;
michael@0 105 }
michael@0 106 } else {
michael@0 107 UMatchDegree m =
michael@0 108 subm->matches(text, cursor, limit, incremental);
michael@0 109 if (m != U_MATCH) {
michael@0 110 return m;
michael@0 111 }
michael@0 112 }
michael@0 113 }
michael@0 114 // Record the match position, but adjust for a normal
michael@0 115 // forward start, limit, and only if a prior match does not
michael@0 116 // exist -- we want the rightmost match.
michael@0 117 if (matchStart < 0) {
michael@0 118 matchStart = cursor+1;
michael@0 119 matchLimit = offset+1;
michael@0 120 }
michael@0 121 } else {
michael@0 122 for (i=0; i<pattern.length(); ++i) {
michael@0 123 if (incremental && cursor == limit) {
michael@0 124 // We've reached the context limit without a mismatch and
michael@0 125 // without completing our match.
michael@0 126 return U_PARTIAL_MATCH;
michael@0 127 }
michael@0 128 UChar keyChar = pattern.charAt(i);
michael@0 129 UnicodeMatcher* subm = data->lookupMatcher(keyChar);
michael@0 130 if (subm == 0) {
michael@0 131 // Don't need the cursor < limit check if
michael@0 132 // incremental is TRUE (because it's done above); do need
michael@0 133 // it otherwise.
michael@0 134 if (cursor < limit &&
michael@0 135 keyChar == text.charAt(cursor)) {
michael@0 136 ++cursor;
michael@0 137 } else {
michael@0 138 return U_MISMATCH;
michael@0 139 }
michael@0 140 } else {
michael@0 141 UMatchDegree m =
michael@0 142 subm->matches(text, cursor, limit, incremental);
michael@0 143 if (m != U_MATCH) {
michael@0 144 return m;
michael@0 145 }
michael@0 146 }
michael@0 147 }
michael@0 148 // Record the match position
michael@0 149 matchStart = offset;
michael@0 150 matchLimit = cursor;
michael@0 151 }
michael@0 152
michael@0 153 offset = cursor;
michael@0 154 return U_MATCH;
michael@0 155 }
michael@0 156
michael@0 157 /**
michael@0 158 * Implement UnicodeMatcher
michael@0 159 */
michael@0 160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
michael@0 161 UBool escapeUnprintable) const
michael@0 162 {
michael@0 163 result.truncate(0);
michael@0 164 UnicodeString str, quoteBuf;
michael@0 165 if (segmentNumber > 0) {
michael@0 166 result.append((UChar)40); /*(*/
michael@0 167 }
michael@0 168 for (int32_t i=0; i<pattern.length(); ++i) {
michael@0 169 UChar keyChar = pattern.charAt(i);
michael@0 170 const UnicodeMatcher* m = data->lookupMatcher(keyChar);
michael@0 171 if (m == 0) {
michael@0 172 ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
michael@0 173 } else {
michael@0 174 ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
michael@0 175 TRUE, escapeUnprintable, quoteBuf);
michael@0 176 }
michael@0 177 }
michael@0 178 if (segmentNumber > 0) {
michael@0 179 result.append((UChar)41); /*)*/
michael@0 180 }
michael@0 181 // Flush quoteBuf out to result
michael@0 182 ICU_Utility::appendToRule(result, -1,
michael@0 183 TRUE, escapeUnprintable, quoteBuf);
michael@0 184 return result;
michael@0 185 }
michael@0 186
michael@0 187 /**
michael@0 188 * Implement UnicodeMatcher
michael@0 189 */
michael@0 190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
michael@0 191 if (pattern.length() == 0) {
michael@0 192 return TRUE;
michael@0 193 }
michael@0 194 UChar32 c = pattern.char32At(0);
michael@0 195 const UnicodeMatcher *m = data->lookupMatcher(c);
michael@0 196 return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
michael@0 197 }
michael@0 198
michael@0 199 /**
michael@0 200 * Implement UnicodeMatcher
michael@0 201 */
michael@0 202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
michael@0 203 UChar32 ch;
michael@0 204 for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
michael@0 205 ch = pattern.char32At(i);
michael@0 206 const UnicodeMatcher* matcher = data->lookupMatcher(ch);
michael@0 207 if (matcher == NULL) {
michael@0 208 toUnionTo.add(ch);
michael@0 209 } else {
michael@0 210 matcher->addMatchSetTo(toUnionTo);
michael@0 211 }
michael@0 212 }
michael@0 213 }
michael@0 214
michael@0 215 /**
michael@0 216 * UnicodeReplacer API
michael@0 217 */
michael@0 218 int32_t StringMatcher::replace(Replaceable& text,
michael@0 219 int32_t start,
michael@0 220 int32_t limit,
michael@0 221 int32_t& /*cursor*/) {
michael@0 222
michael@0 223 int32_t outLen = 0;
michael@0 224
michael@0 225 // Copy segment with out-of-band data
michael@0 226 int32_t dest = limit;
michael@0 227 // If there was no match, that means that a quantifier
michael@0 228 // matched zero-length. E.g., x (a)* y matched "xy".
michael@0 229 if (matchStart >= 0) {
michael@0 230 if (matchStart != matchLimit) {
michael@0 231 text.copy(matchStart, matchLimit, dest);
michael@0 232 outLen = matchLimit - matchStart;
michael@0 233 }
michael@0 234 }
michael@0 235
michael@0 236 text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
michael@0 237
michael@0 238 return outLen;
michael@0 239 }
michael@0 240
michael@0 241 /**
michael@0 242 * UnicodeReplacer API
michael@0 243 */
michael@0 244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
michael@0 245 UBool /*escapeUnprintable*/) const {
michael@0 246 // assert(segmentNumber > 0);
michael@0 247 rule.truncate(0);
michael@0 248 rule.append((UChar)0x0024 /*$*/);
michael@0 249 ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
michael@0 250 return rule;
michael@0 251 }
michael@0 252
michael@0 253 /**
michael@0 254 * Remove any match info. This must be called before performing a
michael@0 255 * set of matches with this segment.
michael@0 256 */
michael@0 257 void StringMatcher::resetMatch() {
michael@0 258 matchStart = matchLimit = -1;
michael@0 259 }
michael@0 260
michael@0 261 /**
michael@0 262 * Union the set of all characters that may output by this object
michael@0 263 * into the given set.
michael@0 264 * @param toUnionTo the set into which to union the output characters
michael@0 265 */
michael@0 266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
michael@0 267 // The output of this replacer varies; it is the source text between
michael@0 268 // matchStart and matchLimit. Since this varies depending on the
michael@0 269 // input text, we can't compute it here. We can either do nothing
michael@0 270 // or we can add ALL characters to the set. It's probably more useful
michael@0 271 // to do nothing.
michael@0 272 }
michael@0 273
michael@0 274 /**
michael@0 275 * Implement UnicodeFunctor
michael@0 276 */
michael@0 277 void StringMatcher::setData(const TransliterationRuleData* d) {
michael@0 278 data = d;
michael@0 279 int32_t i = 0;
michael@0 280 while (i<pattern.length()) {
michael@0 281 UChar32 c = pattern.char32At(i);
michael@0 282 UnicodeFunctor* f = data->lookup(c);
michael@0 283 if (f != NULL) {
michael@0 284 f->setData(data);
michael@0 285 }
michael@0 286 i += U16_LENGTH(c);
michael@0 287 }
michael@0 288 }
michael@0 289
michael@0 290 U_NAMESPACE_END
michael@0 291
michael@0 292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 293
michael@0 294 //eof

mercurial