intl/icu/source/i18n/strmatch.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 **********************************************************************
     3 *   Copyright (c) 2001-2012, International Business Machines Corporation
     4 *   and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   Date        Name        Description
     7 *   07/23/01    aliu        Creation.
     8 **********************************************************************
     9 */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "strmatch.h"
    16 #include "rbt_data.h"
    17 #include "util.h"
    18 #include "unicode/uniset.h"
    19 #include "unicode/utf16.h"
    21 U_NAMESPACE_BEGIN
    23 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringMatcher)
    25 StringMatcher::StringMatcher(const UnicodeString& theString,
    26                              int32_t start,
    27                              int32_t limit,
    28                              int32_t segmentNum,
    29                              const TransliterationRuleData& theData) :
    30     data(&theData),
    31     segmentNumber(segmentNum),
    32     matchStart(-1),
    33     matchLimit(-1)
    34 {
    35     theString.extractBetween(start, limit, pattern);
    36 }
    38 StringMatcher::StringMatcher(const StringMatcher& o) :
    39     UnicodeFunctor(o),
    40     UnicodeMatcher(o),
    41     UnicodeReplacer(o),
    42     pattern(o.pattern),
    43     data(o.data),
    44     segmentNumber(o.segmentNumber),
    45     matchStart(o.matchStart),
    46     matchLimit(o.matchLimit)
    47 {
    48 }
    50 /**
    51  * Destructor
    52  */
    53 StringMatcher::~StringMatcher() {
    54 }
    56 /**
    57  * Implement UnicodeFunctor
    58  */
    59 UnicodeFunctor* StringMatcher::clone() const {
    60     return new StringMatcher(*this);
    61 }
    63 /**
    64  * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
    65  * and return the pointer.
    66  */
    67 UnicodeMatcher* StringMatcher::toMatcher() const {
    68   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
    69   UnicodeMatcher *nonconst_base = static_cast<UnicodeMatcher *>(nonconst_this);
    71   return nonconst_base;
    72 }
    74 /**
    75  * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
    76  * and return the pointer.
    77  */
    78 UnicodeReplacer* StringMatcher::toReplacer() const {
    79   StringMatcher  *nonconst_this = const_cast<StringMatcher *>(this);
    80   UnicodeReplacer *nonconst_base = static_cast<UnicodeReplacer *>(nonconst_this);
    82   return nonconst_base;
    83 }
    85 /**
    86  * Implement UnicodeMatcher
    87  */
    88 UMatchDegree StringMatcher::matches(const Replaceable& text,
    89                                     int32_t& offset,
    90                                     int32_t limit,
    91                                     UBool incremental) {
    92     int32_t i;
    93     int32_t cursor = offset;
    94     if (limit < cursor) {
    95         // Match in the reverse direction
    96         for (i=pattern.length()-1; i>=0; --i) {
    97             UChar keyChar = pattern.charAt(i);
    98             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
    99             if (subm == 0) {
   100                 if (cursor > limit &&
   101                     keyChar == text.charAt(cursor)) {
   102                     --cursor;
   103                 } else {
   104                     return U_MISMATCH;
   105                 }
   106             } else {
   107                 UMatchDegree m =
   108                     subm->matches(text, cursor, limit, incremental);
   109                 if (m != U_MATCH) {
   110                     return m;
   111                 }
   112             }
   113         }
   114         // Record the match position, but adjust for a normal
   115         // forward start, limit, and only if a prior match does not
   116         // exist -- we want the rightmost match.
   117         if (matchStart < 0) {
   118             matchStart = cursor+1;
   119             matchLimit = offset+1;
   120         }
   121     } else {
   122         for (i=0; i<pattern.length(); ++i) {
   123             if (incremental && cursor == limit) {
   124                 // We've reached the context limit without a mismatch and
   125                 // without completing our match.
   126                 return U_PARTIAL_MATCH;
   127             }
   128             UChar keyChar = pattern.charAt(i);
   129             UnicodeMatcher* subm = data->lookupMatcher(keyChar);
   130             if (subm == 0) {
   131                 // Don't need the cursor < limit check if
   132                 // incremental is TRUE (because it's done above); do need
   133                 // it otherwise.
   134                 if (cursor < limit &&
   135                     keyChar == text.charAt(cursor)) {
   136                     ++cursor;
   137                 } else {
   138                     return U_MISMATCH;
   139                 }
   140             } else {
   141                 UMatchDegree m =
   142                     subm->matches(text, cursor, limit, incremental);
   143                 if (m != U_MATCH) {
   144                     return m;
   145                 }
   146             }
   147         }
   148         // Record the match position
   149         matchStart = offset;
   150         matchLimit = cursor;
   151     }
   153     offset = cursor;
   154     return U_MATCH;
   155 }
   157 /**
   158  * Implement UnicodeMatcher
   159  */
   160 UnicodeString& StringMatcher::toPattern(UnicodeString& result,
   161                                         UBool escapeUnprintable) const
   162 {
   163     result.truncate(0);
   164     UnicodeString str, quoteBuf;
   165     if (segmentNumber > 0) {
   166         result.append((UChar)40); /*(*/
   167     }
   168     for (int32_t i=0; i<pattern.length(); ++i) {
   169         UChar keyChar = pattern.charAt(i);
   170         const UnicodeMatcher* m = data->lookupMatcher(keyChar);
   171         if (m == 0) {
   172             ICU_Utility::appendToRule(result, keyChar, FALSE, escapeUnprintable, quoteBuf);
   173         } else {
   174             ICU_Utility::appendToRule(result, m->toPattern(str, escapeUnprintable),
   175                          TRUE, escapeUnprintable, quoteBuf);
   176         }
   177     }
   178     if (segmentNumber > 0) {
   179         result.append((UChar)41); /*)*/
   180     }
   181     // Flush quoteBuf out to result
   182     ICU_Utility::appendToRule(result, -1,
   183                               TRUE, escapeUnprintable, quoteBuf);
   184     return result;
   185 }
   187 /**
   188  * Implement UnicodeMatcher
   189  */
   190 UBool StringMatcher::matchesIndexValue(uint8_t v) const {
   191     if (pattern.length() == 0) {
   192         return TRUE;
   193     }
   194     UChar32 c = pattern.char32At(0);
   195     const UnicodeMatcher *m = data->lookupMatcher(c);
   196     return (m == 0) ? ((c & 0xFF) == v) : m->matchesIndexValue(v);
   197 }
   199 /**
   200  * Implement UnicodeMatcher
   201  */
   202 void StringMatcher::addMatchSetTo(UnicodeSet& toUnionTo) const {
   203     UChar32 ch;
   204     for (int32_t i=0; i<pattern.length(); i+=U16_LENGTH(ch)) {
   205         ch = pattern.char32At(i);
   206         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
   207         if (matcher == NULL) {
   208             toUnionTo.add(ch);
   209         } else {
   210             matcher->addMatchSetTo(toUnionTo);
   211         }
   212     }
   213 }
   215 /**
   216  * UnicodeReplacer API
   217  */
   218 int32_t StringMatcher::replace(Replaceable& text,
   219                                int32_t start,
   220                                int32_t limit,
   221                                int32_t& /*cursor*/) {
   223     int32_t outLen = 0;
   225     // Copy segment with out-of-band data
   226     int32_t dest = limit;
   227     // If there was no match, that means that a quantifier
   228     // matched zero-length.  E.g., x (a)* y matched "xy".
   229     if (matchStart >= 0) {
   230         if (matchStart != matchLimit) {
   231             text.copy(matchStart, matchLimit, dest);
   232             outLen = matchLimit - matchStart;
   233         }
   234     }
   236     text.handleReplaceBetween(start, limit, UnicodeString()); // delete original text
   238     return outLen;
   239 }
   241 /**
   242  * UnicodeReplacer API
   243  */
   244 UnicodeString& StringMatcher::toReplacerPattern(UnicodeString& rule,
   245                                                 UBool /*escapeUnprintable*/) const {
   246     // assert(segmentNumber > 0);
   247     rule.truncate(0);
   248     rule.append((UChar)0x0024 /*$*/);
   249     ICU_Utility::appendNumber(rule, segmentNumber, 10, 1);
   250     return rule;
   251 }
   253 /**
   254  * Remove any match info.  This must be called before performing a
   255  * set of matches with this segment.
   256  */
   257  void StringMatcher::resetMatch() {
   258     matchStart = matchLimit = -1;
   259 }
   261 /**
   262  * Union the set of all characters that may output by this object
   263  * into the given set.
   264  * @param toUnionTo the set into which to union the output characters
   265  */
   266 void StringMatcher::addReplacementSetTo(UnicodeSet& /*toUnionTo*/) const {
   267     // The output of this replacer varies; it is the source text between
   268     // matchStart and matchLimit.  Since this varies depending on the
   269     // input text, we can't compute it here.  We can either do nothing
   270     // or we can add ALL characters to the set.  It's probably more useful
   271     // to do nothing.
   272 }
   274 /**
   275  * Implement UnicodeFunctor
   276  */
   277 void StringMatcher::setData(const TransliterationRuleData* d) {
   278     data = d;
   279     int32_t i = 0;
   280     while (i<pattern.length()) {
   281         UChar32 c = pattern.char32At(i);
   282         UnicodeFunctor* f = data->lookup(c);
   283         if (f != NULL) {
   284             f->setData(data);
   285         }
   286         i += U16_LENGTH(c);
   287     }
   288 }
   290 U_NAMESPACE_END
   292 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   294 //eof

mercurial