intl/icu/source/i18n/rbt_rule.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  **********************************************************************
     3  *   Copyright (C) 1999-2011, International Business Machines
     4  *   Corporation and others.  All Rights Reserved.
     5  **********************************************************************
     6  *   Date        Name        Description
     7  *   11/17/99    aliu        Creation.
     8  **********************************************************************
     9  */
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/rep.h"
    16 #include "unicode/unifilt.h"
    17 #include "unicode/uniset.h"
    18 #include "unicode/utf16.h"
    19 #include "rbt_rule.h"
    20 #include "rbt_data.h"
    21 #include "cmemory.h"
    22 #include "strmatch.h"
    23 #include "strrepl.h"
    24 #include "util.h"
    25 #include "putilimp.h"
    27 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "
    29 U_NAMESPACE_BEGIN
    31 /**
    32  * Construct a new rule with the given input, output text, and other
    33  * attributes.  A cursor position may be specified for the output text.
    34  * @param input input string, including key and optional ante and
    35  * post context
    36  * @param anteContextPos offset into input to end of ante context, or -1 if
    37  * none.  Must be <= input.length() if not -1.
    38  * @param postContextPos offset into input to start of post context, or -1
    39  * if none.  Must be <= input.length() if not -1, and must be >=
    40  * anteContextPos.
    41  * @param output output string
    42  * @param cursorPosition offset into output at which cursor is located, or -1 if
    43  * none.  If less than zero, then the cursor is placed after the
    44  * <code>output</code>; that is, -1 is equivalent to
    45  * <code>output.length()</code>.  If greater than
    46  * <code>output.length()</code> then an exception is thrown.
    47  * @param segs array of UnicodeFunctors corresponding to input pattern
    48  * segments, or null if there are none.  The array itself is adopted,
    49  * but the pointers within it are not.
    50  * @param segsCount number of elements in segs[]
    51  * @param anchorStart TRUE if the the rule is anchored on the left to
    52  * the context start
    53  * @param anchorEnd TRUE if the rule is anchored on the right to the
    54  * context limit
    55  */
    56 TransliterationRule::TransliterationRule(const UnicodeString& input,
    57                                          int32_t anteContextPos, int32_t postContextPos,
    58                                          const UnicodeString& outputStr,
    59                                          int32_t cursorPosition, int32_t cursorOffset,
    60                                          UnicodeFunctor** segs,
    61                                          int32_t segsCount,
    62                                          UBool anchorStart, UBool anchorEnd,
    63                                          const TransliterationRuleData* theData,
    64                                          UErrorCode& status) :
    65     UMemory(),
    66     segments(0),
    67     data(theData) {
    69     if (U_FAILURE(status)) {
    70         return;
    71     }
    72     // Do range checks only when warranted to save time
    73     if (anteContextPos < 0) {
    74         anteContextLength = 0;
    75     } else {
    76         if (anteContextPos > input.length()) {
    77             // throw new IllegalArgumentException("Invalid ante context");
    78             status = U_ILLEGAL_ARGUMENT_ERROR;
    79             return;
    80         }
    81         anteContextLength = anteContextPos;
    82     }
    83     if (postContextPos < 0) {
    84         keyLength = input.length() - anteContextLength;
    85     } else {
    86         if (postContextPos < anteContextLength ||
    87             postContextPos > input.length()) {
    88             // throw new IllegalArgumentException("Invalid post context");
    89             status = U_ILLEGAL_ARGUMENT_ERROR;
    90             return;
    91         }
    92         keyLength = postContextPos - anteContextLength;
    93     }
    94     if (cursorPosition < 0) {
    95         cursorPosition = outputStr.length();
    96     } else if (cursorPosition > outputStr.length()) {
    97         // throw new IllegalArgumentException("Invalid cursor position");
    98         status = U_ILLEGAL_ARGUMENT_ERROR;
    99         return;
   100     }
   101     // We don't validate the segments array.  The caller must
   102     // guarantee that the segments are well-formed (that is, that
   103     // all $n references in the output refer to indices of this
   104     // array, and that no array elements are null).
   105     this->segments = segs;
   106     this->segmentsCount = segsCount;
   108     pattern = input;
   109     flags = 0;
   110     if (anchorStart) {
   111         flags |= ANCHOR_START;
   112     }
   113     if (anchorEnd) {
   114         flags |= ANCHOR_END;
   115     }
   117     anteContext = NULL;
   118     if (anteContextLength > 0) {
   119         anteContext = new StringMatcher(pattern, 0, anteContextLength,
   120                                         FALSE, *data);
   121         /* test for NULL */
   122         if (anteContext == 0) {
   123             status = U_MEMORY_ALLOCATION_ERROR;
   124             return;
   125         }
   126     }
   128     key = NULL;
   129     if (keyLength > 0) {
   130         key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,
   131                                 FALSE, *data);
   132         /* test for NULL */
   133         if (key == 0) {
   134             status = U_MEMORY_ALLOCATION_ERROR;
   135             return;
   136         }
   137     }
   139     int32_t postContextLength = pattern.length() - keyLength - anteContextLength;
   140     postContext = NULL;
   141     if (postContextLength > 0) {
   142         postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),
   143                                         FALSE, *data);
   144         /* test for NULL */
   145         if (postContext == 0) {
   146             status = U_MEMORY_ALLOCATION_ERROR;
   147             return;
   148         }
   149     }
   151     this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);
   152     /* test for NULL */
   153     if (this->output == 0) {
   154         status = U_MEMORY_ALLOCATION_ERROR;
   155         return;
   156     }
   157 }
   159 /**
   160  * Copy constructor.
   161  */
   162 TransliterationRule::TransliterationRule(TransliterationRule& other) :
   163     UMemory(other),
   164     anteContext(NULL),
   165     key(NULL),
   166     postContext(NULL),
   167     pattern(other.pattern),
   168     anteContextLength(other.anteContextLength),
   169     keyLength(other.keyLength),
   170     flags(other.flags),
   171     data(other.data) {
   173     segments = NULL;
   174     segmentsCount = 0;
   175     if (other.segmentsCount > 0) {
   176         segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));
   177         uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));
   178     }
   180     if (other.anteContext != NULL) {
   181         anteContext = (StringMatcher*) other.anteContext->clone();
   182     }
   183     if (other.key != NULL) {
   184         key = (StringMatcher*) other.key->clone();
   185     }
   186     if (other.postContext != NULL) {
   187         postContext = (StringMatcher*) other.postContext->clone();
   188     }
   189     output = other.output->clone();
   190 }
   192 TransliterationRule::~TransliterationRule() {
   193     uprv_free(segments);
   194     delete anteContext;
   195     delete key;
   196     delete postContext;
   197     delete output;
   198 }
   200 /**
   201  * Return the preceding context length.  This method is needed to
   202  * support the <code>Transliterator</code> method
   203  * <code>getMaximumContextLength()</code>.  Internally, this is
   204  * implemented as the anteContextLength, optionally plus one if
   205  * there is a start anchor.  The one character anchor gap is
   206  * needed to make repeated incremental transliteration with
   207  * anchors work.
   208  */
   209 int32_t TransliterationRule::getContextLength(void) const {
   210     return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);
   211 }
   213 /**
   214  * Internal method.  Returns 8-bit index value for this rule.
   215  * This is the low byte of the first character of the key,
   216  * unless the first character of the key is a set.  If it's a
   217  * set, or otherwise can match multiple keys, the index value is -1.
   218  */
   219 int16_t TransliterationRule::getIndexValue() const {
   220     if (anteContextLength == pattern.length()) {
   221         // A pattern with just ante context {such as foo)>bar} can
   222         // match any key.
   223         return -1;
   224     }
   225     UChar32 c = pattern.char32At(anteContextLength);
   226     return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);
   227 }
   229 /**
   230  * Internal method.  Returns true if this rule matches the given
   231  * index value.  The index value is an 8-bit integer, 0..255,
   232  * representing the low byte of the first character of the key.
   233  * It matches this rule if it matches the first character of the
   234  * key, or if the first character of the key is a set, and the set
   235  * contains any character with a low byte equal to the index
   236  * value.  If the rule contains only ante context, as in foo)>bar,
   237  * then it will match any key.
   238  */
   239 UBool TransliterationRule::matchesIndexValue(uint8_t v) const {
   240     // Delegate to the key, or if there is none, to the postContext.
   241     // If there is neither then we match any key; return true.
   242     UnicodeMatcher *m = (key != NULL) ? key : postContext;
   243     return (m != NULL) ? m->matchesIndexValue(v) : TRUE;
   244 }
   246 /**
   247  * Return true if this rule masks another rule.  If r1 masks r2 then
   248  * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
   249  * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
   250  * "[c]a>x" masks "[dc]a>y".
   251  */
   252 UBool TransliterationRule::masks(const TransliterationRule& r2) const {
   253     /* Rule r1 masks rule r2 if the string formed of the
   254      * antecontext, key, and postcontext overlaps in the following
   255      * way:
   256      *
   257      * r1:      aakkkpppp
   258      * r2:     aaakkkkkpppp
   259      *            ^
   260      * 
   261      * The strings must be aligned at the first character of the
   262      * key.  The length of r1 to the left of the alignment point
   263      * must be <= the length of r2 to the left; ditto for the
   264      * right.  The characters of r1 must equal (or be a superset
   265      * of) the corresponding characters of r2.  The superset
   266      * operation should be performed to check for UnicodeSet
   267      * masking.
   268      *
   269      * Anchors:  Two patterns that differ only in anchors only
   270      * mask one another if they are exactly equal, and r2 has
   271      * all the anchors r1 has (optionally, plus some).  Here Y
   272      * means the row masks the column, N means it doesn't.
   273      *
   274      *         ab   ^ab    ab$  ^ab$
   275      *   ab    Y     Y     Y     Y
   276      *  ^ab    N     Y     N     Y
   277      *   ab$   N     N     Y     Y
   278      *  ^ab$   N     N     N     Y
   279      *
   280      * Post context: {a}b masks ab, but not vice versa, since {a}b
   281      * matches everything ab matches, and {a}b matches {|a|}b but ab
   282      * does not.  Pre context is different (a{b} does not align with
   283      * ab).
   284      */
   286     /* LIMITATION of the current mask algorithm: Some rule
   287      * maskings are currently not detected.  For example,
   288      * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO
   289      */
   291     int32_t len = pattern.length();
   292     int32_t left = anteContextLength;
   293     int32_t left2 = r2.anteContextLength;
   294     int32_t right = len - left;
   295     int32_t right2 = r2.pattern.length() - left2;
   296     int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);
   298     // TODO Clean this up -- some logic might be combinable with the
   299     // next statement.
   301     // Test for anchor masking
   302     if (left == left2 && right == right2 &&
   303         keyLength <= r2.keyLength &&
   304         0 == cachedCompare) {
   305         // The following boolean logic implements the table above
   306         return (flags == r2.flags) ||
   307             (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||
   308             ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));
   309     }
   311     return left <= left2 &&
   312         (right < right2 ||
   313          (right == right2 && keyLength <= r2.keyLength)) &&
   314          (0 == cachedCompare);
   315 }
   317 static inline int32_t posBefore(const Replaceable& str, int32_t pos) {
   318     return (pos > 0) ?
   319         pos - U16_LENGTH(str.char32At(pos-1)) :
   320         pos - 1;
   321 }
   323 static inline int32_t posAfter(const Replaceable& str, int32_t pos) {
   324     return (pos >= 0 && pos < str.length()) ?
   325         pos + U16_LENGTH(str.char32At(pos)) :
   326         pos + 1;
   327 }
   329 /**
   330  * Attempt a match and replacement at the given position.  Return
   331  * the degree of match between this rule and the given text.  The
   332  * degree of match may be mismatch, a partial match, or a full
   333  * match.  A mismatch means at least one character of the text
   334  * does not match the context or key.  A partial match means some
   335  * context and key characters match, but the text is not long
   336  * enough to match all of them.  A full match means all context
   337  * and key characters match.
   338  * 
   339  * If a full match is obtained, perform a replacement, update pos,
   340  * and return U_MATCH.  Otherwise both text and pos are unchanged.
   341  * 
   342  * @param text the text
   343  * @param pos the position indices
   344  * @param incremental if TRUE, test for partial matches that may
   345  * be completed by additional text inserted at pos.limit.
   346  * @return one of <code>U_MISMATCH</code>,
   347  * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
   348  * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
   349  */
   350 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,
   351                                                   UTransPosition& pos,
   352                                                   UBool incremental) const {
   353     // Matching and replacing are done in one method because the
   354     // replacement operation needs information obtained during the
   355     // match.  Another way to do this is to have the match method
   356     // create a match result struct with relevant offsets, and to pass
   357     // this into the replace method.
   359     // ============================ MATCH ===========================
   361     // Reset segment match data
   362     if (segments != NULL) {
   363         for (int32_t i=0; i<segmentsCount; ++i) {
   364             ((StringMatcher*) segments[i])->resetMatch();
   365         }
   366     }
   368 //    int32_t lenDelta, keyLimit;
   369     int32_t keyLimit;
   371     // ------------------------ Ante Context ------------------------
   373     // A mismatch in the ante context, or with the start anchor,
   374     // is an outright U_MISMATCH regardless of whether we are
   375     // incremental or not.
   376     int32_t oText; // offset into 'text'
   377 //    int32_t newStart = 0;
   378     int32_t minOText;
   380     // Note (1): We process text in 16-bit code units, rather than
   381     // 32-bit code points.  This works because stand-ins are
   382     // always in the BMP and because we are doing a literal match
   383     // operation, which can be done 16-bits at a time.
   385     int32_t anteLimit = posBefore(text, pos.contextStart);
   387     UMatchDegree match;
   389     // Start reverse match at char before pos.start
   390     oText = posBefore(text, pos.start);
   392     if (anteContext != NULL) {
   393         match = anteContext->matches(text, oText, anteLimit, FALSE);
   394         if (match != U_MATCH) {
   395             return U_MISMATCH;
   396         }
   397     }
   399     minOText = posAfter(text, oText);
   401     // ------------------------ Start Anchor ------------------------
   403     if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {
   404         return U_MISMATCH;
   405     }
   407     // -------------------- Key and Post Context --------------------
   409     oText = pos.start;
   411     if (key != NULL) {
   412         match = key->matches(text, oText, pos.limit, incremental);
   413         if (match != U_MATCH) {
   414             return match;
   415         }
   416     }
   418     keyLimit = oText;
   420     if (postContext != NULL) {
   421         if (incremental && keyLimit == pos.limit) {
   422             // The key matches just before pos.limit, and there is
   423             // a postContext.  Since we are in incremental mode,
   424             // we must assume more characters may be inserted at
   425             // pos.limit -- this is a partial match.
   426             return U_PARTIAL_MATCH;
   427         }
   429         match = postContext->matches(text, oText, pos.contextLimit, incremental);
   430         if (match != U_MATCH) {
   431             return match;
   432         }
   433     }
   435     // ------------------------- Stop Anchor ------------------------
   437     if (((flags & ANCHOR_END)) != 0) {
   438         if (oText != pos.contextLimit) {
   439             return U_MISMATCH;
   440         }
   441         if (incremental) {
   442             return U_PARTIAL_MATCH;
   443         }
   444     }
   446     // =========================== REPLACE ==========================
   448     // We have a full match.  The key is between pos.start and
   449     // keyLimit.
   451     int32_t newStart;
   452     int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);
   453     int32_t lenDelta = newLength - (keyLimit - pos.start);
   455     oText += lenDelta;
   456     pos.limit += lenDelta;
   457     pos.contextLimit += lenDelta;
   458     // Restrict new value of start to [minOText, min(oText, pos.limit)].
   459     pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));
   460     return U_MATCH;
   461 }
   463 /**
   464  * Create a source string that represents this rule.  Append it to the
   465  * given string.
   466  */
   467 UnicodeString& TransliterationRule::toRule(UnicodeString& rule,
   468                                            UBool escapeUnprintable) const {
   470     // Accumulate special characters (and non-specials following them)
   471     // into quoteBuf.  Append quoteBuf, within single quotes, when
   472     // a non-quoted element must be inserted.
   473     UnicodeString str, quoteBuf;
   475     // Do not emit the braces '{' '}' around the pattern if there
   476     // is neither anteContext nor postContext.
   477     UBool emitBraces =
   478         (anteContext != NULL) || (postContext != NULL);
   480     // Emit start anchor
   481     if ((flags & ANCHOR_START) != 0) {
   482         rule.append((UChar)94/*^*/);
   483     }
   485     // Emit the input pattern
   486     ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);
   488     if (emitBraces) {
   489         ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);
   490     }
   492     ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);
   494     if (emitBraces) {
   495         ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);
   496     }
   498     ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);
   500     // Emit end anchor
   501     if ((flags & ANCHOR_END) != 0) {
   502         rule.append((UChar)36/*$*/);
   503     }
   505     ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf);
   507     // Emit the output pattern
   509     ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),
   510                               TRUE, escapeUnprintable, quoteBuf);
   512     ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);
   514     return rule;
   515 }
   517 void TransliterationRule::setData(const TransliterationRuleData* d) {
   518     data = d;
   519     if (anteContext != NULL) anteContext->setData(d);
   520     if (postContext != NULL) postContext->setData(d);
   521     if (key != NULL) key->setData(d);
   522     // assert(output != NULL);
   523     output->setData(d);
   524     // Don't have to do segments since they are in the context or key
   525 }
   527 /**
   528  * Union the set of all characters that may be modified by this rule
   529  * into the given set.
   530  */
   531 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {
   532     int32_t limit = anteContextLength + keyLength;
   533     for (int32_t i=anteContextLength; i<limit; ) {
   534         UChar32 ch = pattern.char32At(i);
   535         i += U16_LENGTH(ch);
   536         const UnicodeMatcher* matcher = data->lookupMatcher(ch);
   537         if (matcher == NULL) {
   538             toUnionTo.add(ch);
   539         } else {
   540             matcher->addMatchSetTo(toUnionTo);
   541         }
   542     }
   543 }
   545 /**
   546  * Union the set of all characters that may be emitted by this rule
   547  * into the given set.
   548  */
   549 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {
   550     output->toReplacer()->addReplacementSetTo(toUnionTo);
   551 }
   553 U_NAMESPACE_END
   555 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   557 //eof

mercurial