The Tor Browser: intl/icu/source/i18n/rbt

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*

     2  **********************************************************************

     3  *   Copyright (C) 1999-2011, International Business Machines

     4  *   Corporation and others.  All Rights Reserved.

     5  **********************************************************************

     6  *   Date        Name        Description

     7  *   11/17/99    aliu        Creation.

     8  **********************************************************************

     9  */

    11 #include "unicode/utypes.h"

    13 #if !UCONFIG_NO_TRANSLITERATION

    15 #include "unicode/rep.h"

    16 #include "unicode/unifilt.h"

    17 #include "unicode/uniset.h"

    18 #include "unicode/utf16.h"

    19 #include "rbt_rule.h"

    20 #include "rbt_data.h"

    21 #include "cmemory.h"

    22 #include "strmatch.h"

    23 #include "strrepl.h"

    24 #include "util.h"

    25 #include "putilimp.h"

    27 static const UChar FORWARD_OP[] = {32,62,32,0}; // " > "

    29 U_NAMESPACE_BEGIN

    31 /**

    32  * Construct a new rule with the given input, output text, and other

    33  * attributes.  A cursor position may be specified for the output text.

    34  * @param input input string, including key and optional ante and

    35  * post context

    36  * @param anteContextPos offset into input to end of ante context, or -1 if

    37  * none.  Must be <= input.length() if not -1.

    38  * @param postContextPos offset into input to start of post context, or -1

    39  * if none.  Must be <= input.length() if not -1, and must be >=

    40  * anteContextPos.

    41  * @param output output string

    42  * @param cursorPosition offset into output at which cursor is located, or -1 if

    43  * none.  If less than zero, then the cursor is placed after the

    44  * <code>output</code>; that is, -1 is equivalent to

    45  * <code>output.length()</code>.  If greater than

    46  * <code>output.length()</code> then an exception is thrown.

    47  * @param segs array of UnicodeFunctors corresponding to input pattern

    48  * segments, or null if there are none.  The array itself is adopted,

    49  * but the pointers within it are not.

    50  * @param segsCount number of elements in segs[]

    51  * @param anchorStart TRUE if the the rule is anchored on the left to

    52  * the context start

    53  * @param anchorEnd TRUE if the rule is anchored on the right to the

    54  * context limit

    55  */

    56 TransliterationRule::TransliterationRule(const UnicodeString& input,

    57                                          int32_t anteContextPos, int32_t postContextPos,

    58                                          const UnicodeString& outputStr,

    59                                          int32_t cursorPosition, int32_t cursorOffset,

    60                                          UnicodeFunctor** segs,

    61                                          int32_t segsCount,

    62                                          UBool anchorStart, UBool anchorEnd,

    63                                          const TransliterationRuleData* theData,

    64                                          UErrorCode& status) :

    65     UMemory(),

    66     segments(0),

    67     data(theData) {

    69     if (U_FAILURE(status)) {

    70         return;

    71     }

    72     // Do range checks only when warranted to save time

    73     if (anteContextPos < 0) {

    74         anteContextLength = 0;

    75     } else {

    76         if (anteContextPos > input.length()) {

    77             // throw new IllegalArgumentException("Invalid ante context");

    78             status = U_ILLEGAL_ARGUMENT_ERROR;

    79             return;

    80         }

    81         anteContextLength = anteContextPos;

    82     }

    83     if (postContextPos < 0) {

    84         keyLength = input.length() - anteContextLength;

    85     } else {

    86         if (postContextPos < anteContextLength ||

    87             postContextPos > input.length()) {

    88             // throw new IllegalArgumentException("Invalid post context");

    89             status = U_ILLEGAL_ARGUMENT_ERROR;

    90             return;

    91         }

    92         keyLength = postContextPos - anteContextLength;

    93     }

    94     if (cursorPosition < 0) {

    95         cursorPosition = outputStr.length();

    96     } else if (cursorPosition > outputStr.length()) {

    97         // throw new IllegalArgumentException("Invalid cursor position");

    98         status = U_ILLEGAL_ARGUMENT_ERROR;

    99         return;

   100     }

   101     // We don't validate the segments array.  The caller must

   102     // guarantee that the segments are well-formed (that is, that

   103     // all $n references in the output refer to indices of this

   104     // array, and that no array elements are null).

   105     this->segments = segs;

   106     this->segmentsCount = segsCount;

   108     pattern = input;

   109     flags = 0;

   110     if (anchorStart) {

   111         flags |= ANCHOR_START;

   112     }

   113     if (anchorEnd) {

   114         flags |= ANCHOR_END;

   115     }

   117     anteContext = NULL;

   118     if (anteContextLength > 0) {

   119         anteContext = new StringMatcher(pattern, 0, anteContextLength,

   120                                         FALSE, *data);

   121         /* test for NULL */

   122         if (anteContext == 0) {

   123             status = U_MEMORY_ALLOCATION_ERROR;

   124             return;

   125         }

   126     }

   128     key = NULL;

   129     if (keyLength > 0) {

   130         key = new StringMatcher(pattern, anteContextLength, anteContextLength + keyLength,

   131                                 FALSE, *data);

   132         /* test for NULL */

   133         if (key == 0) {

   134             status = U_MEMORY_ALLOCATION_ERROR;

   135             return;

   136         }

   137     }

   139     int32_t postContextLength = pattern.length() - keyLength - anteContextLength;

   140     postContext = NULL;

   141     if (postContextLength > 0) {

   142         postContext = new StringMatcher(pattern, anteContextLength + keyLength, pattern.length(),

   143                                         FALSE, *data);

   144         /* test for NULL */

   145         if (postContext == 0) {

   146             status = U_MEMORY_ALLOCATION_ERROR;

   147             return;

   148         }

   149     }

   151     this->output = new StringReplacer(outputStr, cursorPosition + cursorOffset, data);

   152     /* test for NULL */

   153     if (this->output == 0) {

   154         status = U_MEMORY_ALLOCATION_ERROR;

   155         return;

   156     }

   157 }

   159 /**

   160  * Copy constructor.

   161  */

   162 TransliterationRule::TransliterationRule(TransliterationRule& other) :

   163     UMemory(other),

   164     anteContext(NULL),

   165     key(NULL),

   166     postContext(NULL),

   167     pattern(other.pattern),

   168     anteContextLength(other.anteContextLength),

   169     keyLength(other.keyLength),

   170     flags(other.flags),

   171     data(other.data) {

   173     segments = NULL;

   174     segmentsCount = 0;

   175     if (other.segmentsCount > 0) {

   176         segments = (UnicodeFunctor **)uprv_malloc(other.segmentsCount * sizeof(UnicodeFunctor *));

   177         uprv_memcpy(segments, other.segments, other.segmentsCount*sizeof(segments[0]));

   178     }

   180     if (other.anteContext != NULL) {

   181         anteContext = (StringMatcher*) other.anteContext->clone();

   182     }

   183     if (other.key != NULL) {

   184         key = (StringMatcher*) other.key->clone();

   185     }

   186     if (other.postContext != NULL) {

   187         postContext = (StringMatcher*) other.postContext->clone();

   188     }

   189     output = other.output->clone();

   190 }

   192 TransliterationRule::~TransliterationRule() {

   193     uprv_free(segments);

   194     delete anteContext;

   195     delete key;

   196     delete postContext;

   197     delete output;

   198 }

   200 /**

   201  * Return the preceding context length.  This method is needed to

   202  * support the <code>Transliterator</code> method

   203  * <code>getMaximumContextLength()</code>.  Internally, this is

   204  * implemented as the anteContextLength, optionally plus one if

   205  * there is a start anchor.  The one character anchor gap is

   206  * needed to make repeated incremental transliteration with

   207  * anchors work.

   208  */

   209 int32_t TransliterationRule::getContextLength(void) const {

   210     return anteContextLength + ((flags & ANCHOR_START) ? 1 : 0);

   211 }

   213 /**

   214  * Internal method.  Returns 8-bit index value for this rule.

   215  * This is the low byte of the first character of the key,

   216  * unless the first character of the key is a set.  If it's a

   217  * set, or otherwise can match multiple keys, the index value is -1.

   218  */

   219 int16_t TransliterationRule::getIndexValue() const {

   220     if (anteContextLength == pattern.length()) {

   221         // A pattern with just ante context {such as foo)>bar} can

   222         // match any key.

   223         return -1;

   224     }

   225     UChar32 c = pattern.char32At(anteContextLength);

   226     return (int16_t)(data->lookupMatcher(c) == NULL ? (c & 0xFF) : -1);

   227 }

   229 /**

   230  * Internal method.  Returns true if this rule matches the given

   231  * index value.  The index value is an 8-bit integer, 0..255,

   232  * representing the low byte of the first character of the key.

   233  * It matches this rule if it matches the first character of the

   234  * key, or if the first character of the key is a set, and the set

   235  * contains any character with a low byte equal to the index

   236  * value.  If the rule contains only ante context, as in foo)>bar,

   237  * then it will match any key.

   238  */

   239 UBool TransliterationRule::matchesIndexValue(uint8_t v) const {

   240     // Delegate to the key, or if there is none, to the postContext.

   241     // If there is neither then we match any key; return true.

   242     UnicodeMatcher *m = (key != NULL) ? key : postContext;

   243     return (m != NULL) ? m->matchesIndexValue(v) : TRUE;

   244 }

   246 /**

   247  * Return true if this rule masks another rule.  If r1 masks r2 then

   248  * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks

   249  * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".

   250  * "[c]a>x" masks "[dc]a>y".

   251  */

   252 UBool TransliterationRule::masks(const TransliterationRule& r2) const {

   253     /* Rule r1 masks rule r2 if the string formed of the

   254      * antecontext, key, and postcontext overlaps in the following

   255      * way:

   256      *

   257      * r1:      aakkkpppp

   258      * r2:     aaakkkkkpppp

   259      *            ^

   260      *

   261      * The strings must be aligned at the first character of the

   262      * key.  The length of r1 to the left of the alignment point

   263      * must be <= the length of r2 to the left; ditto for the

   264      * right.  The characters of r1 must equal (or be a superset

   265      * of) the corresponding characters of r2.  The superset

   266      * operation should be performed to check for UnicodeSet

   267      * masking.

   268      *

   269      * Anchors:  Two patterns that differ only in anchors only

   270      * mask one another if they are exactly equal, and r2 has

   271      * all the anchors r1 has (optionally, plus some).  Here Y

   272      * means the row masks the column, N means it doesn't.

   273      *

   274      *         ab   ^ab    ab$  ^ab$

   275      *   ab    Y     Y     Y     Y

   276      *  ^ab    N     Y     N     Y

   277      *   ab$   N     N     Y     Y

   278      *  ^ab$   N     N     N     Y

   279      *

   280      * Post context: {a}b masks ab, but not vice versa, since {a}b

   281      * matches everything ab matches, and {a}b matches {|a|}b but ab

   282      * does not.  Pre context is different (a{b} does not align with

   283      * ab).

   284      */

   286     /* LIMITATION of the current mask algorithm: Some rule

   287      * maskings are currently not detected.  For example,

   288      * "{Lu}]a>x" masks "A]a>y".  This can be added later. TODO

   289      */

   291     int32_t len = pattern.length();

   292     int32_t left = anteContextLength;

   293     int32_t left2 = r2.anteContextLength;

   294     int32_t right = len - left;

   295     int32_t right2 = r2.pattern.length() - left2;

   296     int32_t cachedCompare = r2.pattern.compare(left2 - left, len, pattern);

   298     // TODO Clean this up -- some logic might be combinable with the

   299     // next statement.

   301     // Test for anchor masking

   302     if (left == left2 && right == right2 &&

   303         keyLength <= r2.keyLength &&

   304         0 == cachedCompare) {

   305         // The following boolean logic implements the table above

   306         return (flags == r2.flags) ||

   307             (!(flags & ANCHOR_START) && !(flags & ANCHOR_END)) ||

   308             ((r2.flags & ANCHOR_START) && (r2.flags & ANCHOR_END));

   309     }

   311     return left <= left2 &&

   312         (right < right2 ||

   313          (right == right2 && keyLength <= r2.keyLength)) &&

   314          (0 == cachedCompare);

   315 }

   317 static inline int32_t posBefore(const Replaceable& str, int32_t pos) {

   318     return (pos > 0) ?

   319         pos - U16_LENGTH(str.char32At(pos-1)) :

   320         pos - 1;

   321 }

   323 static inline int32_t posAfter(const Replaceable& str, int32_t pos) {

   324     return (pos >= 0 && pos < str.length()) ?

   325         pos + U16_LENGTH(str.char32At(pos)) :

   326         pos + 1;

   327 }

   329 /**

   330  * Attempt a match and replacement at the given position.  Return

   331  * the degree of match between this rule and the given text.  The

   332  * degree of match may be mismatch, a partial match, or a full

   333  * match.  A mismatch means at least one character of the text

   334  * does not match the context or key.  A partial match means some

   335  * context and key characters match, but the text is not long

   336  * enough to match all of them.  A full match means all context

   337  * and key characters match.

   338  *

   339  * If a full match is obtained, perform a replacement, update pos,

   340  * and return U_MATCH.  Otherwise both text and pos are unchanged.

   341  *

   342  * @param text the text

   343  * @param pos the position indices

   344  * @param incremental if TRUE, test for partial matches that may

   345  * be completed by additional text inserted at pos.limit.

   346  * @return one of <code>U_MISMATCH</code>,

   347  * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If

   348  * incremental is FALSE then U_PARTIAL_MATCH will not be returned.

   349  */

   350 UMatchDegree TransliterationRule::matchAndReplace(Replaceable& text,

   351                                                   UTransPosition& pos,

   352                                                   UBool incremental) const {

   353     // Matching and replacing are done in one method because the

   354     // replacement operation needs information obtained during the

   355     // match.  Another way to do this is to have the match method

   356     // create a match result struct with relevant offsets, and to pass

   357     // this into the replace method.

   359     // ============================ MATCH ===========================

   361     // Reset segment match data

   362     if (segments != NULL) {

   363         for (int32_t i=0; i<segmentsCount; ++i) {

   364             ((StringMatcher*) segments[i])->resetMatch();

   365         }

   366     }

   368 //    int32_t lenDelta, keyLimit;

   369     int32_t keyLimit;

   371     // ------------------------ Ante Context ------------------------

   373     // A mismatch in the ante context, or with the start anchor,

   374     // is an outright U_MISMATCH regardless of whether we are

   375     // incremental or not.

   376     int32_t oText; // offset into 'text'

   377 //    int32_t newStart = 0;

   378     int32_t minOText;

   380     // Note (1): We process text in 16-bit code units, rather than

   381     // 32-bit code points.  This works because stand-ins are

   382     // always in the BMP and because we are doing a literal match

   383     // operation, which can be done 16-bits at a time.

   385     int32_t anteLimit = posBefore(text, pos.contextStart);

   387     UMatchDegree match;

   389     // Start reverse match at char before pos.start

   390     oText = posBefore(text, pos.start);

   392     if (anteContext != NULL) {

   393         match = anteContext->matches(text, oText, anteLimit, FALSE);

   394         if (match != U_MATCH) {

   395             return U_MISMATCH;

   396         }

   397     }

   399     minOText = posAfter(text, oText);

   401     // ------------------------ Start Anchor ------------------------

   403     if (((flags & ANCHOR_START) != 0) && oText != anteLimit) {

   404         return U_MISMATCH;

   405     }

   407     // -------------------- Key and Post Context --------------------

   409     oText = pos.start;

   411     if (key != NULL) {

   412         match = key->matches(text, oText, pos.limit, incremental);

   413         if (match != U_MATCH) {

   414             return match;

   415         }

   416     }

   418     keyLimit = oText;

   420     if (postContext != NULL) {

   421         if (incremental && keyLimit == pos.limit) {

   422             // The key matches just before pos.limit, and there is

   423             // a postContext.  Since we are in incremental mode,

   424             // we must assume more characters may be inserted at

   425             // pos.limit -- this is a partial match.

   426             return U_PARTIAL_MATCH;

   427         }

   429         match = postContext->matches(text, oText, pos.contextLimit, incremental);

   430         if (match != U_MATCH) {

   431             return match;

   432         }

   433     }

   435     // ------------------------- Stop Anchor ------------------------

   437     if (((flags & ANCHOR_END)) != 0) {

   438         if (oText != pos.contextLimit) {

   439             return U_MISMATCH;

   440         }

   441         if (incremental) {

   442             return U_PARTIAL_MATCH;

   443         }

   444     }

   446     // =========================== REPLACE ==========================

   448     // We have a full match.  The key is between pos.start and

   449     // keyLimit.

   451     int32_t newStart;

   452     int32_t newLength = output->toReplacer()->replace(text, pos.start, keyLimit, newStart);

   453     int32_t lenDelta = newLength - (keyLimit - pos.start);

   455     oText += lenDelta;

   456     pos.limit += lenDelta;

   457     pos.contextLimit += lenDelta;

   458     // Restrict new value of start to [minOText, min(oText, pos.limit)].

   459     pos.start = uprv_max(minOText, uprv_min(uprv_min(oText, pos.limit), newStart));

   460     return U_MATCH;

   461 }

   463 /**

   464  * Create a source string that represents this rule.  Append it to the

   465  * given string.

   466  */

   467 UnicodeString& TransliterationRule::toRule(UnicodeString& rule,

   468                                            UBool escapeUnprintable) const {

   470     // Accumulate special characters (and non-specials following them)

   471     // into quoteBuf.  Append quoteBuf, within single quotes, when

   472     // a non-quoted element must be inserted.

   473     UnicodeString str, quoteBuf;

   475     // Do not emit the braces '{' '}' around the pattern if there

   476     // is neither anteContext nor postContext.

   477     UBool emitBraces =

   478         (anteContext != NULL) || (postContext != NULL);

   480     // Emit start anchor

   481     if ((flags & ANCHOR_START) != 0) {

   482         rule.append((UChar)94/*^*/);

   483     }

   485     // Emit the input pattern

   486     ICU_Utility::appendToRule(rule, anteContext, escapeUnprintable, quoteBuf);

   488     if (emitBraces) {

   489         ICU_Utility::appendToRule(rule, (UChar) 0x007B /*{*/, TRUE, escapeUnprintable, quoteBuf);

   490     }

   492     ICU_Utility::appendToRule(rule, key, escapeUnprintable, quoteBuf);

   494     if (emitBraces) {

   495         ICU_Utility::appendToRule(rule, (UChar) 0x007D /*}*/, TRUE, escapeUnprintable, quoteBuf);

   496     }

   498     ICU_Utility::appendToRule(rule, postContext, escapeUnprintable, quoteBuf);

   500     // Emit end anchor

   501     if ((flags & ANCHOR_END) != 0) {

   502         rule.append((UChar)36/*$*/);

   503     }

   505     ICU_Utility::appendToRule(rule, UnicodeString(TRUE, FORWARD_OP, 3), TRUE, escapeUnprintable, quoteBuf);

   507     // Emit the output pattern

   509     ICU_Utility::appendToRule(rule, output->toReplacer()->toReplacerPattern(str, escapeUnprintable),

   510                               TRUE, escapeUnprintable, quoteBuf);

   512     ICU_Utility::appendToRule(rule, (UChar) 0x003B /*;*/, TRUE, escapeUnprintable, quoteBuf);

   514     return rule;

   515 }

   517 void TransliterationRule::setData(const TransliterationRuleData* d) {

   518     data = d;

   519     if (anteContext != NULL) anteContext->setData(d);

   520     if (postContext != NULL) postContext->setData(d);

   521     if (key != NULL) key->setData(d);

   522     // assert(output != NULL);

   523     output->setData(d);

   524     // Don't have to do segments since they are in the context or key

   525 }

   527 /**

   528  * Union the set of all characters that may be modified by this rule

   529  * into the given set.

   530  */

   531 void TransliterationRule::addSourceSetTo(UnicodeSet& toUnionTo) const {

   532     int32_t limit = anteContextLength + keyLength;

   533     for (int32_t i=anteContextLength; i<limit; ) {

   534         UChar32 ch = pattern.char32At(i);

   535         i += U16_LENGTH(ch);

   536         const UnicodeMatcher* matcher = data->lookupMatcher(ch);

   537         if (matcher == NULL) {

   538             toUnionTo.add(ch);

   539         } else {

   540             matcher->addMatchSetTo(toUnionTo);

   541         }

   542     }

   543 }

   545 /**

   546  * Union the set of all characters that may be emitted by this rule

   547  * into the given set.

   548  */

   549 void TransliterationRule::addTargetSetTo(UnicodeSet& toUnionTo) const {

   550     output->toReplacer()->addReplacementSetTo(toUnionTo);

   551 }

   553 U_NAMESPACE_END

   555 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

   557 //eof

The Tor Browser / file revision

intl/icu/source/i18n/rbt_rule.cpp@b8a032363ba2

intl/icu/source/i18n/rbt_rule.cpp