The Tor Browser: intl/icu/source/i18n/rbt_rule.h

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 /*

     2 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.

     3 **********************************************************************

     4 *   Date        Name        Description

     5 *   11/17/99    aliu        Creation.

     6 **********************************************************************

     7 */

     8 #ifndef RBT_RULE_H

     9 #define RBT_RULE_H

    11 #include "unicode/utypes.h"

    13 #if !UCONFIG_NO_TRANSLITERATION

    15 #include "unicode/uobject.h"

    16 #include "unicode/unistr.h"

    17 #include "unicode/utrans.h"

    18 #include "unicode/unimatch.h"

    20 U_NAMESPACE_BEGIN

    22 class Replaceable;              // taken by reference in matchAndReplace()

    23 class TransliterationRuleData;  // held as a const alias pointer in the 'data' member

    24 class StringMatcher;            // pointer type of anteContext, key and postContext; also a friend

    25 class UnicodeFunctor;           // pointer type of 'output' and of the elements of 'segments'

    27 /**

    28  * A transliteration rule used by

    29  * <code>RuleBasedTransliterator</code>.

    30  * <code>TransliterationRule</code> is an immutable object.

    31  *

    32  * <p>A rule consists of an input pattern and an output string.  When

    33  * the input pattern is matched, the output string is emitted.  The

    34  * input pattern consists of zero or more characters which are matched

    35  * exactly (the key) and optional context.  Context must match if it

    36  * is specified.  Context may be specified before the key, after the

    37  * key, or both.  The key, preceding context, and following context

    38  * may contain variables.  Variables represent a set of Unicode

    39  * characters, such as the letters <i>a</i> through <i>z</i>.

    40  * Variables are detected by looking up each character in a supplied

    41  * variable list to see if it has been so defined.

    42  *

    43  * <p>A rule may contain segments in its input string and segment

    44  * references in its output string.  A segment is a substring of the

    45  * input pattern, indicated by an offset and limit.  The segment may

    46  * be in the preceding or following context.  It may not span a

    47  * context boundary.  A segment reference is a special character in

    48  * the output string that causes a segment of the input string (not

    49  * the input pattern) to be copied to the output string.  The range of

    50  * special characters that represent segment references is defined by

    51  * RuleBasedTransliterator.Data.

    52  *

    53  * @author Alan Liu

    54  */

    55 class TransliterationRule : public UMemory {

    57 private:

    59     // TODO Eliminate the pattern and keyLength data members.  They

    60     // are used only by masks() and getIndexValue() which are called

    61     // only during build time, not during run-time.  Perhaps these

    62     // methods and pattern/keyLength can be isolated into a separate

    63     // object.

    65     /**

    66      * The match that must occur before the key, or null if there is no

    67      * preceding context.

    68      */

    69     StringMatcher *anteContext;

    71     /**

    72      * The matcher object for the key.  If null, then the key is empty.

    73      */

    74     StringMatcher *key;

    76     /**

    77      * The match that must occur after the key, or null if there is no

    78      * following context.

    79      */

    80     StringMatcher *postContext;

    82     /**

    83      * The object that performs the replacement if the key,

    84      * anteContext, and postContext are matched.  Never null.

    85      */

    86     UnicodeFunctor* output;

    88     /**

    89      * The string that must be matched, consisting of the anteContext, key,

    90      * and postContext, concatenated together, in that order.  Some components

    91      * may be empty (zero length).

    92      * @see anteContextLength

    93      * @see keyLength

    94      */

    95     UnicodeString pattern;

    97     /**

    98      * An array of matcher objects corresponding to the input pattern

    99      * segments.  If there are no segments this is null.  N.B. This is

   100      * a UnicodeMatcher for generality, but in practice it is always a

   101      * StringMatcher.  In the future we may generalize this, but for

   102      * now we sometimes cast down to StringMatcher.

   103      *

   104      * The array is owned, but the pointers within it are not.

            *

            * NOTE(review): the declared element type is UnicodeFunctor*, not

            * UnicodeMatcher* -- the "UnicodeMatcher" wording above appears to

            * be stale; confirm against the implementation.

   105      */

   106     UnicodeFunctor** segments;

   108     /**

   109      * The number of elements in segments[] or zero if segments is NULL.

   110      */

   111     int32_t segmentsCount;

   113     /**

   114      * The length of the string that must match before the key.  If

   115      * zero, then there is no matching requirement before the key.

   116      * Substring [0,anteContextLength) of pattern is the anteContext.

   117      */

   118     int32_t anteContextLength;

   120     /**

   121      * The length of the key.  Substring [anteContextLength,

   122      * anteContextLength + keyLength) of pattern is the key.

   124      */

   125     int32_t keyLength;

   127     /**

   128      * Miscellaneous attributes.

            * NOTE(review): presumably a bitwise combination of the ANCHOR_START

            * and ANCHOR_END values declared below -- confirm against the

            * implementation.

   129      */

   130     int8_t flags;

   132     /**

   133      * Flag attributes.  Each enumerator is a distinct bit value.

            * NOTE(review): these presumably record the anchorStart / anchorEnd

            * constructor arguments -- confirm against the implementation.

   134      */

   135     enum {

   136         ANCHOR_START = 1,

   137         ANCHOR_END   = 2

   138     };

   140     /**

   141      * An alias pointer to the data for this rule.  The data provides

   142      * lookup services for matchers and segments.

   143      */

   144     const TransliterationRuleData* data;

   146 public:

   148     /**

   149      * Construct a new rule with the given input, output text, and other

   150      * attributes.  A cursor position may be specified for the output text.

   151      * @param input          input string, including key and optional ante and

   152      *                       post context.

   153      * @param anteContextPos offset into input to end of ante context, or -1 if

   154      *                       none.  Must be <= input.length() if not -1.

   155      * @param postContextPos offset into input to start of post context, or -1

   156      *                       if none.  Must be <= input.length() if not -1, and must be >=

   157      *                       anteContextPos.

   158      * @param outputStr      output string.

   159      * @param cursorPosition offset into output at which cursor is located, or -1 if

   160      *                       none.  If less than zero, then the cursor is placed after the

   161      *                       <code>output</code>; that is, -1 is equivalent to

   162      *                       <code>output.length()</code>.  If greater than

   163      *                       <code>output.length()</code> then an exception is thrown.

            *                       NOTE(review): this signature takes a UErrorCode& status, so

            *                       the failure is presumably reported through status rather than

            *                       by a thrown exception -- confirm against the implementation.

   164      * @param cursorOffset   an offset to be added to cursorPosition to position the

   165      *                       cursor either in the ante context, if < 0, or in the post context, if >

   166      *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to

   167      *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset

   168      *                       of -3.

   169      * @param segs           array of UnicodeFunctor pointers corresponding to input pattern

   170      *                       segments, or null if there are none.  The array itself is adopted,

   171      *                       but the pointers within it are not.

   172      * @param segsCount      number of elements in segs[].

   173      * @param anchorStart    TRUE if the rule is anchored on the left to

   174      *                       the context start.

   175      * @param anchorEnd      TRUE if the rule is anchored on the right to the

   176      *                       context limit.

   177      * @param data           the rule data.

   178      * @param status         Output parameter filled in with success or failure status.

   179      */

   180     TransliterationRule(const UnicodeString& input,

   181                         int32_t anteContextPos, int32_t postContextPos,

   182                         const UnicodeString& outputStr,

   183                         int32_t cursorPosition, int32_t cursorOffset,

   184                         UnicodeFunctor** segs,

   185                         int32_t segsCount,

   186                         UBool anchorStart, UBool anchorEnd,

   187                         const TransliterationRuleData* data,

   188                         UErrorCode& status);

   190     /**

   191      * Copy constructor.

   192      * @param other    the object to be copied.

            *                 NOTE(review): taken by non-const reference, so const

            *                 objects and temporaries cannot be passed here -- confirm

            *                 whether this is intentional.

   193      */

   194     TransliterationRule(TransliterationRule& other);

   196     /**

   197      * Destructor.

   198      */

   199     virtual ~TransliterationRule();

   201     /**

   202      * Change the data object that this rule belongs to.  Used

   203      * internally by the TransliterationRuleData copy constructor.

   204      * @param data    the new data value to be set.

   205      */

   206     void setData(const TransliterationRuleData* data);

   208     /**

   209      * Return the preceding context length.  This method is needed to

   210      * support the <code>Transliterator</code> method

   211      * <code>getMaximumContextLength()</code>.  Internally, this is

   212      * implemented as the anteContextLength, optionally plus one if

   213      * there is a start anchor.  The one character anchor gap is

   214      * needed to make repeated incremental transliteration with

   215      * anchors work.

   216      * @return    the preceding context length.

   217      */

   218     virtual int32_t getContextLength(void) const;

   220     /**

   221      * Internal method.  Returns 8-bit index value for this rule.

   222      * This is the low byte of the first character of the key,

   223      * unless the first character of the key is a set.  If it's a

   224      * set, or otherwise can match multiple keys, the index value is -1.

   225      * @return    8-bit index value for this rule, or -1 as described above

            *            (the return type is int16_t so that -1 can be represented

            *            alongside the 8-bit values).

   226      */

   227     int16_t getIndexValue() const;

   229     /**

   230      * Internal method.  Returns true if this rule matches the given

   231      * index value.  The index value is an 8-bit integer, 0..255,

   232      * representing the low byte of the first character of the key.

   233      * It matches this rule if it matches the first character of the

   234      * key, or if the first character of the key is a set, and the set

   235      * contains any character with a low byte equal to the index

   236      * value.  If the rule contains only ante context, as in foo)>bar,

   237      * then it will match any key.

   238      * @param v    the given index value.

   239      * @return     true if this rule matches the given index value.

   240      */

   241     UBool matchesIndexValue(uint8_t v) const;

   243     /**

   244      * Return true if this rule masks another rule.  If r1 masks r2 then

   245      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks

   246      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".

   247      * "[c]a>x" masks "[dc]a>y".

   248      * @param r2  the given rule to be compared with.

   249      * @return    true if this rule masks 'r2'

   250      */

   251     virtual UBool masks(const TransliterationRule& r2) const;

   253     /**

   254      * Attempt a match and replacement at the given position.  Return

   255      * the degree of match between this rule and the given text.  The

   256      * degree of match may be mismatch, a partial match, or a full

   257      * match.  A mismatch means at least one character of the text

   258      * does not match the context or key.  A partial match means some

   259      * context and key characters match, but the text is not long

   260      * enough to match all of them.  A full match means all context

   261      * and key characters match.

   262      *

   263      * If a full match is obtained, perform a replacement, update pos,

   264      * and return U_MATCH.  Otherwise both text and pos are unchanged.

   265      *

   266      * @param text the text

   267      * @param pos the position indices

   268      * @param incremental if TRUE, test for partial matches that may

   269      * be completed by additional text inserted at pos.limit.

   270      * @return one of <code>U_MISMATCH</code>,

   271      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If

   272      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.

   273      */

   274     UMatchDegree matchAndReplace(Replaceable& text,

   275                                  UTransPosition& pos,

   276                                  UBool incremental) const;

   278     /**

   279      * Create a rule string that represents this rule object.  Append

   280      * it to the given string.

            * @param pat               the string to which the rule text is appended.

            * @param escapeUnprintable NOTE(review): presumably requests escaping of

            *                          unprintable characters in the generated text --

            *                          confirm against the implementation.

            * @return                  NOTE(review): presumably a reference to pat --

            *                          confirm against the implementation.

   281      */

   282     virtual UnicodeString& toRule(UnicodeString& pat,

   283                                   UBool escapeUnprintable) const;

   285     /**

   286      * Union the set of all characters that may be modified by this rule

   287      * into the given set.

            * @param toUnionTo    the set into which the characters are unioned.

   288      */

   289     void addSourceSetTo(UnicodeSet& toUnionTo) const;

   291     /**

   292      * Union the set of all characters that may be emitted by this rule

   293      * into the given set.

            * @param toUnionTo    the set into which the characters are unioned.

   294      */

   295     void addTargetSetTo(UnicodeSet& toUnionTo) const;

   297  private:

   299     friend class StringMatcher; // gives StringMatcher access to the private members of this class

   301     TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment; copy construction is still available through the public copy constructor

   302 };

   304 U_NAMESPACE_END

   306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */

   308 #endif

The Tor Browser / file revision

intl/icu/source/i18n/rbt_rule.h@6474c204b198

intl/icu/source/i18n/rbt_rule.h