intl/icu/source/i18n/rbt_rule.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
     3 **********************************************************************
     4 *   Date        Name        Description
     5 *   11/17/99    aliu        Creation.
     6 **********************************************************************
     7 */
     8 #ifndef RBT_RULE_H
     9 #define RBT_RULE_H
    11 #include "unicode/utypes.h"
    13 #if !UCONFIG_NO_TRANSLITERATION
    15 #include "unicode/uobject.h"
    16 #include "unicode/unistr.h"
    17 #include "unicode/utrans.h"
    18 #include "unicode/unimatch.h"
    20 U_NAMESPACE_BEGIN
    22 class Replaceable;
    23 class TransliterationRuleData;
    24 class StringMatcher;
    25 class UnicodeFunctor;
    27 /**
    28  * A transliteration rule used by
    29  * <code>RuleBasedTransliterator</code>.
    30  * <code>TransliterationRule</code> is an immutable object.
    31  *
    32  * <p>A rule consists of an input pattern and an output string.  When
    33  * the input pattern is matched, the output string is emitted.  The
    34  * input pattern consists of zero or more characters which are matched
    35  * exactly (the key) and optional context.  Context must match if it
    36  * is specified.  Context may be specified before the key, after the
    37  * key, or both.  The key, preceding context, and following context
    38  * may contain variables.  Variables represent a set of Unicode
    39  * characters, such as the letters <i>a</i> through <i>z</i>.
    40  * Variables are detected by looking up each character in a supplied
    41  * variable list to see if it has been so defined.
    42  *
    43  * <p>A rule may contain segments in its input string and segment
    44  * references in its output string.  A segment is a substring of the
    45  * input pattern, indicated by an offset and limit.  The segment may
    46  * be in the preceding or following context.  It may not span a
    47  * context boundary.  A segment reference is a special character in
    48  * the output string that causes a segment of the input string (not
    49  * the input pattern) to be copied to the output string.  The range of
    50  * special characters that represent segment references is defined by
    51  * RuleBasedTransliterator.Data.
    52  *
    53  * @author Alan Liu
    54  */
    55 class TransliterationRule : public UMemory {
    57 private:
    59     // TODO Eliminate the pattern and keyLength data members.  They
    60     // are used only by masks() and getIndexValue() which are called
    61     // only during build time, not during run-time.  Perhaps these
    62     // methods and pattern/keyLength can be isolated into a separate
    63     // object.
    65     /**
    66      * The match that must occur before the key, or null if there is no
    67      * preceding context.
    68      */
    69     StringMatcher *anteContext;
    71     /**
    72      * The matcher object for the key.  If null, then the key is empty.
    73      */
    74     StringMatcher *key;
    76     /**
    77      * The match that must occur after the key, or null if there is no
    78      * following context.
    79      */
    80     StringMatcher *postContext;
    82     /**
    83      * The object that performs the replacement if the key,
    84      * anteContext, and postContext are matched.  Never null.
    85      */
    86     UnicodeFunctor* output;
    88     /**
    89      * The string that must be matched, consisting of the anteContext, key,
    90      * and postContext, concatenated together, in that order.  Some components
    91      * may be empty (zero length).
    92      * @see anteContextLength
    93      * @see keyLength
    94      */
    95     UnicodeString pattern;
    97     /**
    98      * An array of matcher objects corresponding to the input pattern
    99      * segments.  If there are no segments this is null.  N.B. The
   100      * element type is UnicodeFunctor for generality, but in practice
   101      * each element is always a StringMatcher.  In the future we may
   102      * generalize this, but for now we sometimes cast down to StringMatcher.
   103      *
   104      * The array is owned, but the pointers within it are not.
   105      */
   106     UnicodeFunctor** segments;
   108     /**
   109      * The number of elements in segments[] or zero if segments is NULL.
   110      */
   111     int32_t segmentsCount;
   113     /**
   114      * The length of the string that must match before the key.  If
   115      * zero, then there is no matching requirement before the key.
   116      * Substring [0,anteContextLength) of pattern is the anteContext.
   117      */
   118     int32_t anteContextLength;
   120     /**
   121      * The length of the key.  Substring [anteContextLength,
   122      * anteContextLength + keyLength) of pattern is the key.
   124      */
   125     int32_t keyLength;
   127     /**
   128      * Miscellaneous attributes.
            *
            * NOTE(review): presumably a bitwise OR of the ANCHOR_START and
            * ANCHOR_END values declared below -- confirm in the implementation.
   129      */
   130     int8_t flags;
   132     /**
   133      * Flag attributes.  The two values are distinct bits (1 and 2), so
            * both can be recorded in one integer at the same time.
            *
            * NOTE(review): judging by the constructor's anchorStart and
            * anchorEnd parameters, ANCHOR_START presumably marks a rule anchored
            * on the left to the context start and ANCHOR_END a rule anchored on
            * the right to the context limit -- confirm in the implementation.
   134      */
   135     enum {
   136         ANCHOR_START = 1,
   137         ANCHOR_END   = 2
   138     };
   140     /**
   141      * An alias pointer to the data for this rule.  The data provides
   142      * lookup services for matchers and segments.
   143      */
   144     const TransliterationRuleData* data;
   146 public:
   148     /**
   149      * Construct a new rule with the given input, output text, and other
   150      * attributes.  A cursor position may be specified for the output text.
   151      * @param input          input string, including key and optional ante and
   152      *                       post context.
   153      * @param anteContextPos offset into input to end of ante context, or -1 if
   154      *                       none.  Must be <= input.length() if not -1.
   155      * @param postContextPos offset into input to start of post context, or -1
   156      *                       if none.  Must be <= input.length() if not -1, and must be >=
   157      *                       anteContextPos.
   158      * @param outputStr      output string.
   159      * @param cursorPosition offset into output at which cursor is located, or -1 if
   160      *                       none.  If less than zero, then the cursor is placed after the
   161      *                       <code>outputStr</code>; that is, -1 is equivalent to
   162      *                       <code>outputStr.length()</code>.  If greater than
   163      *                       <code>outputStr.length()</code> then an exception is thrown.
            *                       NOTE(review): this signature takes a <code>status</code>
            *                       output parameter, so the failure is presumably reported
            *                       through it rather than by an exception -- confirm.
   164      * @param cursorOffset   an offset to be added to cursorPosition to position the
   165      *                       cursor either in the ante context, if < 0, or in the post context, if >
   166      *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
   167      *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
   168      *                       of -3.
   169      * @param segs           array of UnicodeFunctor pointers corresponding to input pattern
   170      *                       segments, or null if there are none.  The array itself is adopted,
   171      *                       but the pointers within it are not.
   172      * @param segsCount      number of elements in segs[].
   173      * @param anchorStart    TRUE if the rule is anchored on the left to
   174      *                       the context start.
   175      * @param anchorEnd      TRUE if the rule is anchored on the right to the
   176      *                       context limit.
   177      * @param data           the rule data.
   178      * @param status         Output parameter filled in with success or failure status.
   179      */
   180     TransliterationRule(const UnicodeString& input,
   181                         int32_t anteContextPos, int32_t postContextPos,
   182                         const UnicodeString& outputStr,
   183                         int32_t cursorPosition, int32_t cursorOffset,
   184                         UnicodeFunctor** segs,
   185                         int32_t segsCount,
   186                         UBool anchorStart, UBool anchorEnd,
   187                         const TransliterationRuleData* data,
   188                         UErrorCode& status);
   190     /**
   191      * Copy constructor.
   192      * @param other    the object to be copied.  Note that it is taken by
            *                 non-const reference, so a const rule cannot be
            *                 passed here.
   193      */
   194     TransliterationRule(TransliterationRule& other);
   196     /**
   197      * Destructor.
   198      */
   199     virtual ~TransliterationRule();
   201     /**
   202      * Change the data object that this rule belongs to.  Used
   203      * internally by the TransliterationRuleData copy constructor.
   204      * @param data    the new data value to be set.
   205      */
   206     void setData(const TransliterationRuleData* data);
   208     /**
   209      * Return the preceding context length.  This method is needed to
   210      * support the <code>Transliterator</code> method
   211      * <code>getMaximumContextLength()</code>.  Internally, this is
   212      * implemented as the anteContextLength, optionally plus one if
   213      * there is a start anchor.  The one character anchor gap is
   214      * needed to make repeated incremental transliteration with
   215      * anchors work.
   216      * @return    the preceding context length.
   217      */
   218     virtual int32_t getContextLength(void) const;
   220     /**
   221      * Internal method.  Returns 8-bit index value for this rule.
   222      * This is the low byte of the first character of the key,
   223      * unless the first character of the key is a set.  If it's a
   224      * set, or otherwise can match multiple keys, the index value is -1.
            * The return type is a 16-bit signed integer so that -1 can be
            * represented alongside the 8-bit values 0..255.
   225      * @return    8-bit index value for this rule, or -1 as described above.
   226      */
   227     int16_t getIndexValue() const;
   229     /**
   230      * Internal method.  Returns true if this rule matches the given
   231      * index value.  The index value is an 8-bit integer, 0..255,
   232      * representing the low byte of the first character of the key.
   233      * It matches this rule if it matches the first character of the
   234      * key, or if the first character of the key is a set, and the set
   235      * contains any character with a low byte equal to the index
   236      * value.  If the rule contains only ante context, as in foo)>bar,
   237      * then it will match any key.
   238      * @param v    the given index value.
   239      * @return     true if this rule matches the given index value.
   240      */
   241     UBool matchesIndexValue(uint8_t v) const;
   243     /**
   244      * Return true if this rule masks another rule.  If r1 masks r2 then
   245      * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
   246      * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
   247      * "[c]a>x" masks "[dc]a>y".
   248      * @param r2  the given rule to be compared with.
   249      * @return    true if this rule masks 'r2'
   250      */
   251     virtual UBool masks(const TransliterationRule& r2) const;
   253     /**
   254      * Attempt a match and replacement at the given position.  Return
   255      * the degree of match between this rule and the given text.  The
   256      * degree of match may be mismatch, a partial match, or a full
   257      * match.  A mismatch means at least one character of the text
   258      * does not match the context or key.  A partial match means some
   259      * context and key characters match, but the text is not long
   260      * enough to match all of them.  A full match means all context
   261      * and key characters match.
   262      * 
   263      * If a full match is obtained, perform a replacement, update pos,
   264      * and return U_MATCH.  Otherwise both text and pos are unchanged.
   265      * 
   266      * @param text the text
   267      * @param pos the position indices
   268      * @param incremental if TRUE, test for partial matches that may
   269      * be completed by additional text inserted at pos.limit.
   270      * @return one of <code>U_MISMATCH</code>,
   271      * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
   272      * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
   273      */
   274     UMatchDegree matchAndReplace(Replaceable& text,
   275                                  UTransPosition& pos,
   276                                  UBool incremental) const;
   278     /**
   279      * Create a rule string that represents this rule object.  Append
   280      * it to the given string.
            * @param pat               the string to which the rule text is appended.
            * @param escapeUnprintable NOTE(review): presumably requests that
            *                          unprintable characters be written in escaped
            *                          form -- confirm against the implementation.
            * @return                  NOTE(review): presumably a reference to
            *                          <code>pat</code> -- confirm.
   281      */
   282     virtual UnicodeString& toRule(UnicodeString& pat,
   283                                   UBool escapeUnprintable) const;
   285     /**
   286      * Union the set of all characters that may be modified by this rule
   287      * into the given set.
            * @param toUnionTo    the set that receives the characters.
   288      */
   289     void addSourceSetTo(UnicodeSet& toUnionTo) const;
   291     /**
   292      * Union the set of all characters that may be emitted by this rule
   293      * into the given set.
            * @param toUnionTo    the set that receives the characters.
   294      */
   295     void addTargetSetTo(UnicodeSet& toUnionTo) const;
   297  private:
   299     friend class StringMatcher;
   301     TransliterationRule &operator=(const TransliterationRule &other); // private: forbids assignment from outside the class (a public copy constructor is still declared above)
   302 };
   304 U_NAMESPACE_END
   306 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   308 #endif

mercurial