intl/icu/source/i18n/rbt_rule.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/rbt_rule.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,308 @@
     1.4 +/*
     1.5 +* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved.
     1.6 +**********************************************************************
     1.7 +*   Date        Name        Description
     1.8 +*   11/17/99    aliu        Creation.
     1.9 +**********************************************************************
    1.10 +*/
    1.11 +#ifndef RBT_RULE_H
    1.12 +#define RBT_RULE_H
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +
    1.16 +#if !UCONFIG_NO_TRANSLITERATION
    1.17 +
    1.18 +#include "unicode/uobject.h"
    1.19 +#include "unicode/unistr.h"
    1.20 +#include "unicode/utrans.h"
    1.21 +#include "unicode/unimatch.h"
    1.22 +
    1.23 +U_NAMESPACE_BEGIN
    1.24 +
    1.25 +class Replaceable;
    1.26 +class TransliterationRuleData;
    1.27 +class StringMatcher;
    1.28 +class UnicodeFunctor;
    1.29 +
    1.30 +/**
    1.31 + * A transliteration rule used by
    1.32 + * <code>RuleBasedTransliterator</code>.
    1.33 + * <code>TransliterationRule</code> is an immutable object.
    1.34 + *
    1.35 + * <p>A rule consists of an input pattern and an output string.  When
    1.36 + * the input pattern is matched, the output string is emitted.  The
    1.37 + * input pattern consists of zero or more characters which are matched
    1.38 + * exactly (the key) and optional context.  Context must match if it
    1.39 + * is specified.  Context may be specified before the key, after the
    1.40 + * key, or both.  The key, preceding context, and following context
    1.41 + * may contain variables.  Variables represent a set of Unicode
    1.42 + * characters, such as the letters <i>a</i> through <i>z</i>.
    1.43 + * Variables are detected by looking up each character in a supplied
    1.44 + * variable list to see if it has been so defined.
    1.45 + *
    1.46 + * <p>A rule may contain segments in its input string and segment
    1.47 + * references in its output string.  A segment is a substring of the
    1.48 + * input pattern, indicated by an offset and limit.  The segment may
    1.49 + * be in the preceding or following context.  It may not span a
    1.50 + * context boundary.  A segment reference is a special character in
    1.51 + * the output string that causes a segment of the input string (not
    1.52 + * the input pattern) to be copied to the output string.  The range of
    1.53 + * special characters that represent segment references is defined by
    1.54 + * TransliterationRuleData.
    1.55 + *
    1.56 + * @author Alan Liu
    1.57 + */
    1.58 +class TransliterationRule : public UMemory {
    1.59 +
    1.60 +private:
    1.61 +
    1.62 +    // TODO Eliminate the pattern and keyLength data members.  They
    1.63 +    // are used only by masks() and getIndexValue() which are called
    1.64 +    // only during build time, not during run-time.  Perhaps these
    1.65 +    // methods and pattern/keyLength can be isolated into a separate
    1.66 +    // object.
    1.67 +
    1.68 +    /**
    1.69 +     * The match that must occur before the key, or null if there is no
    1.70 +     * preceding context.
    1.71 +     */
    1.72 +    StringMatcher *anteContext;
    1.73 +
    1.74 +    /**
    1.75 +     * The matcher object for the key.  If null, then the key is empty.
    1.76 +     */
    1.77 +    StringMatcher *key;
    1.78 +
    1.79 +    /**
    1.80 +     * The match that must occur after the key, or null if there is no
    1.81 +     * following context.
    1.82 +     */
    1.83 +    StringMatcher *postContext;
    1.84 +
    1.85 +    /**
    1.86 +     * The object that performs the replacement if the key,
    1.87 +     * anteContext, and postContext are matched.  Never null.
    1.88 +     */
    1.89 +    UnicodeFunctor* output;
    1.90 +
    1.91 +    /**
    1.92 +     * The string that must be matched, consisting of the anteContext, key,
    1.93 +     * and postContext, concatenated together, in that order.  Some components
    1.94 +     * may be empty (zero length).
    1.95 +     * @see anteContextLength
    1.96 +     * @see keyLength
    1.97 +     */
    1.98 +    UnicodeString pattern;
    1.99 +
   1.100 +    /**
   1.101 +     * An array of matcher objects corresponding to the input pattern
   1.102 +     * segments.  If there are no segments this is null.  N.B. The
   1.103 +     * elements are declared as UnicodeFunctor for generality, but in
   1.104 +     * practice each is always a StringMatcher.  In the future we may
   1.105 +     * generalize this, but for now we sometimes cast down to StringMatcher.
   1.106 +     *
   1.107 +     * The array is owned, but the pointers within it are not.
   1.108 +     */
   1.109 +    UnicodeFunctor** segments;
   1.110 +
   1.111 +    /**
   1.112 +     * The number of elements in segments[] or zero if segments is NULL.
   1.113 +     */
   1.114 +    int32_t segmentsCount;
   1.115 +
   1.116 +    /**
   1.117 +     * The length of the string that must match before the key.  If
   1.118 +     * zero, then there is no matching requirement before the key.
   1.119 +     * Substring [0,anteContextLength) of pattern is the anteContext.
   1.120 +     */
   1.121 +    int32_t anteContextLength;
   1.122 +
   1.123 +    /**
   1.124 +     * The length of the key.  Substring [anteContextLength,
   1.125 +     * anteContextLength + keyLength) of pattern is the key.
   1.126 +
   1.127 +     */
   1.128 +    int32_t keyLength;
   1.129 +
   1.130 +    /**
   1.131 +     * Miscellaneous attributes (presumably the ANCHOR_* flag bits below; confirm in the .cpp).
   1.132 +     */
   1.133 +    int8_t flags;
   1.134 +
   1.135 +    /**
   1.136 +     * Flag attribute values.  Each value is a distinct bit.
   1.137 +     */
   1.138 +    enum {
   1.139 +        ANCHOR_START = 1,
   1.140 +        ANCHOR_END   = 2
   1.141 +    };
   1.142 +
   1.143 +    /**
   1.144 +     * An alias pointer to the data for this rule.  The data provides
   1.145 +     * lookup services for matchers and segments.
   1.146 +     */
   1.147 +    const TransliterationRuleData* data;
   1.148 +
   1.149 +public:
   1.150 +
   1.151 +    /**
   1.152 +     * Construct a new rule with the given input, output text, and other
   1.153 +     * attributes.  A cursor position may be specified for the output text.
   1.154 +     * @param input          input string, including key and optional ante and
   1.155 +     *                       post context.
   1.156 +     * @param anteContextPos offset into input to end of ante context, or -1 if
   1.157 +     *                       none.  Must be <= input.length() if not -1.
   1.158 +     * @param postContextPos offset into input to start of post context, or -1
   1.159 +     *                       if none.  Must be <= input.length() if not -1, and must be >=
   1.160 +     *                       anteContextPos.
   1.161 +     * @param outputStr      output string.
   1.162 +     * @param cursorPosition offset into outputStr at which cursor is located, or -1 if
   1.163 +     *                       none.  If less than zero, then the cursor is placed after
   1.164 +     *                       <code>outputStr</code>; that is, -1 is equivalent to
   1.165 +     *                       <code>outputStr.length()</code>.  If greater than
   1.166 +     *                       <code>outputStr.length()</code>, an error results (presumably reported via status).
   1.167 +     * @param cursorOffset   an offset to be added to cursorPosition to position the
   1.168 +     *                       cursor either in the ante context, if < 0, or in the post context, if >
   1.169 +     *                       0.  For example, the rule "abc{def} > | @@@ xyz;" changes "def" to
   1.170 +     *                       "xyz" and moves the cursor to before "a".  It would have a cursorOffset
   1.171 +     *                       of -3.
   1.172 +     * @param segs           array of UnicodeFunctor pointers corresponding to input pattern
   1.173 +     *                       segments, or null if there are none.  The array itself is adopted,
   1.174 +     *                       but the pointers within it are not.
   1.175 +     * @param segsCount      number of elements in segs[].
   1.176 +     * @param anchorStart    TRUE if the rule is anchored on the left to
   1.177 +     *                       the context start.
   1.178 +     * @param anchorEnd      TRUE if the rule is anchored on the right to the
   1.179 +     *                       context limit.
   1.180 +     * @param data           the rule data.
   1.181 +     * @param status         Output parameter filled in with success or failure status.
   1.182 +     */
   1.183 +    TransliterationRule(const UnicodeString& input,
   1.184 +                        int32_t anteContextPos, int32_t postContextPos,
   1.185 +                        const UnicodeString& outputStr,
   1.186 +                        int32_t cursorPosition, int32_t cursorOffset,
   1.187 +                        UnicodeFunctor** segs,
   1.188 +                        int32_t segsCount,
   1.189 +                        UBool anchorStart, UBool anchorEnd,
   1.190 +                        const TransliterationRuleData* data,
   1.191 +                        UErrorCode& status);
   1.192 +
   1.193 +    /**
   1.194 +     * Copy constructor.
   1.195 +     * @param other    the object to be copied (taken by non-const reference).
   1.196 +     */
   1.197 +    TransliterationRule(TransliterationRule& other);
   1.198 +
   1.199 +    /**
   1.200 +     * Destructor.
   1.201 +     */
   1.202 +    virtual ~TransliterationRule();
   1.203 +
   1.204 +    /**
   1.205 +     * Change the data object that this rule belongs to.  Used
   1.206 +     * internally by the TransliterationRuleData copy constructor.
   1.207 +     * @param data    the new data value to be set.
   1.208 +     */
   1.209 +    void setData(const TransliterationRuleData* data);
   1.210 +
   1.211 +    /**
   1.212 +     * Return the preceding context length.  This method is needed to
   1.213 +     * support the <code>Transliterator</code> method
   1.214 +     * <code>getMaximumContextLength()</code>.  Internally, this is
   1.215 +     * implemented as the anteContextLength, optionally plus one if
   1.216 +     * there is a start anchor.  The one character anchor gap is
   1.217 +     * needed to make repeated incremental transliteration with
   1.218 +     * anchors work.
   1.219 +     * @return    the preceding context length.
   1.220 +     */
   1.221 +    virtual int32_t getContextLength(void) const;
   1.222 +
   1.223 +    /**
   1.224 +     * Internal method.  Returns 8-bit index value for this rule.
   1.225 +     * This is the low byte of the first character of the key,
   1.226 +     * unless the first character of the key is a set.  If it's a
   1.227 +     * set, or otherwise can match multiple keys, the index value is -1.
   1.228 +     * @return    8-bit index value for this rule, or -1 as described above.
   1.229 +     */
   1.230 +    int16_t getIndexValue() const;
   1.231 +
   1.232 +    /**
   1.233 +     * Internal method.  Returns true if this rule matches the given
   1.234 +     * index value.  The index value is an 8-bit integer, 0..255,
   1.235 +     * representing the low byte of the first character of the key.
   1.236 +     * It matches this rule if it matches the first character of the
   1.237 +     * key, or if the first character of the key is a set, and the set
   1.238 +     * contains any character with a low byte equal to the index
   1.239 +     * value.  If the rule contains only ante context, as in foo)>bar,
   1.240 +     * then it will match any key.
   1.241 +     * @param v    the given index value.
   1.242 +     * @return     true if this rule matches the given index value.
   1.243 +     */
   1.244 +    UBool matchesIndexValue(uint8_t v) const;
   1.245 +
   1.246 +    /**
   1.247 +     * Return true if this rule masks another rule.  If r1 masks r2 then
   1.248 +     * r1 matches any input string that r2 matches.  If r1 masks r2 and r2 masks
   1.249 +     * r1 then r1 == r2.  Examples: "a>x" masks "ab>y".  "a>x" masks "a[b]>y".
   1.250 +     * "[c]a>x" masks "[dc]a>y".
   1.251 +     * @param r2  the given rule to be compared with.
   1.252 +     * @return    true if this rule masks 'r2'
   1.253 +     */
   1.254 +    virtual UBool masks(const TransliterationRule& r2) const;
   1.255 +
   1.256 +    /**
   1.257 +     * Attempt a match and replacement at the given position.  Return
   1.258 +     * the degree of match between this rule and the given text.  The
   1.259 +     * degree of match may be mismatch, a partial match, or a full
   1.260 +     * match.  A mismatch means at least one character of the text
   1.261 +     * does not match the context or key.  A partial match means some
   1.262 +     * context and key characters match, but the text is not long
   1.263 +     * enough to match all of them.  A full match means all context
   1.264 +     * and key characters match.
   1.265 +     *
   1.266 +     * If a full match is obtained, perform a replacement, update pos,
   1.267 +     * and return U_MATCH.  Otherwise both text and pos are unchanged.
   1.268 +     *
   1.269 +     * @param text the text
   1.270 +     * @param pos the position indices
   1.271 +     * @param incremental if TRUE, test for partial matches that may
   1.272 +     * be completed by additional text inserted at pos.limit.
   1.273 +     * @return one of <code>U_MISMATCH</code>,
   1.274 +     * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>.  If
   1.275 +     * incremental is FALSE then U_PARTIAL_MATCH will not be returned.
   1.276 +     */
   1.277 +    UMatchDegree matchAndReplace(Replaceable& text,
   1.278 +                                 UTransPosition& pos,
   1.279 +                                 UBool incremental) const;
   1.280 +
   1.281 +    /**
   1.282 +     * Create a rule string that represents this rule object and append it
   1.283 +     * to pat.  NOTE(review): escapeUnprintable presumably escapes unprintable characters; confirm.
   1.284 +     */
   1.285 +    virtual UnicodeString& toRule(UnicodeString& pat,
   1.286 +                                  UBool escapeUnprintable) const;
   1.287 +
   1.288 +    /**
   1.289 +     * Union the set of all characters that may be modified by this rule
   1.290 +     * into the given set.
   1.291 +     */
   1.292 +    void addSourceSetTo(UnicodeSet& toUnionTo) const;
   1.293 +
   1.294 +    /**
   1.295 +     * Union the set of all characters that may be emitted by this rule
   1.296 +     * into the given set.
   1.297 +     */
   1.298 +    void addTargetSetTo(UnicodeSet& toUnionTo) const;
   1.299 +
   1.300 + private:
   1.301 +
   1.302 +    friend class StringMatcher;
   1.303 +
   1.304 +    TransliterationRule &operator=(const TransliterationRule &other); // private: forbids assignment (the copy constructor above is public)
   1.305 +};
   1.306 +
   1.307 +U_NAMESPACE_END
   1.308 +
   1.309 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.310 +
   1.311 +#endif

mercurial