1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/rbt_rule.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,308 @@ 1.4 +/* 1.5 +* Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. 1.6 +********************************************************************** 1.7 +* Date Name Description 1.8 +* 11/17/99 aliu Creation. 1.9 +********************************************************************** 1.10 +*/ 1.11 +#ifndef RBT_RULE_H 1.12 +#define RBT_RULE_H 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 + 1.16 +#if !UCONFIG_NO_TRANSLITERATION 1.17 + 1.18 +#include "unicode/uobject.h" 1.19 +#include "unicode/unistr.h" 1.20 +#include "unicode/utrans.h" 1.21 +#include "unicode/unimatch.h" 1.22 + 1.23 +U_NAMESPACE_BEGIN 1.24 + 1.25 +class Replaceable; 1.26 +class TransliterationRuleData; 1.27 +class StringMatcher; 1.28 +class UnicodeFunctor; 1.29 + 1.30 +/** 1.31 + * A transliteration rule used by 1.32 + * <code>RuleBasedTransliterator</code>. 1.33 + * <code>TransliterationRule</code> is an immutable object. 1.34 + * 1.35 + * <p>A rule consists of an input pattern and an output string. When 1.36 + * the input pattern is matched, the output string is emitted. The 1.37 + * input pattern consists of zero or more characters which are matched 1.38 + * exactly (the key) and optional context. Context must match if it 1.39 + * is specified. Context may be specified before the key, after the 1.40 + * key, or both. The key, preceding context, and following context 1.41 + * may contain variables. Variables represent a set of Unicode 1.42 + * characters, such as the letters <i>a</i> through <i>z</i>. 1.43 + * Variables are detected by looking up each character in a supplied 1.44 + * variable list to see if it has been so defined. 1.45 + * 1.46 + * <p>A rule may contain segments in its input string and segment 1.47 + * references in its output string. A segment is a substring of the 1.48 + * input pattern, indicated by an offset and limit. The segment may 1.49 + * be in the preceding or following context. It may not span a 1.50 + * context boundary. A segment reference is a special character in 1.51 + * the output string that causes a segment of the input string (not 1.52 + * the input pattern) to be copied to the output string. The range of 1.53 + * special characters that represent segment references is defined by 1.54 + * RuleBasedTransliterator.Data. 1.55 + * 1.56 + * @author Alan Liu 1.57 + */ 1.58 +class TransliterationRule : public UMemory { 1.59 + 1.60 +private: 1.61 + 1.62 + // TODO Eliminate the pattern and keyLength data members. They 1.63 + // are used only by masks() and getIndexValue() which are called 1.64 + // only during build time, not during run-time. Perhaps these 1.65 + // methods and pattern/keyLength can be isolated into a separate 1.66 + // object. 1.67 + 1.68 + /** 1.69 + * The match that must occur before the key, or null if there is no 1.70 + * preceding context. 1.71 + */ 1.72 + StringMatcher *anteContext; 1.73 + 1.74 + /** 1.75 + * The matcher object for the key. If null, then the key is empty. 1.76 + */ 1.77 + StringMatcher *key; 1.78 + 1.79 + /** 1.80 + * The match that must occur after the key, or null if there is no 1.81 + * following context. 1.82 + */ 1.83 + StringMatcher *postContext; 1.84 + 1.85 + /** 1.86 + * The object that performs the replacement if the key, 1.87 + * anteContext, and postContext are matched. Never null. 1.88 + */ 1.89 + UnicodeFunctor* output; 1.90 + 1.91 + /** 1.92 + * The string that must be matched, consisting of the anteContext, key, 1.93 + * and postContext, concatenated together, in that order. Some components 1.94 + * may be empty (zero length). 1.95 + * @see anteContextLength 1.96 + * @see keyLength 1.97 + */ 1.98 + UnicodeString pattern; 1.99 + 1.100 + /** 1.101 + * An array of matcher objects corresponding to the input pattern 1.102 + * segments. If there are no segments this is null. N.B. This is 1.103 + * a UnicodeMatcher for generality, but in practice it is always a 1.104 + * StringMatcher. In the future we may generalize this, but for 1.105 + * now we sometimes cast down to StringMatcher. 1.106 + * 1.107 + * The array is owned, but the pointers within it are not. 1.108 + */ 1.109 + UnicodeFunctor** segments; 1.110 + 1.111 + /** 1.112 + * The number of elements in segments[] or zero if segments is NULL. 1.113 + */ 1.114 + int32_t segmentsCount; 1.115 + 1.116 + /** 1.117 + * The length of the string that must match before the key. If 1.118 + * zero, then there is no matching requirement before the key. 1.119 + * Substring [0,anteContextLength) of pattern is the anteContext. 1.120 + */ 1.121 + int32_t anteContextLength; 1.122 + 1.123 + /** 1.124 + * The length of the key. Substring [anteContextLength, 1.125 + * anteContextLength + keyLength) is the key. 1.126 + 1.127 + */ 1.128 + int32_t keyLength; 1.129 + 1.130 + /** 1.131 + * Miscellaneous attributes. 1.132 + */ 1.133 + int8_t flags; 1.134 + 1.135 + /** 1.136 + * Flag attributes. 1.137 + */ 1.138 + enum { 1.139 + ANCHOR_START = 1, 1.140 + ANCHOR_END = 2 1.141 + }; 1.142 + 1.143 + /** 1.144 + * An alias pointer to the data for this rule. The data provides 1.145 + * lookup services for matchers and segments. 1.146 + */ 1.147 + const TransliterationRuleData* data; 1.148 + 1.149 +public: 1.150 + 1.151 + /** 1.152 + * Construct a new rule with the given input, output text, and other 1.153 + * attributes. A cursor position may be specified for the output text. 1.154 + * @param input input string, including key and optional ante and 1.155 + * post context. 1.156 + * @param anteContextPos offset into input to end of ante context, or -1 if 1.157 + * none. Must be <= input.length() if not -1. 1.158 + * @param postContextPos offset into input to start of post context, or -1 1.159 + * if none. Must be <= input.length() if not -1, and must be >= 1.160 + * anteContextPos. 1.161 + * @param outputStr output string. 1.162 + * @param cursorPosition offset into output at which cursor is located, or -1 if 1.163 + * none. If less than zero, then the cursor is placed after the 1.164 + * <code>output</code>; that is, -1 is equivalent to 1.165 + * <code>output.length()</code>. If greater than 1.166 + * <code>output.length()</code> then an exception is thrown. 1.167 + * @param cursorOffset an offset to be added to cursorPos to position the 1.168 + * cursor either in the ante context, if < 0, or in the post context, if > 1.169 + * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to 1.170 + * "xyz" and moves the cursor to before "a". It would have a cursorOffset 1.171 + * of -3. 1.172 + * @param segs array of UnicodeMatcher corresponding to input pattern 1.173 + * segments, or null if there are none. The array itself is adopted, 1.174 + * but the pointers within it are not. 1.175 + * @param segsCount number of elements in segs[]. 1.176 + * @param anchorStart TRUE if the the rule is anchored on the left to 1.177 + * the context start. 1.178 + * @param anchorEnd TRUE if the rule is anchored on the right to the 1.179 + * context limit. 1.180 + * @param data the rule data. 1.181 + * @param status Output parameter filled in with success or failure status. 1.182 + */ 1.183 + TransliterationRule(const UnicodeString& input, 1.184 + int32_t anteContextPos, int32_t postContextPos, 1.185 + const UnicodeString& outputStr, 1.186 + int32_t cursorPosition, int32_t cursorOffset, 1.187 + UnicodeFunctor** segs, 1.188 + int32_t segsCount, 1.189 + UBool anchorStart, UBool anchorEnd, 1.190 + const TransliterationRuleData* data, 1.191 + UErrorCode& status); 1.192 + 1.193 + /** 1.194 + * Copy constructor. 1.195 + * @param other the object to be copied. 1.196 + */ 1.197 + TransliterationRule(TransliterationRule& other); 1.198 + 1.199 + /** 1.200 + * Destructor. 1.201 + */ 1.202 + virtual ~TransliterationRule(); 1.203 + 1.204 + /** 1.205 + * Change the data object that this rule belongs to. Used 1.206 + * internally by the TransliterationRuleData copy constructor. 1.207 + * @param data the new data value to be set. 1.208 + */ 1.209 + void setData(const TransliterationRuleData* data); 1.210 + 1.211 + /** 1.212 + * Return the preceding context length. This method is needed to 1.213 + * support the <code>Transliterator</code> method 1.214 + * <code>getMaximumContextLength()</code>. Internally, this is 1.215 + * implemented as the anteContextLength, optionally plus one if 1.216 + * there is a start anchor. The one character anchor gap is 1.217 + * needed to make repeated incremental transliteration with 1.218 + * anchors work. 1.219 + * @return the preceding context length. 1.220 + */ 1.221 + virtual int32_t getContextLength(void) const; 1.222 + 1.223 + /** 1.224 + * Internal method. Returns 8-bit index value for this rule. 1.225 + * This is the low byte of the first character of the key, 1.226 + * unless the first character of the key is a set. If it's a 1.227 + * set, or otherwise can match multiple keys, the index value is -1. 1.228 + * @return 8-bit index value for this rule. 1.229 + */ 1.230 + int16_t getIndexValue() const; 1.231 + 1.232 + /** 1.233 + * Internal method. Returns true if this rule matches the given 1.234 + * index value. The index value is an 8-bit integer, 0..255, 1.235 + * representing the low byte of the first character of the key. 1.236 + * It matches this rule if it matches the first character of the 1.237 + * key, or if the first character of the key is a set, and the set 1.238 + * contains any character with a low byte equal to the index 1.239 + * value. If the rule contains only ante context, as in foo)>bar, 1.240 + * then it will match any key. 1.241 + * @param v the given index value. 1.242 + * @return true if this rule matches the given index value. 1.243 + */ 1.244 + UBool matchesIndexValue(uint8_t v) const; 1.245 + 1.246 + /** 1.247 + * Return true if this rule masks another rule. If r1 masks r2 then 1.248 + * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks 1.249 + * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". 1.250 + * "[c]a>x" masks "[dc]a>y". 1.251 + * @param r2 the given rule to be compared with. 1.252 + * @return true if this rule masks 'r2' 1.253 + */ 1.254 + virtual UBool masks(const TransliterationRule& r2) const; 1.255 + 1.256 + /** 1.257 + * Attempt a match and replacement at the given position. Return 1.258 + * the degree of match between this rule and the given text. The 1.259 + * degree of match may be mismatch, a partial match, or a full 1.260 + * match. A mismatch means at least one character of the text 1.261 + * does not match the context or key. A partial match means some 1.262 + * context and key characters match, but the text is not long 1.263 + * enough to match all of them. A full match means all context 1.264 + * and key characters match. 1.265 + * 1.266 + * If a full match is obtained, perform a replacement, update pos, 1.267 + * and return U_MATCH. Otherwise both text and pos are unchanged. 1.268 + * 1.269 + * @param text the text 1.270 + * @param pos the position indices 1.271 + * @param incremental if TRUE, test for partial matches that may 1.272 + * be completed by additional text inserted at pos.limit. 1.273 + * @return one of <code>U_MISMATCH</code>, 1.274 + * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If 1.275 + * incremental is FALSE then U_PARTIAL_MATCH will not be returned. 1.276 + */ 1.277 + UMatchDegree matchAndReplace(Replaceable& text, 1.278 + UTransPosition& pos, 1.279 + UBool incremental) const; 1.280 + 1.281 + /** 1.282 + * Create a rule string that represents this rule object. Append 1.283 + * it to the given string. 1.284 + */ 1.285 + virtual UnicodeString& toRule(UnicodeString& pat, 1.286 + UBool escapeUnprintable) const; 1.287 + 1.288 + /** 1.289 + * Union the set of all characters that may be modified by this rule 1.290 + * into the given set. 1.291 + */ 1.292 + void addSourceSetTo(UnicodeSet& toUnionTo) const; 1.293 + 1.294 + /** 1.295 + * Union the set of all characters that may be emitted by this rule 1.296 + * into the given set. 1.297 + */ 1.298 + void addTargetSetTo(UnicodeSet& toUnionTo) const; 1.299 + 1.300 + private: 1.301 + 1.302 + friend class StringMatcher; 1.303 + 1.304 + TransliterationRule &operator=(const TransliterationRule &other); // forbid copying of this class 1.305 +}; 1.306 + 1.307 +U_NAMESPACE_END 1.308 + 1.309 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.310 + 1.311 +#endif