Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright (C) {1999-2001}, International Business Machines Corporation and others. All Rights Reserved. |
michael@0 | 3 | ********************************************************************** |
michael@0 | 4 | * Date Name Description |
michael@0 | 5 | * 11/17/99 aliu Creation. |
michael@0 | 6 | ********************************************************************** |
michael@0 | 7 | */ |
michael@0 | 8 | #ifndef RBT_RULE_H |
michael@0 | 9 | #define RBT_RULE_H |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | |
michael@0 | 13 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 14 | |
michael@0 | 15 | #include "unicode/uobject.h" |
michael@0 | 16 | #include "unicode/unistr.h" |
michael@0 | 17 | #include "unicode/utrans.h" |
michael@0 | 18 | #include "unicode/unimatch.h" |
michael@0 | 19 | |
michael@0 | 20 | U_NAMESPACE_BEGIN |
michael@0 | 21 | /* Forward declarations: these types are used in this header only through pointers and references. */ |
michael@0 | 22 | class Replaceable; |
michael@0 | 23 | class TransliterationRuleData; |
michael@0 | 24 | class StringMatcher; |
michael@0 | 25 | class UnicodeFunctor; |
michael@0 | 26 | |
michael@0 | 27 | /** |
michael@0 | 28 | * A transliteration rule used by |
michael@0 | 29 | * <code>RuleBasedTransliterator</code>. |
michael@0 | 30 | * <code>TransliterationRule</code> is an immutable object. |
michael@0 | 31 | * |
michael@0 | 32 | * <p>A rule consists of an input pattern and an output string. When |
michael@0 | 33 | * the input pattern is matched, the output string is emitted. The |
michael@0 | 34 | * input pattern consists of zero or more characters which are matched |
michael@0 | 35 | * exactly (the key) and optional context. Context must match if it |
michael@0 | 36 | * is specified. Context may be specified before the key, after the |
michael@0 | 37 | * key, or both. The key, preceding context, and following context |
michael@0 | 38 | * may contain variables. Variables represent a set of Unicode |
michael@0 | 39 | * characters, such as the letters <i>a</i> through <i>z</i>. |
michael@0 | 40 | * Variables are detected by looking up each character in a supplied |
michael@0 | 41 | * variable list to see if it has been so defined. |
michael@0 | 42 | * |
michael@0 | 43 | * <p>A rule may contain segments in its input string and segment |
michael@0 | 44 | * references in its output string. A segment is a substring of the |
michael@0 | 45 | * input pattern, indicated by an offset and limit. The segment may |
michael@0 | 46 | * be in the preceding or following context. It may not span a |
michael@0 | 47 | * context boundary. A segment reference is a special character in |
michael@0 | 48 | * the output string that causes a segment of the input string (not |
michael@0 | 49 | * the input pattern) to be copied to the output string. The range of |
michael@0 | 50 | * special characters that represent segment references is defined by |
michael@0 | 51 | * RuleBasedTransliterator.Data. |
michael@0 | 52 | * |
michael@0 | 53 | * @author Alan Liu |
michael@0 | 54 | */ |
michael@0 | 55 | class TransliterationRule : public UMemory { |
michael@0 | 56 | |
michael@0 | 57 | private: |
michael@0 | 58 | |
michael@0 | 59 | // TODO Eliminate the pattern and keyLength data members. They |
michael@0 | 60 | // are used only by masks() and getIndexValue() which are called |
michael@0 | 61 | // only during build time, not during run-time. Perhaps these |
michael@0 | 62 | // methods and pattern/keyLength can be isolated into a separate |
michael@0 | 63 | // object. |
michael@0 | 64 | |
michael@0 | 65 | /** |
michael@0 | 66 | * The match that must occur before the key, or null if there is no |
michael@0 | 67 | * preceding context. |
michael@0 | 68 | */ |
michael@0 | 69 | StringMatcher *anteContext; |
michael@0 | 70 | |
michael@0 | 71 | /** |
michael@0 | 72 | * The matcher object for the key. If null, then the key is empty. |
michael@0 | 73 | */ |
michael@0 | 74 | StringMatcher *key; |
michael@0 | 75 | |
michael@0 | 76 | /** |
michael@0 | 77 | * The match that must occur after the key, or null if there is no |
michael@0 | 78 | * following context. |
michael@0 | 79 | */ |
michael@0 | 80 | StringMatcher *postContext; |
michael@0 | 81 | |
michael@0 | 82 | /** |
michael@0 | 83 | * The object that performs the replacement if the key, |
michael@0 | 84 | * anteContext, and postContext are matched. Never null. |
michael@0 | 85 | */ |
michael@0 | 86 | UnicodeFunctor* output; |
michael@0 | 87 | |
michael@0 | 88 | /** |
michael@0 | 89 | * The string that must be matched, consisting of the anteContext, key, |
michael@0 | 90 | * and postContext, concatenated together, in that order. Some components |
michael@0 | 91 | * may be empty (zero length). |
michael@0 | 92 | * @see anteContextLength |
michael@0 | 93 | * @see keyLength |
michael@0 | 94 | */ |
michael@0 | 95 | UnicodeString pattern; |
michael@0 | 96 | |
michael@0 | 97 | /** |
michael@0 | 98 | * An array of matcher objects corresponding to the input pattern |
michael@0 | 99 | * segments. If there are no segments this is null. N.B. This is |
michael@0 | 100 | * a UnicodeFunctor for generality (matching the declared element type), but in practice it is always a |
michael@0 | 101 | * StringMatcher. In the future we may generalize this, but for |
michael@0 | 102 | * now we sometimes cast down to StringMatcher. |
michael@0 | 103 | * |
michael@0 | 104 | * The array is owned, but the pointers within it are not. |
michael@0 | 105 | */ |
michael@0 | 106 | UnicodeFunctor** segments; |
michael@0 | 107 | |
michael@0 | 108 | /** |
michael@0 | 109 | * The number of elements in segments[] or zero if segments is NULL. |
michael@0 | 110 | */ |
michael@0 | 111 | int32_t segmentsCount; |
michael@0 | 112 | |
michael@0 | 113 | /** |
michael@0 | 114 | * The length of the string that must match before the key. If |
michael@0 | 115 | * zero, then there is no matching requirement before the key. |
michael@0 | 116 | * Substring [0,anteContextLength) of pattern is the anteContext. |
michael@0 | 117 | */ |
michael@0 | 118 | int32_t anteContextLength; |
michael@0 | 119 | |
michael@0 | 120 | /** |
michael@0 | 121 | * The length of the key. Substring [anteContextLength, |
michael@0 | 122 | * anteContextLength + keyLength) of pattern is the key. |
michael@0 | 123 | |
michael@0 | 124 | */ |
michael@0 | 125 | int32_t keyLength; |
michael@0 | 126 | |
michael@0 | 127 | /** |
michael@0 | 128 | * Miscellaneous attributes. Presumably a bitwise OR of the ANCHOR_* flag values declared below (TODO confirm against the implementation). |
michael@0 | 129 | */ |
michael@0 | 130 | int8_t flags; |
michael@0 | 131 | |
michael@0 | 132 | /** |
michael@0 | 133 | * Flag attributes. |
michael@0 | 134 | */ |
michael@0 | 135 | enum { |
michael@0 | 136 | ANCHOR_START = 1, |
michael@0 | 137 | ANCHOR_END = 2 |
michael@0 | 138 | }; |
michael@0 | 139 | |
michael@0 | 140 | /** |
michael@0 | 141 | * An alias pointer to the data for this rule. The data provides |
michael@0 | 142 | * lookup services for matchers and segments. |
michael@0 | 143 | */ |
michael@0 | 144 | const TransliterationRuleData* data; |
michael@0 | 145 | |
michael@0 | 146 | public: |
michael@0 | 147 | |
michael@0 | 148 | /** |
michael@0 | 149 | * Construct a new rule with the given input, output text, and other |
michael@0 | 150 | * attributes. A cursor position may be specified for the output text. |
michael@0 | 151 | * @param input input string, including key and optional ante and |
michael@0 | 152 | * post context. |
michael@0 | 153 | * @param anteContextPos offset into input to end of ante context, or -1 if |
michael@0 | 154 | * none. Must be <= input.length() if not -1. |
michael@0 | 155 | * @param postContextPos offset into input to start of post context, or -1 |
michael@0 | 156 | * if none. Must be <= input.length() if not -1, and must be >= |
michael@0 | 157 | * anteContextPos. |
michael@0 | 158 | * @param outputStr output string. |
michael@0 | 159 | * @param cursorPosition offset into output at which cursor is located, or -1 if |
michael@0 | 160 | * none. If less than zero, then the cursor is placed after the |
michael@0 | 161 | * <code>outputStr</code>; that is, -1 is equivalent to |
michael@0 | 162 | * <code>outputStr.length()</code>. If greater than |
michael@0 | 163 | * <code>outputStr.length()</code> then an error is reported (presumably through status; TODO confirm). |
michael@0 | 164 | * @param cursorOffset an offset to be added to cursorPosition to position the |
michael@0 | 165 | * cursor either in the ante context, if < 0, or in the post context, if > |
michael@0 | 166 | * 0. For example, the rule "abc{def} > | @@@ xyz;" changes "def" to |
michael@0 | 167 | * "xyz" and moves the cursor to before "a". It would have a cursorOffset |
michael@0 | 168 | * of -3. |
michael@0 | 169 | * @param segs array of UnicodeFunctor corresponding to input pattern |
michael@0 | 170 | * segments, or null if there are none. The array itself is adopted, |
michael@0 | 171 | * but the pointers within it are not. |
michael@0 | 172 | * @param segsCount number of elements in segs[]. |
michael@0 | 173 | * @param anchorStart TRUE if the rule is anchored on the left to |
michael@0 | 174 | * the context start. |
michael@0 | 175 | * @param anchorEnd TRUE if the rule is anchored on the right to the |
michael@0 | 176 | * context limit. |
michael@0 | 177 | * @param data the rule data. |
michael@0 | 178 | * @param status Output parameter filled in with success or failure status. |
michael@0 | 179 | */ |
michael@0 | 180 | TransliterationRule(const UnicodeString& input, |
michael@0 | 181 | int32_t anteContextPos, int32_t postContextPos, |
michael@0 | 182 | const UnicodeString& outputStr, |
michael@0 | 183 | int32_t cursorPosition, int32_t cursorOffset, |
michael@0 | 184 | UnicodeFunctor** segs, |
michael@0 | 185 | int32_t segsCount, |
michael@0 | 186 | UBool anchorStart, UBool anchorEnd, |
michael@0 | 187 | const TransliterationRuleData* data, |
michael@0 | 188 | UErrorCode& status); |
michael@0 | 189 | |
michael@0 | 190 | /** |
michael@0 | 191 | * Copy constructor. Takes a non-const reference, so it cannot copy from a const rule or from a temporary. |
michael@0 | 192 | * @param other the object to be copied. |
michael@0 | 193 | */ |
michael@0 | 194 | TransliterationRule(TransliterationRule& other); |
michael@0 | 195 | |
michael@0 | 196 | /** |
michael@0 | 197 | * Destructor. |
michael@0 | 198 | */ |
michael@0 | 199 | virtual ~TransliterationRule(); |
michael@0 | 200 | |
michael@0 | 201 | /** |
michael@0 | 202 | * Change the data object that this rule belongs to. Used |
michael@0 | 203 | * internally by the TransliterationRuleData copy constructor. |
michael@0 | 204 | * @param data the new data value to be set. |
michael@0 | 205 | */ |
michael@0 | 206 | void setData(const TransliterationRuleData* data); |
michael@0 | 207 | |
michael@0 | 208 | /** |
michael@0 | 209 | * Return the preceding context length. This method is needed to |
michael@0 | 210 | * support the <code>Transliterator</code> method |
michael@0 | 211 | * <code>getMaximumContextLength()</code>. Internally, this is |
michael@0 | 212 | * implemented as the anteContextLength, optionally plus one if |
michael@0 | 213 | * there is a start anchor. The one character anchor gap is |
michael@0 | 214 | * needed to make repeated incremental transliteration with |
michael@0 | 215 | * anchors work. |
michael@0 | 216 | * @return the preceding context length. |
michael@0 | 217 | */ |
michael@0 | 218 | virtual int32_t getContextLength(void) const; |
michael@0 | 219 | |
michael@0 | 220 | /** |
michael@0 | 221 | * Internal method. Returns 8-bit index value for this rule. |
michael@0 | 222 | * This is the low byte of the first character of the key, |
michael@0 | 223 | * unless the first character of the key is a set. If it's a |
michael@0 | 224 | * set, or otherwise can match multiple keys, the index value is -1. |
michael@0 | 225 | * @return 8-bit index value for this rule (0..255), or -1 as described above. |
michael@0 | 226 | */ |
michael@0 | 227 | int16_t getIndexValue() const; |
michael@0 | 228 | |
michael@0 | 229 | /** |
michael@0 | 230 | * Internal method. Returns true if this rule matches the given |
michael@0 | 231 | * index value. The index value is an 8-bit integer, 0..255, |
michael@0 | 232 | * representing the low byte of the first character of the key. |
michael@0 | 233 | * It matches this rule if it matches the first character of the |
michael@0 | 234 | * key, or if the first character of the key is a set, and the set |
michael@0 | 235 | * contains any character with a low byte equal to the index |
michael@0 | 236 | * value. If the rule contains only ante context, as in foo)>bar, |
michael@0 | 237 | * then it will match any key. |
michael@0 | 238 | * @param v the given index value. |
michael@0 | 239 | * @return true if this rule matches the given index value. |
michael@0 | 240 | */ |
michael@0 | 241 | UBool matchesIndexValue(uint8_t v) const; |
michael@0 | 242 | |
michael@0 | 243 | /** |
michael@0 | 244 | * Return true if this rule masks another rule. If r1 masks r2 then |
michael@0 | 245 | * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks |
michael@0 | 246 | * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". |
michael@0 | 247 | * "[c]a>x" masks "[dc]a>y". |
michael@0 | 248 | * @param r2 the given rule to be compared with. |
michael@0 | 249 | * @return true if this rule masks 'r2' |
michael@0 | 250 | */ |
michael@0 | 251 | virtual UBool masks(const TransliterationRule& r2) const; |
michael@0 | 252 | |
michael@0 | 253 | /** |
michael@0 | 254 | * Attempt a match and replacement at the given position. Return |
michael@0 | 255 | * the degree of match between this rule and the given text. The |
michael@0 | 256 | * degree of match may be mismatch, a partial match, or a full |
michael@0 | 257 | * match. A mismatch means at least one character of the text |
michael@0 | 258 | * does not match the context or key. A partial match means some |
michael@0 | 259 | * context and key characters match, but the text is not long |
michael@0 | 260 | * enough to match all of them. A full match means all context |
michael@0 | 261 | * and key characters match. |
michael@0 | 262 | * |
michael@0 | 263 | * If a full match is obtained, perform a replacement, update pos, |
michael@0 | 264 | * and return U_MATCH. Otherwise both text and pos are unchanged. |
michael@0 | 265 | * |
michael@0 | 266 | * @param text the text |
michael@0 | 267 | * @param pos the position indices |
michael@0 | 268 | * @param incremental if TRUE, test for partial matches that may |
michael@0 | 269 | * be completed by additional text inserted at pos.limit. |
michael@0 | 270 | * @return one of <code>U_MISMATCH</code>, |
michael@0 | 271 | * <code>U_PARTIAL_MATCH</code>, or <code>U_MATCH</code>. If |
michael@0 | 272 | * incremental is FALSE then U_PARTIAL_MATCH will not be returned. |
michael@0 | 273 | */ |
michael@0 | 274 | UMatchDegree matchAndReplace(Replaceable& text, |
michael@0 | 275 | UTransPosition& pos, |
michael@0 | 276 | UBool incremental) const; |
michael@0 | 277 | |
michael@0 | 278 | /** |
michael@0 | 279 | * Create a rule string that represents this rule object and append it to pat. |
michael@0 | 280 | * @param pat the string to append to. NOTE(review): escapeUnprintable presumably requests escaping of unprintable characters, and the return value is presumably pat itself; confirm both against the implementation. |
michael@0 | 281 | */ |
michael@0 | 282 | virtual UnicodeString& toRule(UnicodeString& pat, |
michael@0 | 283 | UBool escapeUnprintable) const; |
michael@0 | 284 | |
michael@0 | 285 | /** |
michael@0 | 286 | * Union the set of all characters that may be modified by this rule |
michael@0 | 287 | * into the given set, toUnionTo. |
michael@0 | 288 | */ |
michael@0 | 289 | void addSourceSetTo(UnicodeSet& toUnionTo) const; |
michael@0 | 290 | |
michael@0 | 291 | /** |
michael@0 | 292 | * Union the set of all characters that may be emitted by this rule |
michael@0 | 293 | * into the given set, toUnionTo. |
michael@0 | 294 | */ |
michael@0 | 295 | void addTargetSetTo(UnicodeSet& toUnionTo) const; |
michael@0 | 296 | |
michael@0 | 297 | private: |
michael@0 | 298 | |
michael@0 | 299 | friend class StringMatcher; |
michael@0 | 300 | |
michael@0 | 301 | TransliterationRule &operator=(const TransliterationRule &other); // declared private to forbid assignment; copy construction remains public via the copy constructor |
michael@0 | 302 | }; |
michael@0 | 303 | |
michael@0 | 304 | U_NAMESPACE_END |
michael@0 | 305 | |
michael@0 | 306 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 307 | |
michael@0 | 308 | #endif |