Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | * Copyright (C) 2001-2011, International Business Machines Corporation |
michael@0 | 3 | * and others. All Rights Reserved. |
michael@0 | 4 | ********************************************************************** |
michael@0 | 5 | * Date Name Description |
michael@0 | 6 | * 07/23/01 aliu Creation. |
michael@0 | 7 | ********************************************************************** |
michael@0 | 8 | */ |
michael@0 | 9 | #ifndef STRMATCH_H |
michael@0 | 10 | #define STRMATCH_H |
michael@0 | 11 | |
michael@0 | 12 | #include "unicode/utypes.h" |
michael@0 | 13 | |
michael@0 | 14 | #if !UCONFIG_NO_TRANSLITERATION |
michael@0 | 15 | |
michael@0 | 16 | #include "unicode/unistr.h" |
michael@0 | 17 | #include "unicode/unifunct.h" |
michael@0 | 18 | #include "unicode/unimatch.h" |
michael@0 | 19 | #include "unicode/unirepl.h" |
michael@0 | 20 | |
michael@0 | 21 | U_NAMESPACE_BEGIN |
michael@0 | 22 | |
michael@0 | 23 | class TransliterationRuleData; |
michael@0 | 24 | |
michael@0 | 25 | /** |
michael@0 | 26 | * An object that matches a fixed input string, implementing the |
michael@0 | 27 | * UnicodeMatcher API. This object also implements the |
michael@0 | 28 | * UnicodeReplacer API, allowing it to emit the matched text as |
michael@0 | 29 | * output. Since the match text may contain flexible match elements, |
michael@0 | 30 | * such as UnicodeSets, the emitted text is not the match pattern, but |
michael@0 | 31 | * instead a substring of the actual matched text. Following |
michael@0 | 32 | * convention, the output text is the leftmost match seen up to this |
michael@0 | 33 | * point. |
michael@0 | 34 | * |
michael@0 | 35 | * A StringMatcher may represent a segment, in which case it has a |
michael@0 | 36 | * positive segment number. This affects how the matcher converts |
michael@0 | 37 | * itself to a pattern but does not otherwise affect its function. |
michael@0 | 38 | * |
michael@0 | 39 | * A StringMatcher that is not a segment should not be used as a |
michael@0 | 40 | * UnicodeReplacer. |
michael@0 | 41 | */ |
michael@0 | 42 | class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { |
michael@0 | 43 | |
michael@0 | 44 | public: |
michael@0 | 45 | |
michael@0 | 46 | /** |
michael@0 | 47 | * Construct a matcher that matches the given pattern string. |
michael@0 | 48 | * @param string the pattern to be matched, possibly containing |
michael@0 | 49 | * stand-ins that represent nested UnicodeMatcher objects. |
michael@0 | 50 | * @param start inclusive start index of text to be replaced |
michael@0 | 51 | * @param limit exclusive end index of text to be replaced; |
michael@0 | 52 | * must be greater than or equal to start. NOTE(review): start/limit presumably index into 'string' -- confirm. |
michael@0 | 53 | * @param segmentNum the segment number from 1..n, or 0 if this is |
michael@0 | 54 | * not a segment. |
michael@0 | 55 | * @param data context object mapping stand-ins to |
michael@0 | 56 | * UnicodeMatcher objects. |
michael@0 | 57 | */ |
michael@0 | 58 | StringMatcher(const UnicodeString& string, |
michael@0 | 59 | int32_t start, |
michael@0 | 60 | int32_t limit, |
michael@0 | 61 | int32_t segmentNum, |
michael@0 | 62 | const TransliterationRuleData& data); |
michael@0 | 63 | |
michael@0 | 64 | /** |
michael@0 | 65 | * Copy constructor |
michael@0 | 66 | * @param o the object to be copied. |
michael@0 | 67 | */ |
michael@0 | 68 | StringMatcher(const StringMatcher& o); |
michael@0 | 69 | |
michael@0 | 70 | /** |
michael@0 | 71 | * Destructor |
michael@0 | 72 | */ |
michael@0 | 73 | virtual ~StringMatcher(); |
michael@0 | 74 | |
michael@0 | 75 | /** |
michael@0 | 76 | * Implement UnicodeFunctor |
michael@0 | 77 | * @return a copy of the object. |
michael@0 | 78 | */ |
michael@0 | 79 | virtual UnicodeFunctor* clone() const; |
michael@0 | 80 | |
michael@0 | 81 | /** |
michael@0 | 82 | * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer |
michael@0 | 83 | * and return the pointer. |
michael@0 | 84 | * @return the UnicodeMatcher pointer. |
michael@0 | 85 | */ |
michael@0 | 86 | virtual UnicodeMatcher* toMatcher() const; |
michael@0 | 87 | |
michael@0 | 88 | /** |
michael@0 | 89 | * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer |
michael@0 | 90 | * and return the pointer. |
michael@0 | 91 | * @return the UnicodeReplacer pointer. |
michael@0 | 92 | */ |
michael@0 | 93 | virtual UnicodeReplacer* toReplacer() const; |
michael@0 | 94 | |
michael@0 | 95 | /** |
michael@0 | 96 | * Implement UnicodeMatcher |
michael@0 | 97 | * @param text the text to be matched |
michael@0 | 98 | * @param offset on input, the index into text at which to begin |
michael@0 | 99 | * matching. On output, the limit of the matched text. The |
michael@0 | 100 | * number of matched characters is the output value of offset |
michael@0 | 101 | * minus the input value. Offset should always point to the |
michael@0 | 102 | * HIGH SURROGATE (leading code unit) of a pair of surrogates, |
michael@0 | 103 | * both on entry and upon return. |
michael@0 | 104 | * @param limit the limit index of text to be matched. Greater |
michael@0 | 105 | * than offset for a forward direction match, less than offset for |
michael@0 | 106 | * a backward direction match. The last character to be |
michael@0 | 107 | * considered for matching will be text.charAt(limit-1) in the |
michael@0 | 108 | * forward direction or text.charAt(limit+1) in the backward |
michael@0 | 109 | * direction. |
michael@0 | 110 | * @param incremental if TRUE, then assume further characters may |
michael@0 | 111 | * be inserted at limit and check for partial matching. Otherwise |
michael@0 | 112 | * assume the text as given is complete. |
michael@0 | 113 | * @return a match degree value indicating a full match, a partial |
michael@0 | 114 | * match, or a mismatch. If incremental is FALSE then |
michael@0 | 115 | * U_PARTIAL_MATCH should never be returned. |
michael@0 | 116 | */ |
michael@0 | 117 | virtual UMatchDegree matches(const Replaceable& text, |
michael@0 | 118 | int32_t& offset, |
michael@0 | 119 | int32_t limit, |
michael@0 | 120 | UBool incremental); |
michael@0 | 121 | |
michael@0 | 122 | /** |
michael@0 | 123 | * Implement UnicodeMatcher |
michael@0 | 124 | * @param result Output param to receive the pattern. |
michael@0 | 125 | * @param escapeUnprintable if TRUE then escape unprintable characters. |
michael@0 | 126 | * @return A reference to 'result'. |
michael@0 | 127 | */ |
michael@0 | 128 | virtual UnicodeString& toPattern(UnicodeString& result, |
michael@0 | 129 | UBool escapeUnprintable = FALSE) const; |
michael@0 | 130 | |
michael@0 | 131 | /** |
michael@0 | 132 | * Implement UnicodeMatcher |
michael@0 | 133 | * Returns TRUE if this matcher will match a character c, where c |
michael@0 | 134 | * & 0xFF == v, at offset, in the forward direction (with limit > |
michael@0 | 135 | * offset). This is used by <tt>RuleBasedTransliterator</tt> for |
michael@0 | 136 | * indexing. |
michael@0 | 137 | * @param v the given value |
michael@0 | 138 | * @return TRUE if this matcher will match a character c, |
michael@0 | 139 | * where c & 0xFF == v |
michael@0 | 140 | */ |
michael@0 | 141 | virtual UBool matchesIndexValue(uint8_t v) const; |
michael@0 | 142 | |
michael@0 | 143 | /** |
michael@0 | 144 | * Implement UnicodeMatcher |
michael@0 | 145 | */ |
michael@0 | 146 | virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; |
michael@0 | 147 | |
michael@0 | 148 | /** |
michael@0 | 149 | * Implement UnicodeFunctor |
michael@0 | 150 | */ |
michael@0 | 151 | virtual void setData(const TransliterationRuleData*); |
michael@0 | 152 | |
michael@0 | 153 | /** |
michael@0 | 154 | * Replace characters in 'text' from 'start' to 'limit' with the |
michael@0 | 155 | * output text of this object. Update the 'cursor' parameter to |
michael@0 | 156 | * give the cursor position and return the length of the |
michael@0 | 157 | * replacement text. |
michael@0 | 158 | * |
michael@0 | 159 | * @param text the text to be matched |
michael@0 | 160 | * @param start inclusive start index of text to be replaced |
michael@0 | 161 | * @param limit exclusive end index of text to be replaced; |
michael@0 | 162 | * must be greater than or equal to start |
michael@0 | 163 | * @param cursor output parameter for the cursor position. |
michael@0 | 164 | * Not all replacer objects will update this, but in a complete |
michael@0 | 165 | * tree of replacer objects, representing the entire output side |
michael@0 | 166 | * of a transliteration rule, at least one must update it. |
michael@0 | 167 | * @return the number of 16-bit code units in the text replacing |
michael@0 | 168 | * the characters at offsets start..(limit-1) in text |
michael@0 | 169 | */ |
michael@0 | 170 | virtual int32_t replace(Replaceable& text, |
michael@0 | 171 | int32_t start, |
michael@0 | 172 | int32_t limit, |
michael@0 | 173 | int32_t& cursor); |
michael@0 | 174 | |
michael@0 | 175 | /** |
michael@0 | 176 | * Returns a string representation of this replacer. If the |
michael@0 | 177 | * result of calling this function is passed to the appropriate |
michael@0 | 178 | * parser, typically TransliteratorParser, it will produce another |
michael@0 | 179 | * replacer that is equal to this one. |
michael@0 | 180 | * @param result the string to receive the pattern. Previous |
michael@0 | 181 | * contents will be deleted. |
michael@0 | 182 | * @param escapeUnprintable if TRUE then convert unprintable |
michael@0 | 183 | * characters to their hex escape representations, \\uxxxx or |
michael@0 | 184 | * \\Uxxxxxxxx. Unprintable characters are defined by |
michael@0 | 185 | * Utility.isUnprintable(). |
michael@0 | 186 | * @return a reference to 'result'. |
michael@0 | 187 | */ |
michael@0 | 188 | virtual UnicodeString& toReplacerPattern(UnicodeString& result, |
michael@0 | 189 | UBool escapeUnprintable) const; |
michael@0 | 190 | |
michael@0 | 191 | /** |
michael@0 | 192 | * Remove any match data. This must be called before performing a |
michael@0 | 193 | * set of matches with this segment. |
michael@0 | 194 | */ |
michael@0 | 195 | void resetMatch(); |
michael@0 | 196 | |
michael@0 | 197 | /** |
michael@0 | 198 | * ICU "poor man's RTTI", returns a UClassID for the actual class. |
michael@0 | 199 | */ |
michael@0 | 200 | virtual UClassID getDynamicClassID() const; |
michael@0 | 201 | |
michael@0 | 202 | /** |
michael@0 | 203 | * ICU "poor man's RTTI", returns a UClassID for this class. |
michael@0 | 204 | */ |
michael@0 | 205 | static UClassID U_EXPORT2 getStaticClassID(); |
michael@0 | 206 | |
michael@0 | 207 | /** |
michael@0 | 208 | * Union the set of all characters that may be output by this object |
michael@0 | 209 | * into the given set. |
michael@0 | 210 | * @param toUnionTo the set into which to union the output characters |
michael@0 | 211 | */ |
michael@0 | 212 | virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; |
michael@0 | 213 | |
michael@0 | 214 | private: |
michael@0 | 215 | |
michael@0 | 216 | /** |
michael@0 | 217 | * The text to be matched. |
michael@0 | 218 | */ |
michael@0 | 219 | UnicodeString pattern; |
michael@0 | 220 | |
michael@0 | 221 | /** |
michael@0 | 222 | * Context object that maps stand-ins to matcher and replacer |
michael@0 | 223 | * objects. |
michael@0 | 224 | */ |
michael@0 | 225 | const TransliterationRuleData* data; |
michael@0 | 226 | |
michael@0 | 227 | /** |
michael@0 | 228 | * The segment number, 1-based, or 0 if not a segment. |
michael@0 | 229 | */ |
michael@0 | 230 | int32_t segmentNumber; |
michael@0 | 231 | |
michael@0 | 232 | /** |
michael@0 | 233 | * Start offset, in the match text, of the <em>rightmost</em> |
michael@0 | 234 | * match. |
michael@0 | 235 | */ |
michael@0 | 236 | int32_t matchStart; |
michael@0 | 237 | |
michael@0 | 238 | /** |
michael@0 | 239 | * Limit offset, in the match text, of the <em>rightmost</em> |
michael@0 | 240 | * match. |
michael@0 | 241 | */ |
michael@0 | 242 | int32_t matchLimit; |
michael@0 | 243 | |
michael@0 | 244 | }; |
michael@0 | 245 | |
michael@0 | 246 | U_NAMESPACE_END |
michael@0 | 247 | |
michael@0 | 248 | #endif /* #if !UCONFIG_NO_TRANSLITERATION */ |
michael@0 | 249 | |
michael@0 | 250 | #endif |