intl/icu/source/i18n/strmatch.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 * Copyright (C) 2001-2011, International Business Machines Corporation
michael@0 3 * and others. All Rights Reserved.
michael@0 4 **********************************************************************
michael@0 5 * Date Name Description
michael@0 6 * 07/23/01 aliu Creation.
michael@0 7 **********************************************************************
michael@0 8 */
michael@0 9 #ifndef STRMATCH_H
michael@0 10 #define STRMATCH_H
michael@0 11
michael@0 12 #include "unicode/utypes.h"
michael@0 13
michael@0 14 #if !UCONFIG_NO_TRANSLITERATION
michael@0 15
michael@0 16 #include "unicode/unistr.h"
michael@0 17 #include "unicode/unifunct.h"
michael@0 18 #include "unicode/unimatch.h"
michael@0 19 #include "unicode/unirepl.h"
michael@0 20
michael@0 21 U_NAMESPACE_BEGIN
michael@0 22
michael@0 23 class TransliterationRuleData; // forward declaration: this header uses the type only by reference or pointer
michael@0 24
michael@0 25 /**
michael@0 26 * An object that matches a fixed input string, implementing the
michael@0 27 * UnicodeMatcher API. This object also implements the
michael@0 28 * UnicodeReplacer API, allowing it to emit the matched text as
michael@0 29 * output. Since the match text may contain flexible match elements,
michael@0 30 * such as UnicodeSets, the emitted text is not the match pattern, but
michael@0 31 * instead a substring of the actual matched text. Following
michael@0 32 * convention, the output text is the leftmost match seen up to this
michael@0 33 * point.
michael@0 34 *
michael@0 35 * A StringMatcher may represent a segment, in which case it has a
michael@0 36 * positive segment number. This affects how the matcher converts
michael@0 37 * itself to a pattern but does not otherwise affect its function.
michael@0 38 *
michael@0 39 * A StringMatcher that is not a segment should not be used as a
michael@0 40 * UnicodeReplacer.
michael@0 41 */
michael@0 42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
michael@0 43
michael@0 44 public:
michael@0 45
michael@0 46 /**
michael@0 47 * Construct a matcher that matches the given pattern string.
michael@0 48 * @param string the pattern to be matched, possibly containing
michael@0 49 * stand-ins that represent nested UnicodeMatcher objects.
michael@0 50 * @param start inclusive start index of text to be replaced
michael@0 51 * @param limit exclusive end index of text to be replaced;
michael@0 52 * must be greater than or equal to start. NOTE(review): this start/limit wording is identical to that of replace(); presumably here they delimit the pattern within the string parameter -- confirm against the implementation.
michael@0 53 * @param segmentNum the segment number from 1..n, or 0 if this is
michael@0 54 * not a segment.
michael@0 55 * @param data context object mapping stand-ins to
michael@0 56 * UnicodeMatcher objects.
michael@0 57 */
michael@0 58 StringMatcher(const UnicodeString& string,
michael@0 59 int32_t start,
michael@0 60 int32_t limit,
michael@0 61 int32_t segmentNum,
michael@0 62 const TransliterationRuleData& data);
michael@0 63
michael@0 64 /**
michael@0 65 * Copy constructor
michael@0 66 * @param o the object to be copied.
michael@0 67 */
michael@0 68 StringMatcher(const StringMatcher& o);
michael@0 69
michael@0 70 /**
michael@0 71 * Destructor
michael@0 72 */
michael@0 73 virtual ~StringMatcher();
michael@0 74
michael@0 75 /**
michael@0 76 * Implement UnicodeFunctor
michael@0 77 * @return a copy of the object.
michael@0 78 */
michael@0 79 virtual UnicodeFunctor* clone() const;
michael@0 80
michael@0 81 /**
michael@0 82 * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
michael@0 83 * and return the pointer.
michael@0 84 * @return the UnicodeMatcher pointer.
michael@0 85 */
michael@0 86 virtual UnicodeMatcher* toMatcher() const;
michael@0 87
michael@0 88 /**
michael@0 89 * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
michael@0 90 * and return the pointer.
michael@0 91 * @return the UnicodeReplacer pointer.
michael@0 92 */
michael@0 93 virtual UnicodeReplacer* toReplacer() const;
michael@0 94
michael@0 95 /**
michael@0 96 * Implement UnicodeMatcher
michael@0 97 * @param text the text to be matched
michael@0 98 * @param offset on input, the index into text at which to begin
michael@0 99 * matching. On output, the limit of the matched text. The
michael@0 100 * number of matched characters is the output value of offset
michael@0 101 * minus the input value. Offset should always point to the
michael@0 102 * HIGH SURROGATE (leading code unit) of a pair of surrogates,
michael@0 103 * both on entry and upon return.
michael@0 104 * @param limit the limit index of text to be matched. Greater
michael@0 105 * than offset for a forward direction match, less than offset for
michael@0 106 * a backward direction match. The last character to be
michael@0 107 * considered for matching will be text.charAt(limit-1) in the
michael@0 108 * forward direction or text.charAt(limit+1) in the backward
michael@0 109 * direction.
michael@0 110 * @param incremental if TRUE, then assume further characters may
michael@0 111 * be inserted at limit and check for partial matching. Otherwise
michael@0 112 * assume the text as given is complete.
michael@0 113 * @return a match degree value indicating a full match, a partial
michael@0 114 * match, or a mismatch. If incremental is FALSE then
michael@0 115 * U_PARTIAL_MATCH should never be returned.
michael@0 116 */
michael@0 117 virtual UMatchDegree matches(const Replaceable& text,
michael@0 118 int32_t& offset,
michael@0 119 int32_t limit,
michael@0 120 UBool incremental);
michael@0 121
michael@0 122 /**
michael@0 123 * Implement UnicodeMatcher
michael@0 124 * @param result Output param to receive the pattern.
michael@0 125 * @param escapeUnprintable if TRUE then escape the unprintable characters.
michael@0 126 * @return A reference to 'result'.
michael@0 127 */
michael@0 128 virtual UnicodeString& toPattern(UnicodeString& result,
michael@0 129 UBool escapeUnprintable = FALSE) const;
michael@0 130
michael@0 131 /**
michael@0 132 * Implement UnicodeMatcher
michael@0 133 * Returns TRUE if this matcher will match a character c, where c
michael@0 134 * & 0xFF == v, at offset, in the forward direction (with limit >
michael@0 135 * offset). This is used by <tt>RuleBasedTransliterator</tt> for
michael@0 136 * indexing.
michael@0 137 * @param v the given value
michael@0 138 * @return TRUE if this matcher will match a character c,
michael@0 139 * where c & 0xFF == v
michael@0 140 */
michael@0 141 virtual UBool matchesIndexValue(uint8_t v) const;
michael@0 142
michael@0 143 /**
michael@0 144 * Implement UnicodeMatcher. The toUnionTo parameter is the set to be modified; presumably it receives the union of the characters this matcher can match -- confirm against the implementation.
michael@0 145 */
michael@0 146 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
michael@0 147
michael@0 148 /**
michael@0 149 * Implement UnicodeFunctor. The unnamed parameter is presumably the new context object used to resolve stand-ins -- confirm against the implementation.
michael@0 150 */
michael@0 151 virtual void setData(const TransliterationRuleData*);
michael@0 152
michael@0 153 /**
michael@0 154 * Replace characters in 'text' from 'start' to 'limit' with the
michael@0 155 * output text of this object. Update the 'cursor' parameter to
michael@0 156 * give the cursor position and return the length of the
michael@0 157 * replacement text.
michael@0 158 *
michael@0 159 * @param text the text to be matched
michael@0 160 * @param start inclusive start index of text to be replaced
michael@0 161 * @param limit exclusive end index of text to be replaced;
michael@0 162 * must be greater than or equal to start
michael@0 163 * @param cursor output parameter for the cursor position.
michael@0 164 * Not all replacer objects will update this, but in a complete
michael@0 165 * tree of replacer objects, representing the entire output side
michael@0 166 * of a transliteration rule, at least one must update it.
michael@0 167 * @return the number of 16-bit code units in the text replacing
michael@0 168 * the characters at offsets start..(limit-1) in text
michael@0 169 */
michael@0 170 virtual int32_t replace(Replaceable& text,
michael@0 171 int32_t start,
michael@0 172 int32_t limit,
michael@0 173 int32_t& cursor);
michael@0 174
michael@0 175 /**
michael@0 176 * Returns a string representation of this replacer. If the
michael@0 177 * result of calling this function is passed to the appropriate
michael@0 178 * parser, typically TransliteratorParser, it will produce another
michael@0 179 * replacer that is equal to this one.
michael@0 180 * @param result the string to receive the pattern. Previous
michael@0 181 * contents will be deleted.
michael@0 182 * @param escapeUnprintable if TRUE then convert unprintable
michael@0 183 * characters to their hex escape representations, \\uxxxx or
michael@0 184 * \\Uxxxxxxxx. Unprintable characters are defined by
michael@0 185 * Utility.isUnprintable().
michael@0 186 * @return a reference to 'result'.
michael@0 187 */
michael@0 188 virtual UnicodeString& toReplacerPattern(UnicodeString& result,
michael@0 189 UBool escapeUnprintable) const;
michael@0 190
michael@0 191 /**
michael@0 192 * Remove any match data. This must be called before performing a
michael@0 193 * set of matches with this segment.
michael@0 194 */
michael@0 195 void resetMatch();
michael@0 196
michael@0 197 /**
michael@0 198 * ICU "poor man's RTTI", returns a UClassID for the actual class.
michael@0 199 */
michael@0 200 virtual UClassID getDynamicClassID() const;
michael@0 201
michael@0 202 /**
michael@0 203 * ICU "poor man's RTTI", returns a UClassID for this class.
michael@0 204 */
michael@0 205 static UClassID U_EXPORT2 getStaticClassID();
michael@0 206
michael@0 207 /**
michael@0 208 * Union the set of all characters that may be output by this object
michael@0 209 * into the given set.
michael@0 210 * @param toUnionTo the set into which to union the output characters
michael@0 211 */
michael@0 212 virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
michael@0 213
michael@0 214 private:
michael@0 215
michael@0 216 /**
michael@0 217 * The text to be matched.
michael@0 218 */
michael@0 219 UnicodeString pattern;
michael@0 220
michael@0 221 /**
michael@0 222 * Context object that maps stand-ins to matcher and replacer
michael@0 223 * objects.
michael@0 224 */
michael@0 225 const TransliterationRuleData* data;
michael@0 226
michael@0 227 /**
michael@0 228 * The segment number, 1-based, or 0 if not a segment.
michael@0 229 */
michael@0 230 int32_t segmentNumber;
michael@0 231
michael@0 232 /**
michael@0 233 * Start offset, in the match text, of the <em>rightmost</em>
michael@0 234 * match.
michael@0 235 */
michael@0 236 int32_t matchStart;
michael@0 237
michael@0 238 /**
michael@0 239 * Limit offset, in the match text, of the <em>rightmost</em>
michael@0 240 * match.
michael@0 241 */
michael@0 242 int32_t matchLimit;
michael@0 243
michael@0 244 };
michael@0 245
michael@0 246 U_NAMESPACE_END
michael@0 247
michael@0 248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0 249
michael@0 250 #endif

mercurial