1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/i18n/strmatch.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,250 @@ 1.4 +/* 1.5 + * Copyright (C) 2001-2011, International Business Machines Corporation 1.6 + * and others. All Rights Reserved. 1.7 + ********************************************************************** 1.8 + * Date Name Description 1.9 + * 07/23/01 aliu Creation. 1.10 + ********************************************************************** 1.11 + */ 1.12 +#ifndef STRMATCH_H 1.13 +#define STRMATCH_H 1.14 + 1.15 +#include "unicode/utypes.h" 1.16 + 1.17 +#if !UCONFIG_NO_TRANSLITERATION 1.18 + 1.19 +#include "unicode/unistr.h" 1.20 +#include "unicode/unifunct.h" 1.21 +#include "unicode/unimatch.h" 1.22 +#include "unicode/unirepl.h" 1.23 + 1.24 +U_NAMESPACE_BEGIN 1.25 + 1.26 +class TransliterationRuleData; 1.27 + 1.28 +/** 1.29 + * An object that matches a fixed input string, implementing the 1.30 + * UnicodeMatcher API. This object also implements the 1.31 + * UnicodeReplacer API, allowing it to emit the matched text as 1.32 + * output. Since the match text may contain flexible match elements, 1.33 + * such as UnicodeSets, the emitted text is not the match pattern, but 1.34 + * instead a substring of the actual matched text. Following 1.35 + * convention, the output text is the leftmost match seen up to this 1.36 + * point. 1.37 + * 1.38 + * A StringMatcher may represent a segment, in which case it has a 1.39 + * positive segment number. This affects how the matcher converts 1.40 + * itself to a pattern but does not otherwise affect its function. 1.41 + * 1.42 + * A StringMatcher that is not a segment should not be used as a 1.43 + * UnicodeReplacer. 1.44 + */ 1.45 +class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { 1.46 + 1.47 + public: 1.48 + 1.49 + /** 1.50 + * Construct a matcher that matches the given pattern string. 1.51 + * @param string the pattern to be matched, possibly containing 1.52 + * stand-ins that represent nested UnicodeMatcher objects. 1.53 + * @param start inclusive start index of text to be replaced 1.54 + * @param limit exclusive end index of text to be replaced; 1.55 + * must be greater than or equal to start 1.56 + * @param segmentNum the segment number from 1..n, or 0 if this is 1.57 + * not a segment. 1.58 + * @param data context object mapping stand-ins to 1.59 + * UnicodeMatcher objects. 1.60 + */ 1.61 + StringMatcher(const UnicodeString& string, 1.62 + int32_t start, 1.63 + int32_t limit, 1.64 + int32_t segmentNum, 1.65 + const TransliterationRuleData& data); 1.66 + 1.67 + /** 1.68 + * Copy constructor 1.69 + * @param o the object to be copied. 1.70 + */ 1.71 + StringMatcher(const StringMatcher& o); 1.72 + 1.73 + /** 1.74 + * Destructor 1.75 + */ 1.76 + virtual ~StringMatcher(); 1.77 + 1.78 + /** 1.79 + * Implement UnicodeFunctor 1.80 + * @return a copy of the object. 1.81 + */ 1.82 + virtual UnicodeFunctor* clone() const; 1.83 + 1.84 + /** 1.85 + * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer 1.86 + * and return the pointer. 1.87 + * @return the UnicodeMatcher point. 1.88 + */ 1.89 + virtual UnicodeMatcher* toMatcher() const; 1.90 + 1.91 + /** 1.92 + * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer 1.93 + * and return the pointer. 1.94 + * @return the UnicodeReplacer pointer. 1.95 + */ 1.96 + virtual UnicodeReplacer* toReplacer() const; 1.97 + 1.98 + /** 1.99 + * Implement UnicodeMatcher 1.100 + * @param text the text to be matched 1.101 + * @param offset on input, the index into text at which to begin 1.102 + * matching. On output, the limit of the matched text. The 1.103 + * number of matched characters is the output value of offset 1.104 + * minus the input value. Offset should always point to the 1.105 + * HIGH SURROGATE (leading code unit) of a pair of surrogates, 1.106 + * both on entry and upon return. 1.107 + * @param limit the limit index of text to be matched. Greater 1.108 + * than offset for a forward direction match, less than offset for 1.109 + * a backward direction match. The last character to be 1.110 + * considered for matching will be text.charAt(limit-1) in the 1.111 + * forward direction or text.charAt(limit+1) in the backward 1.112 + * direction. 1.113 + * @param incremental if TRUE, then assume further characters may 1.114 + * be inserted at limit and check for partial matching. Otherwise 1.115 + * assume the text as given is complete. 1.116 + * @return a match degree value indicating a full match, a partial 1.117 + * match, or a mismatch. If incremental is FALSE then 1.118 + * U_PARTIAL_MATCH should never be returned. 1.119 + */ 1.120 + virtual UMatchDegree matches(const Replaceable& text, 1.121 + int32_t& offset, 1.122 + int32_t limit, 1.123 + UBool incremental); 1.124 + 1.125 + /** 1.126 + * Implement UnicodeMatcher 1.127 + * @param result Output param to receive the pattern. 1.128 + * @param escapeUnprintable if True then escape the unprintable characters. 1.129 + * @return A reference to 'result'. 1.130 + */ 1.131 + virtual UnicodeString& toPattern(UnicodeString& result, 1.132 + UBool escapeUnprintable = FALSE) const; 1.133 + 1.134 + /** 1.135 + * Implement UnicodeMatcher 1.136 + * Returns TRUE if this matcher will match a character c, where c 1.137 + * & 0xFF == v, at offset, in the forward direction (with limit > 1.138 + * offset). This is used by <tt>RuleBasedTransliterator</tt> for 1.139 + * indexing. 1.140 + * @param v the given value 1.141 + * @return TRUE if this matcher will match a character c, 1.142 + * where c & 0xFF == v 1.143 + */ 1.144 + virtual UBool matchesIndexValue(uint8_t v) const; 1.145 + 1.146 + /** 1.147 + * Implement UnicodeMatcher 1.148 + */ 1.149 + virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; 1.150 + 1.151 + /** 1.152 + * Implement UnicodeFunctor 1.153 + */ 1.154 + virtual void setData(const TransliterationRuleData*); 1.155 + 1.156 + /** 1.157 + * Replace characters in 'text' from 'start' to 'limit' with the 1.158 + * output text of this object. Update the 'cursor' parameter to 1.159 + * give the cursor position and return the length of the 1.160 + * replacement text. 1.161 + * 1.162 + * @param text the text to be matched 1.163 + * @param start inclusive start index of text to be replaced 1.164 + * @param limit exclusive end index of text to be replaced; 1.165 + * must be greater than or equal to start 1.166 + * @param cursor output parameter for the cursor position. 1.167 + * Not all replacer objects will update this, but in a complete 1.168 + * tree of replacer objects, representing the entire output side 1.169 + * of a transliteration rule, at least one must update it. 1.170 + * @return the number of 16-bit code units in the text replacing 1.171 + * the characters at offsets start..(limit-1) in text 1.172 + */ 1.173 + virtual int32_t replace(Replaceable& text, 1.174 + int32_t start, 1.175 + int32_t limit, 1.176 + int32_t& cursor); 1.177 + 1.178 + /** 1.179 + * Returns a string representation of this replacer. If the 1.180 + * result of calling this function is passed to the appropriate 1.181 + * parser, typically TransliteratorParser, it will produce another 1.182 + * replacer that is equal to this one. 1.183 + * @param result the string to receive the pattern. Previous 1.184 + * contents will be deleted. 1.185 + * @param escapeUnprintable if TRUE then convert unprintable 1.186 + * character to their hex escape representations, \\uxxxx or 1.187 + * \\Uxxxxxxxx. Unprintable characters are defined by 1.188 + * Utility.isUnprintable(). 1.189 + * @return a reference to 'result'. 1.190 + */ 1.191 + virtual UnicodeString& toReplacerPattern(UnicodeString& result, 1.192 + UBool escapeUnprintable) const; 1.193 + 1.194 + /** 1.195 + * Remove any match data. This must be called before performing a 1.196 + * set of matches with this segment. 1.197 + */ 1.198 + void resetMatch(); 1.199 + 1.200 + /** 1.201 + * ICU "poor man's RTTI", returns a UClassID for the actual class. 1.202 + */ 1.203 + virtual UClassID getDynamicClassID() const; 1.204 + 1.205 + /** 1.206 + * ICU "poor man's RTTI", returns a UClassID for this class. 1.207 + */ 1.208 + static UClassID U_EXPORT2 getStaticClassID(); 1.209 + 1.210 + /** 1.211 + * Union the set of all characters that may output by this object 1.212 + * into the given set. 1.213 + * @param toUnionTo the set into which to union the output characters 1.214 + */ 1.215 + virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; 1.216 + 1.217 + private: 1.218 + 1.219 + /** 1.220 + * The text to be matched. 1.221 + */ 1.222 + UnicodeString pattern; 1.223 + 1.224 + /** 1.225 + * Context object that maps stand-ins to matcher and replacer 1.226 + * objects. 1.227 + */ 1.228 + const TransliterationRuleData* data; 1.229 + 1.230 + /** 1.231 + * The segment number, 1-based, or 0 if not a segment. 1.232 + */ 1.233 + int32_t segmentNumber; 1.234 + 1.235 + /** 1.236 + * Start offset, in the match text, of the <em>rightmost</em> 1.237 + * match. 1.238 + */ 1.239 + int32_t matchStart; 1.240 + 1.241 + /** 1.242 + * Limit offset, in the match text, of the <em>rightmost</em> 1.243 + * match. 1.244 + */ 1.245 + int32_t matchLimit; 1.246 + 1.247 +}; 1.248 + 1.249 +U_NAMESPACE_END 1.250 + 1.251 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */ 1.252 + 1.253 +#endif