michael@0: /* michael@0: * Copyright (C) 2001-2011, International Business Machines Corporation michael@0: * and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * Date Name Description michael@0: * 07/23/01 aliu Creation. michael@0: ********************************************************************** michael@0: */ michael@0: #ifndef STRMATCH_H michael@0: #define STRMATCH_H michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_TRANSLITERATION michael@0: michael@0: #include "unicode/unistr.h" michael@0: #include "unicode/unifunct.h" michael@0: #include "unicode/unimatch.h" michael@0: #include "unicode/unirepl.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: class TransliterationRuleData; michael@0: michael@0: /** michael@0: * An object that matches a fixed input string, implementing the michael@0: * UnicodeMatcher API. This object also implements the michael@0: * UnicodeReplacer API, allowing it to emit the matched text as michael@0: * output. Since the match text may contain flexible match elements, michael@0: * such as UnicodeSets, the emitted text is not the match pattern, but michael@0: * instead a substring of the actual matched text. Following michael@0: * convention, the output text is the leftmost match seen up to this michael@0: * point. michael@0: * michael@0: * A StringMatcher may represent a segment, in which case it has a michael@0: * positive segment number. This affects how the matcher converts michael@0: * itself to a pattern but does not otherwise affect its function. michael@0: * michael@0: * A StringMatcher that is not a segment should not be used as a michael@0: * UnicodeReplacer. michael@0: */ michael@0: class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer { michael@0: michael@0: public: michael@0: michael@0: /** michael@0: * Construct a matcher that matches the given pattern string. michael@0: * @param string the pattern to be matched, possibly containing michael@0: * stand-ins that represent nested UnicodeMatcher objects. michael@0: * @param start inclusive start index of text to be replaced michael@0: * @param limit exclusive end index of text to be replaced; michael@0: * must be greater than or equal to start michael@0: * @param segmentNum the segment number from 1..n, or 0 if this is michael@0: * not a segment. michael@0: * @param data context object mapping stand-ins to michael@0: * UnicodeMatcher objects. michael@0: */ michael@0: StringMatcher(const UnicodeString& string, michael@0: int32_t start, michael@0: int32_t limit, michael@0: int32_t segmentNum, michael@0: const TransliterationRuleData& data); michael@0: michael@0: /** michael@0: * Copy constructor michael@0: * @param o the object to be copied. michael@0: */ michael@0: StringMatcher(const StringMatcher& o); michael@0: michael@0: /** michael@0: * Destructor michael@0: */ michael@0: virtual ~StringMatcher(); michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: * @return a copy of the object. michael@0: */ michael@0: virtual UnicodeFunctor* clone() const; michael@0: michael@0: /** michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer michael@0: * and return the pointer. michael@0: * @return the UnicodeMatcher point. michael@0: */ michael@0: virtual UnicodeMatcher* toMatcher() const; michael@0: michael@0: /** michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer michael@0: * and return the pointer. michael@0: * @return the UnicodeReplacer pointer. michael@0: */ michael@0: virtual UnicodeReplacer* toReplacer() const; michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: * @param text the text to be matched michael@0: * @param offset on input, the index into text at which to begin michael@0: * matching. On output, the limit of the matched text. The michael@0: * number of matched characters is the output value of offset michael@0: * minus the input value. Offset should always point to the michael@0: * HIGH SURROGATE (leading code unit) of a pair of surrogates, michael@0: * both on entry and upon return. michael@0: * @param limit the limit index of text to be matched. Greater michael@0: * than offset for a forward direction match, less than offset for michael@0: * a backward direction match. The last character to be michael@0: * considered for matching will be text.charAt(limit-1) in the michael@0: * forward direction or text.charAt(limit+1) in the backward michael@0: * direction. michael@0: * @param incremental if TRUE, then assume further characters may michael@0: * be inserted at limit and check for partial matching. Otherwise michael@0: * assume the text as given is complete. michael@0: * @return a match degree value indicating a full match, a partial michael@0: * match, or a mismatch. If incremental is FALSE then michael@0: * U_PARTIAL_MATCH should never be returned. michael@0: */ michael@0: virtual UMatchDegree matches(const Replaceable& text, michael@0: int32_t& offset, michael@0: int32_t limit, michael@0: UBool incremental); michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: * @param result Output param to receive the pattern. michael@0: * @param escapeUnprintable if True then escape the unprintable characters. michael@0: * @return A reference to 'result'. michael@0: */ michael@0: virtual UnicodeString& toPattern(UnicodeString& result, michael@0: UBool escapeUnprintable = FALSE) const; michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: * Returns TRUE if this matcher will match a character c, where c michael@0: * & 0xFF == v, at offset, in the forward direction (with limit > michael@0: * offset). This is used by RuleBasedTransliterator for michael@0: * indexing. michael@0: * @param v the given value michael@0: * @return TRUE if this matcher will match a character c, michael@0: * where c & 0xFF == v michael@0: */ michael@0: virtual UBool matchesIndexValue(uint8_t v) const; michael@0: michael@0: /** michael@0: * Implement UnicodeMatcher michael@0: */ michael@0: virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; michael@0: michael@0: /** michael@0: * Implement UnicodeFunctor michael@0: */ michael@0: virtual void setData(const TransliterationRuleData*); michael@0: michael@0: /** michael@0: * Replace characters in 'text' from 'start' to 'limit' with the michael@0: * output text of this object. Update the 'cursor' parameter to michael@0: * give the cursor position and return the length of the michael@0: * replacement text. michael@0: * michael@0: * @param text the text to be matched michael@0: * @param start inclusive start index of text to be replaced michael@0: * @param limit exclusive end index of text to be replaced; michael@0: * must be greater than or equal to start michael@0: * @param cursor output parameter for the cursor position. michael@0: * Not all replacer objects will update this, but in a complete michael@0: * tree of replacer objects, representing the entire output side michael@0: * of a transliteration rule, at least one must update it. michael@0: * @return the number of 16-bit code units in the text replacing michael@0: * the characters at offsets start..(limit-1) in text michael@0: */ michael@0: virtual int32_t replace(Replaceable& text, michael@0: int32_t start, michael@0: int32_t limit, michael@0: int32_t& cursor); michael@0: michael@0: /** michael@0: * Returns a string representation of this replacer. If the michael@0: * result of calling this function is passed to the appropriate michael@0: * parser, typically TransliteratorParser, it will produce another michael@0: * replacer that is equal to this one. michael@0: * @param result the string to receive the pattern. Previous michael@0: * contents will be deleted. michael@0: * @param escapeUnprintable if TRUE then convert unprintable michael@0: * character to their hex escape representations, \\uxxxx or michael@0: * \\Uxxxxxxxx. Unprintable characters are defined by michael@0: * Utility.isUnprintable(). michael@0: * @return a reference to 'result'. michael@0: */ michael@0: virtual UnicodeString& toReplacerPattern(UnicodeString& result, michael@0: UBool escapeUnprintable) const; michael@0: michael@0: /** michael@0: * Remove any match data. This must be called before performing a michael@0: * set of matches with this segment. michael@0: */ michael@0: void resetMatch(); michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for the actual class. michael@0: */ michael@0: virtual UClassID getDynamicClassID() const; michael@0: michael@0: /** michael@0: * ICU "poor man's RTTI", returns a UClassID for this class. michael@0: */ michael@0: static UClassID U_EXPORT2 getStaticClassID(); michael@0: michael@0: /** michael@0: * Union the set of all characters that may output by this object michael@0: * into the given set. michael@0: * @param toUnionTo the set into which to union the output characters michael@0: */ michael@0: virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const; michael@0: michael@0: private: michael@0: michael@0: /** michael@0: * The text to be matched. michael@0: */ michael@0: UnicodeString pattern; michael@0: michael@0: /** michael@0: * Context object that maps stand-ins to matcher and replacer michael@0: * objects. michael@0: */ michael@0: const TransliterationRuleData* data; michael@0: michael@0: /** michael@0: * The segment number, 1-based, or 0 if not a segment. michael@0: */ michael@0: int32_t segmentNumber; michael@0: michael@0: /** michael@0: * Start offset, in the match text, of the rightmost michael@0: * match. michael@0: */ michael@0: int32_t matchStart; michael@0: michael@0: /** michael@0: * Limit offset, in the match text, of the rightmost michael@0: * match. michael@0: */ michael@0: int32_t matchLimit; michael@0: michael@0: }; michael@0: michael@0: U_NAMESPACE_END michael@0: michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */ michael@0: michael@0: #endif