michael@0: /*
michael@0: * Copyright (C) 2001-2011, International Business Machines Corporation
michael@0: * and others. All Rights Reserved.
michael@0: **********************************************************************
michael@0: * Date Name Description
michael@0: * 07/23/01 aliu Creation.
michael@0: **********************************************************************
michael@0: */
michael@0: #ifndef STRMATCH_H
michael@0: #define STRMATCH_H
michael@0:
michael@0: #include "unicode/utypes.h"
michael@0:
michael@0: #if !UCONFIG_NO_TRANSLITERATION
michael@0:
michael@0: #include "unicode/unistr.h"
michael@0: #include "unicode/unifunct.h"
michael@0: #include "unicode/unimatch.h"
michael@0: #include "unicode/unirepl.h"
michael@0:
michael@0: U_NAMESPACE_BEGIN
michael@0:
michael@0: class TransliterationRuleData;
michael@0:
michael@0: /**
michael@0: * An object that matches a fixed input string, implementing the
michael@0: * UnicodeMatcher API. This object also implements the
michael@0: * UnicodeReplacer API, allowing it to emit the matched text as
michael@0: * output. Since the match text may contain flexible match elements,
michael@0: * such as UnicodeSets, the emitted text is not the match pattern, but
michael@0: * instead a substring of the actual matched text. Following
michael@0: * convention, the output text is the leftmost match seen up to this
michael@0: * point.
michael@0: *
michael@0: * A StringMatcher may represent a segment, in which case it has a
michael@0: * positive segment number. This affects how the matcher converts
michael@0: * itself to a pattern but does not otherwise affect its function.
michael@0: *
michael@0: * A StringMatcher that is not a segment should not be used as a
michael@0: * UnicodeReplacer.
michael@0: */
michael@0: class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
michael@0:
michael@0: public:
michael@0:
michael@0: /**
michael@0: * Construct a matcher that matches the given pattern string.
michael@0: * @param string the pattern to be matched, possibly containing
michael@0: * stand-ins that represent nested UnicodeMatcher objects.
michael@0: * @param start inclusive start index of text to be replaced
michael@0: * @param limit exclusive end index of text to be replaced;
michael@0: * must be greater than or equal to start
michael@0: * @param segmentNum the segment number from 1..n, or 0 if this is
michael@0: * not a segment.
michael@0: * @param data context object mapping stand-ins to
michael@0: * UnicodeMatcher objects.
michael@0: *
michael@0: * NOTE(review): the wording of 'start' and 'limit' is identical to
michael@0: * that of replace() below and looks copied from it. Presumably
michael@0: * they delimit the portion of 'string' that forms the pattern --
michael@0: * confirm against the implementation.
michael@0: */
michael@0: StringMatcher(const UnicodeString& string,
michael@0: int32_t start,
michael@0: int32_t limit,
michael@0: int32_t segmentNum,
michael@0: const TransliterationRuleData& data);
michael@0:
michael@0: /**
michael@0: * Copy constructor
michael@0: * @param o the object to be copied.
michael@0: */
michael@0: StringMatcher(const StringMatcher& o);
michael@0:
michael@0: /**
michael@0: * Destructor
michael@0: */
michael@0: virtual ~StringMatcher();
michael@0:
michael@0: /**
michael@0: * Implement UnicodeFunctor
michael@0: * @return a copy of the object.
michael@0: */
michael@0: virtual UnicodeFunctor* clone() const;
michael@0:
michael@0: /**
michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeMatcher* pointer
michael@0: * and return the pointer.
michael@0: * @return the UnicodeMatcher pointer.
michael@0: */
michael@0: virtual UnicodeMatcher* toMatcher() const;
michael@0:
michael@0: /**
michael@0: * UnicodeFunctor API. Cast 'this' to a UnicodeReplacer* pointer
michael@0: * and return the pointer.
michael@0: * @return the UnicodeReplacer pointer.
michael@0: */
michael@0: virtual UnicodeReplacer* toReplacer() const;
michael@0:
michael@0: /**
michael@0: * Implement UnicodeMatcher
michael@0: * @param text the text to be matched
michael@0: * @param offset on input, the index into text at which to begin
michael@0: * matching. On output, the limit of the matched text. The
michael@0: * number of matched characters is the output value of offset
michael@0: * minus the input value. Offset should always point to the
michael@0: * HIGH SURROGATE (leading code unit) of a pair of surrogates,
michael@0: * both on entry and upon return.
michael@0: * @param limit the limit index of text to be matched. Greater
michael@0: * than offset for a forward direction match, less than offset for
michael@0: * a backward direction match. The last character to be
michael@0: * considered for matching will be text.charAt(limit-1) in the
michael@0: * forward direction or text.charAt(limit+1) in the backward
michael@0: * direction.
michael@0: * @param incremental if TRUE, then assume further characters may
michael@0: * be inserted at limit and check for partial matching. Otherwise
michael@0: * assume the text as given is complete.
michael@0: * @return a match degree value indicating a full match, a partial
michael@0: * match, or a mismatch. If incremental is FALSE then
michael@0: * U_PARTIAL_MATCH should never be returned.
michael@0: */
michael@0: virtual UMatchDegree matches(const Replaceable& text,
michael@0: int32_t& offset,
michael@0: int32_t limit,
michael@0: UBool incremental);
michael@0:
michael@0: /**
michael@0: * Implement UnicodeMatcher
michael@0: * @param result Output param to receive the pattern.
michael@0: * @param escapeUnprintable if TRUE then escape the unprintable
michael@0: * characters. Defaults to FALSE.
michael@0: * @return A reference to 'result'.
michael@0: */
michael@0: virtual UnicodeString& toPattern(UnicodeString& result,
michael@0: UBool escapeUnprintable = FALSE) const;
michael@0:
michael@0: /**
michael@0: * Implement UnicodeMatcher
michael@0: * Returns TRUE if this matcher will match a character c, where c
michael@0: * & 0xFF == v, at offset, in the forward direction (with limit >
michael@0: * offset). This is used by RuleBasedTransliterator for
michael@0: * indexing.
michael@0: * @param v the given value
michael@0: * @return TRUE if this matcher will match a character c,
michael@0: * where c & 0xFF == v
michael@0: */
michael@0: virtual UBool matchesIndexValue(uint8_t v) const;
michael@0:
michael@0: /**
michael@0: * Implement UnicodeMatcher
michael@0: * Per the UnicodeMatcher contract, union the set of all
michael@0: * characters that may be matched by this object into the given
michael@0: * set.
michael@0: * @param toUnionTo the set into which to union the matched
michael@0: * characters
michael@0: */
michael@0: virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
michael@0:
michael@0: /**
michael@0: * Implement UnicodeFunctor
michael@0: * Takes a pointer to a TransliterationRuleData object; the
michael@0: * parameter is unnamed in this declaration. Presumably it
michael@0: * replaces the 'data' context object held by this matcher --
michael@0: * confirm against the implementation.
michael@0: */
michael@0: virtual void setData(const TransliterationRuleData*);
michael@0:
michael@0: /**
michael@0: * Replace characters in 'text' from 'start' to 'limit' with the
michael@0: * output text of this object. Update the 'cursor' parameter to
michael@0: * give the cursor position and return the length of the
michael@0: * replacement text.
michael@0: *
michael@0: * @param text the text in which the replacement is performed
michael@0: * @param start inclusive start index of text to be replaced
michael@0: * @param limit exclusive end index of text to be replaced;
michael@0: * must be greater than or equal to start
michael@0: * @param cursor output parameter for the cursor position.
michael@0: * Not all replacer objects will update this, but in a complete
michael@0: * tree of replacer objects, representing the entire output side
michael@0: * of a transliteration rule, at least one must update it.
michael@0: * @return the number of 16-bit code units in the text replacing
michael@0: * the characters at offsets start..(limit-1) in text
michael@0: */
michael@0: virtual int32_t replace(Replaceable& text,
michael@0: int32_t start,
michael@0: int32_t limit,
michael@0: int32_t& cursor);
michael@0:
michael@0: /**
michael@0: * Returns a string representation of this replacer. If the
michael@0: * result of calling this function is passed to the appropriate
michael@0: * parser, typically TransliteratorParser, it will produce another
michael@0: * replacer that is equal to this one.
michael@0: * @param result the string to receive the pattern. Previous
michael@0: * contents will be deleted.
michael@0: * @param escapeUnprintable if TRUE then convert unprintable
michael@0: * characters to their hex escape representations, \\uxxxx or
michael@0: * \\Uxxxxxxxx. Unprintable characters are defined by
michael@0: * Utility.isUnprintable().
michael@0: * @return a reference to 'result'.
michael@0: */
michael@0: virtual UnicodeString& toReplacerPattern(UnicodeString& result,
michael@0: UBool escapeUnprintable) const;
michael@0:
michael@0: /**
michael@0: * Remove any match data. This must be called before performing a
michael@0: * set of matches with this segment.
michael@0: */
michael@0: void resetMatch();
michael@0:
michael@0: /**
michael@0: * ICU "poor man's RTTI", returns a UClassID for the actual class.
michael@0: */
michael@0: virtual UClassID getDynamicClassID() const;
michael@0:
michael@0: /**
michael@0: * ICU "poor man's RTTI", returns a UClassID for this class.
michael@0: */
michael@0: static UClassID U_EXPORT2 getStaticClassID();
michael@0:
michael@0: /**
michael@0: * Union the set of all characters that may be output by this
michael@0: * object into the given set.
michael@0: * @param toUnionTo the set into which to union the output characters
michael@0: */
michael@0: virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
michael@0:
michael@0: private:
michael@0:
michael@0: /**
michael@0: * The text to be matched.
michael@0: */
michael@0: UnicodeString pattern;
michael@0:
michael@0: /**
michael@0: * Context object that maps stand-ins to matcher and replacer
michael@0: * objects.
michael@0: */
michael@0: const TransliterationRuleData* data;
michael@0:
michael@0: /**
michael@0: * The segment number, 1-based, or 0 if not a segment.
michael@0: */
michael@0: int32_t segmentNumber;
michael@0:
michael@0: /**
michael@0: * Start offset, in the match text, of the rightmost
michael@0: * match.
michael@0: * NOTE(review): the class-level comment speaks of the "leftmost
michael@0: * match"; which of the two wordings is accurate cannot be told
michael@0: * from this header -- confirm against the implementation.
michael@0: */
michael@0: int32_t matchStart;
michael@0:
michael@0: /**
michael@0: * Limit offset, in the match text, of the rightmost
michael@0: * match. Presumably exclusive, by analogy with the other
michael@0: * 'limit' parameters in this class -- confirm.
michael@0: */
michael@0: int32_t matchLimit;
michael@0:
michael@0: };
michael@0:
michael@0: U_NAMESPACE_END
michael@0:
michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0:
michael@0: #endif