michael@0: /*
michael@0:  * Copyright (C) 2001-2011, International Business Machines Corporation
michael@0:  * and others. All Rights Reserved.
michael@0:  **********************************************************************
michael@0:  *   Date        Name        Description
michael@0:  *   07/23/01    aliu        Creation.
michael@0:  **********************************************************************
michael@0:  */
michael@0: #ifndef STRMATCH_H
michael@0: #define STRMATCH_H
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: 
michael@0: #if !UCONFIG_NO_TRANSLITERATION
michael@0: 
michael@0: #include "unicode/unistr.h"
michael@0: #include "unicode/unifunct.h"
michael@0: #include "unicode/unimatch.h"
michael@0: #include "unicode/unirepl.h"
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: class TransliterationRuleData;
michael@0: 
michael@0: /**
michael@0:  * An object that matches a fixed input string, implementing the
michael@0:  * UnicodeMatcher API.  This object also implements the
michael@0:  * UnicodeReplacer API, allowing it to emit the matched text as
michael@0:  * output.  Since the match text may contain flexible match elements,
michael@0:  * such as UnicodeSets, the emitted text is not the match pattern, but
michael@0:  * instead a substring of the actual matched text.  Following
michael@0:  * convention, the output text is the leftmost match seen up to this
michael@0:  * point.
michael@0:  *
michael@0:  * A StringMatcher may represent a segment, in which case it has a
michael@0:  * positive segment number.  This affects how the matcher converts
michael@0:  * itself to a pattern but does not otherwise affect its function.
michael@0:  *
michael@0:  * A StringMatcher that is not a segment should not be used as a
michael@0:  * UnicodeReplacer.
michael@0:  *
michael@0:  * NOTE(review): the text above says the emitted text is the
michael@0:  * leftmost match, while the matchStart/matchLimit members below are
michael@0:  * documented as tracking the rightmost match -- confirm which is
michael@0:  * intended against the implementation.
michael@0:  */
michael@0: class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
michael@0: 
michael@0:  public:
michael@0: 
michael@0:     /**
michael@0:      * Construct a matcher that matches the given pattern string.
michael@0:      * @param string the pattern to be matched, possibly containing
michael@0:      * stand-ins that represent nested UnicodeMatcher objects.
michael@0:      * @param start inclusive start index of text to be replaced
michael@0:      * @param limit exclusive end index of text to be replaced;
michael@0:      * must be greater than or equal to start
michael@0:      * @param segmentNum the segment number from 1..n, or 0 if this is
michael@0:      * not a segment.
michael@0:      * @param data context object mapping stand-ins to
michael@0:      * UnicodeMatcher objects.
michael@0:      *
michael@0:      * NOTE(review): the start/limit descriptions are word for word
michael@0:      * those of replace(); presumably here they delimit the pattern
michael@0:      * within 'string' -- confirm against the implementation.
michael@0:      */
michael@0:     StringMatcher(const UnicodeString& string,
michael@0:                   int32_t start,
michael@0:                   int32_t limit,
michael@0:                   int32_t segmentNum,
michael@0:                   const TransliterationRuleData& data);
michael@0: 
michael@0:     /**
michael@0:      * Copy constructor
michael@0:      * @param o  the object to be copied.
michael@0:      */
michael@0:     StringMatcher(const StringMatcher& o);
michael@0:         
michael@0:     /**
michael@0:      * Destructor
michael@0:      */
michael@0:     virtual ~StringMatcher();
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeFunctor
michael@0:      * @return a copy of the object.
michael@0:      */
michael@0:     virtual UnicodeFunctor* clone() const;
michael@0: 
michael@0:     /**
michael@0:      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
michael@0:      * and return the pointer.
michael@0:      * @return the UnicodeMatcher pointer.
michael@0:      */
michael@0:     virtual UnicodeMatcher* toMatcher() const;
michael@0: 
michael@0:     /**
michael@0:      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
michael@0:      * and return the pointer.
michael@0:      * @return the UnicodeReplacer pointer.
michael@0:      */
michael@0:     virtual UnicodeReplacer* toReplacer() const;
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeMatcher
michael@0:      * @param text the text to be matched
michael@0:      * @param offset on input, the index into text at which to begin
michael@0:      * matching.  On output, the limit of the matched text.  The
michael@0:      * number of matched characters is the output value of offset
michael@0:      * minus the input value.  Offset should always point to the
michael@0:      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
michael@0:      * both on entry and upon return.
michael@0:      * @param limit the limit index of text to be matched.  Greater
michael@0:      * than offset for a forward direction match, less than offset for
michael@0:      * a backward direction match.  The last character to be
michael@0:      * considered for matching will be text.charAt(limit-1) in the
michael@0:      * forward direction or text.charAt(limit+1) in the backward
michael@0:      * direction.
michael@0:      * @param incremental  if TRUE, then assume further characters may
michael@0:      * be inserted at limit and check for partial matching.  Otherwise
michael@0:      * assume the text as given is complete.
michael@0:      * @return a match degree value indicating a full match, a partial
michael@0:      * match, or a mismatch.  If incremental is FALSE then
michael@0:      * U_PARTIAL_MATCH should never be returned.
michael@0:      */
michael@0:     virtual UMatchDegree matches(const Replaceable& text,
michael@0:                                  int32_t& offset,
michael@0:                                  int32_t limit,
michael@0:                                  UBool incremental);
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeMatcher
michael@0:      * @param result            Output param to receive the pattern.
michael@0:      * @param escapeUnprintable if TRUE then escape the unprintable characters.
michael@0:      * @return                  A reference to 'result'.
michael@0:      */
michael@0:     virtual UnicodeString& toPattern(UnicodeString& result,
michael@0:                                      UBool escapeUnprintable = FALSE) const;
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeMatcher
michael@0:      * Returns TRUE if this matcher will match a character c, where c
michael@0:      * & 0xFF == v, at offset, in the forward direction (with limit >
michael@0:      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
michael@0:      * indexing.
michael@0:      * @param v    the given value
michael@0:      * @return     TRUE if this matcher will match a character c, 
michael@0:      *             where c & 0xFF == v
michael@0:      */
michael@0:     virtual UBool matchesIndexValue(uint8_t v) const;
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeMatcher
michael@0:      * Presumably unions the set of all characters that may be matched
michael@0:      * by this object into the given set, mirroring
michael@0:      * addReplacementSetTo() -- confirm against UnicodeMatcher.
michael@0:      * @param toUnionTo the set to union into (passed by non-const
michael@0:      * reference)
michael@0:      */
michael@0:     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
michael@0: 
michael@0:     /**
michael@0:      * Implement UnicodeFunctor
michael@0:      * Presumably replaces the context object held in the 'data'
michael@0:      * member -- confirm against the implementation.
michael@0:      */
michael@0:     virtual void setData(const TransliterationRuleData*);
michael@0: 
michael@0:     /**
michael@0:      * Replace characters in 'text' from 'start' to 'limit' with the
michael@0:      * output text of this object.  Update the 'cursor' parameter to
michael@0:      * give the cursor position and return the length of the
michael@0:      * replacement text.
michael@0:      *
michael@0:      * @param text the text in which characters are replaced
michael@0:      * @param start inclusive start index of text to be replaced
michael@0:      * @param limit exclusive end index of text to be replaced;
michael@0:      * must be greater than or equal to start
michael@0:      * @param cursor output parameter for the cursor position.
michael@0:      * Not all replacer objects will update this, but in a complete
michael@0:      * tree of replacer objects, representing the entire output side
michael@0:      * of a transliteration rule, at least one must update it.
michael@0:      * @return the number of 16-bit code units in the text replacing
michael@0:      * the characters at offsets start..(limit-1) in text
michael@0:      */
michael@0:     virtual int32_t replace(Replaceable& text,
michael@0:                             int32_t start,
michael@0:                             int32_t limit,
michael@0:                             int32_t& cursor);
michael@0: 
michael@0:     /**
michael@0:      * Returns a string representation of this replacer.  If the
michael@0:      * result of calling this function is passed to the appropriate
michael@0:      * parser, typically TransliteratorParser, it will produce another
michael@0:      * replacer that is equal to this one.
michael@0:      * @param result the string to receive the pattern.  Previous
michael@0:      * contents will be deleted.
michael@0:      * @param escapeUnprintable if TRUE then convert unprintable
michael@0:      * characters to their hex escape representations, \\uxxxx or
michael@0:      * \\Uxxxxxxxx.  Unprintable characters are defined by
michael@0:      * Utility.isUnprintable().
michael@0:      * @return a reference to 'result'.
michael@0:      */
michael@0:     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
michael@0:                                              UBool escapeUnprintable) const;
michael@0: 
michael@0:     /**
michael@0:      * Remove any match data.  This must be called before performing a
michael@0:      * set of matches with this segment.
michael@0:      */
michael@0:     void resetMatch();
michael@0: 
michael@0:     /**
michael@0:      * ICU "poor man's RTTI", returns a UClassID for the actual class.
michael@0:      */
michael@0:     virtual UClassID getDynamicClassID() const;
michael@0: 
michael@0:     /**
michael@0:      * ICU "poor man's RTTI", returns a UClassID for this class.
michael@0:      */
michael@0:     static UClassID U_EXPORT2 getStaticClassID();
michael@0: 
michael@0:     /**
michael@0:      * Union the set of all characters that may be output by this
michael@0:      * object into the given set.
michael@0:      * @param toUnionTo the set into which to union the output characters
michael@0:      */
michael@0:     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
michael@0: 
michael@0:  private:
michael@0: 
michael@0:     /**
michael@0:      * The text to be matched.
michael@0:      */
michael@0:     UnicodeString pattern;
michael@0: 
michael@0:     /**
michael@0:      * Context object that maps stand-ins to matcher and replacer
michael@0:      * objects.
michael@0:      */
michael@0:     const TransliterationRuleData* data;
michael@0: 
michael@0:     /**
michael@0:      * The segment number, 1-based, or 0 if not a segment.
michael@0:      */
michael@0:     int32_t segmentNumber;
michael@0: 
michael@0:     /**
michael@0:      * Start offset, in the match text, of the <em>rightmost</em>
michael@0:      * match.
michael@0:      */
michael@0:     int32_t matchStart;
michael@0: 
michael@0:     /**
michael@0:      * Limit offset, in the match text, of the <em>rightmost</em>
michael@0:      * match.
michael@0:      */
michael@0:     int32_t matchLimit;
michael@0: 
michael@0: };
michael@0: 
michael@0: U_NAMESPACE_END
michael@0: 
michael@0: #endif /* #if !UCONFIG_NO_TRANSLITERATION */
michael@0: 
michael@0: #endif