intl/icu/source/i18n/strmatch.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/i18n/strmatch.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,250 @@
     1.4 +/*
     1.5 + * Copyright (C) 2001-2011, International Business Machines Corporation
     1.6 + * and others. All Rights Reserved.
     1.7 + **********************************************************************
     1.8 + *   Date        Name        Description
     1.9 + *   07/23/01    aliu        Creation.
    1.10 + **********************************************************************
    1.11 + */
    1.12 +#ifndef STRMATCH_H
    1.13 +#define STRMATCH_H
    1.14 +
    1.15 +#include "unicode/utypes.h"
    1.16 +
    1.17 +#if !UCONFIG_NO_TRANSLITERATION
    1.18 +
    1.19 +#include "unicode/unistr.h"
    1.20 +#include "unicode/unifunct.h"
    1.21 +#include "unicode/unimatch.h"
    1.22 +#include "unicode/unirepl.h"
    1.23 +
    1.24 +U_NAMESPACE_BEGIN
    1.25 +
    1.26 +class TransliterationRuleData;  // forward declaration: only used via const pointer/reference in this header
    1.27 +
    1.28 +/**
    1.29 + * An object that matches a fixed input string, implementing the
    1.30 + * UnicodeMatcher API.  This object also implements the
    1.31 + * UnicodeReplacer API, allowing it to emit the matched text as
    1.32 + * output.  Since the match text may contain flexible match elements,
    1.33 + * such as UnicodeSets, the emitted text is not the match pattern, but
    1.34 + * instead a substring of the actual matched text.  Following
    1.35 + * convention, the output text is the leftmost match seen up to this
    1.36 + * point.
    1.37 + *
    1.38 + * A StringMatcher may represent a segment, in which case it has a
    1.39 + * positive segment number.  This affects how the matcher converts
    1.40 + * itself to a pattern but does not otherwise affect its function.
    1.41 + *
    1.42 + * A StringMatcher that is not a segment should not be used as a
    1.43 + * UnicodeReplacer.
    1.44 + */
    1.45 +class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
    1.46 +
    1.47 + public:
    1.48 +
    1.49 +    /**
    1.50 +     * Construct a matcher that matches the given pattern string.
    1.51 +     * @param string the pattern to be matched, possibly containing
    1.52 +     * stand-ins that represent nested UnicodeMatcher objects.
    1.53 +     * @param start inclusive start index of the pattern within 'string'
    1.54 +     * @param limit exclusive end index of the pattern within 'string';
    1.55 +     * must be greater than or equal to start (NOTE(review): confirm in strmatch.cpp)
    1.56 +     * @param segmentNum the segment number from 1..n, or 0 if this is
    1.57 +     * not a segment.
    1.58 +     * @param data context object mapping stand-ins to
    1.59 +     * UnicodeMatcher objects.
    1.60 +     */
    1.61 +    StringMatcher(const UnicodeString& string,
    1.62 +                  int32_t start,
    1.63 +                  int32_t limit,
    1.64 +                  int32_t segmentNum,
    1.65 +                  const TransliterationRuleData& data);
    1.66 +
    1.67 +    /**
    1.68 +     * Copy constructor
    1.69 +     * @param o  the object to be copied.
    1.70 +     */
    1.71 +    StringMatcher(const StringMatcher& o);
    1.72 +        
    1.73 +    /**
    1.74 +     * Destructor
    1.75 +     */
    1.76 +    virtual ~StringMatcher();
    1.77 +
    1.78 +    /**
    1.79 +     * Implement UnicodeFunctor
    1.80 +     * @return a copy of the object.
    1.81 +     */
    1.82 +    virtual UnicodeFunctor* clone() const;
    1.83 +
    1.84 +    /**
    1.85 +     * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
    1.86 +     * and return the pointer.
    1.87 +     * @return the UnicodeMatcher pointer.
    1.88 +     */
    1.89 +    virtual UnicodeMatcher* toMatcher() const;
    1.90 +
    1.91 +    /**
    1.92 +     * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
    1.93 +     * and return the pointer.
    1.94 +     * @return the UnicodeReplacer pointer.
    1.95 +     */
    1.96 +    virtual UnicodeReplacer* toReplacer() const;
    1.97 +
    1.98 +    /**
    1.99 +     * Implement UnicodeMatcher
   1.100 +     * @param text the text to be matched
   1.101 +     * @param offset on input, the index into text at which to begin
   1.102 +     * matching.  On output, the limit of the matched text.  The
   1.103 +     * number of matched characters is the output value of offset
   1.104 +     * minus the input value.  Offset should always point to the
   1.105 +     * HIGH SURROGATE (leading code unit) of a pair of surrogates,
   1.106 +     * both on entry and upon return.
   1.107 +     * @param limit the limit index of text to be matched.  Greater
   1.108 +     * than offset for a forward direction match, less than offset for
   1.109 +     * a backward direction match.  The last character to be
   1.110 +     * considered for matching will be text.charAt(limit-1) in the
   1.111 +     * forward direction or text.charAt(limit+1) in the backward
   1.112 +     * direction.
   1.113 +     * @param incremental  if TRUE, then assume further characters may
   1.114 +     * be inserted at limit and check for partial matching.  Otherwise
   1.115 +     * assume the text as given is complete.
   1.116 +     * @return a match degree value indicating a full match, a partial
   1.117 +     * match, or a mismatch.  If incremental is FALSE then
   1.118 +     * U_PARTIAL_MATCH should never be returned.
   1.119 +     */
   1.120 +    virtual UMatchDegree matches(const Replaceable& text,
   1.121 +                                 int32_t& offset,
   1.122 +                                 int32_t limit,
   1.123 +                                 UBool incremental);
   1.124 +
   1.125 +    /**
   1.126 +     * Implement UnicodeMatcher
   1.127 +     * @param result            Output param to receive the pattern.
   1.128 +     * @param escapeUnprintable if TRUE then escape the unprintable characters.
   1.129 +     * @return                  A reference to 'result'.
   1.130 +     */
   1.131 +    virtual UnicodeString& toPattern(UnicodeString& result,
   1.132 +                                     UBool escapeUnprintable = FALSE) const;
   1.133 +
   1.134 +    /**
   1.135 +     * Implement UnicodeMatcher
   1.136 +     * Returns TRUE if this matcher will match a character c, where c
   1.137 +     * & 0xFF == v, at offset, in the forward direction (with limit >
   1.138 +     * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
   1.139 +     * indexing.
   1.140 +     * @param v    the given value
   1.141 +     * @return     TRUE if this matcher will match a character c, 
   1.142 +     *             where c & 0xFF == v
   1.143 +     */
   1.144 +    virtual UBool matchesIndexValue(uint8_t v) const;
   1.145 +
   1.146 +    /**
   1.147 +     * Implement UnicodeMatcher.  NOTE(review): presumably unions this matcher's match set into toUnionTo; confirm.
   1.148 +     */
   1.149 +    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
   1.150 +
   1.151 +    /**
   1.152 +     * Implement UnicodeFunctor.  NOTE(review): presumably replaces the 'data' context pointer; confirm.
   1.153 +     */
   1.154 +    virtual void setData(const TransliterationRuleData*);
   1.155 +
   1.156 +    /**
   1.157 +     * Replace characters in 'text' from 'start' to 'limit' with the
   1.158 +     * output text of this object.  Update the 'cursor' parameter to
   1.159 +     * give the cursor position and return the length of the
   1.160 +     * replacement text.
   1.161 +     *
   1.162 +     * @param text the text to be matched
   1.163 +     * @param start inclusive start index of text to be replaced
   1.164 +     * @param limit exclusive end index of text to be replaced;
   1.165 +     * must be greater than or equal to start
   1.166 +     * @param cursor output parameter for the cursor position.
   1.167 +     * Not all replacer objects will update this, but in a complete
   1.168 +     * tree of replacer objects, representing the entire output side
   1.169 +     * of a transliteration rule, at least one must update it.
   1.170 +     * @return the number of 16-bit code units in the text replacing
   1.171 +     * the characters at offsets start..(limit-1) in text
   1.172 +     */
   1.173 +    virtual int32_t replace(Replaceable& text,
   1.174 +                            int32_t start,
   1.175 +                            int32_t limit,
   1.176 +                            int32_t& cursor);
   1.177 +
   1.178 +    /**
   1.179 +     * Returns a string representation of this replacer.  If the
   1.180 +     * result of calling this function is passed to the appropriate
   1.181 +     * parser, typically TransliteratorParser, it will produce another
   1.182 +     * replacer that is equal to this one.
   1.183 +     * @param result the string to receive the pattern.  Previous
   1.184 +     * contents will be deleted.
   1.185 +     * @param escapeUnprintable if TRUE then convert unprintable
   1.186 +     * character to their hex escape representations, \\uxxxx or
   1.187 +     * \\Uxxxxxxxx.  Unprintable characters are defined by
   1.188 +     * Utility.isUnprintable().
   1.189 +     * @return a reference to 'result'.
   1.190 +     */
   1.191 +    virtual UnicodeString& toReplacerPattern(UnicodeString& result,
   1.192 +                                             UBool escapeUnprintable) const;
   1.193 +
   1.194 +    /**
   1.195 +     * Remove any match data.  This must be called before performing a
   1.196 +     * set of matches with this segment.
   1.197 +     */
   1.198 +    void resetMatch();
   1.199 +
   1.200 +    /**
   1.201 +     * ICU "poor man's RTTI", returns a UClassID for the actual class.
   1.202 +     */
   1.203 +    virtual UClassID getDynamicClassID() const;
   1.204 +
   1.205 +    /**
   1.206 +     * ICU "poor man's RTTI", returns a UClassID for this class.
   1.207 +     */
   1.208 +    static UClassID U_EXPORT2 getStaticClassID();
   1.209 +
   1.210 +    /**
   1.211 +     * Union the set of all characters that may be output by this object
   1.212 +     * into the given set.
   1.213 +     * @param toUnionTo the set into which to union the output characters
   1.214 +     */
   1.215 +    virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
   1.216 +
   1.217 + private:
   1.218 +
   1.219 +    /**
   1.220 +     * The text to be matched.
   1.221 +     */
   1.222 +    UnicodeString pattern;
   1.223 +
   1.224 +    /**
   1.225 +     * Context object that maps stand-ins to matcher and replacer
   1.226 +     * objects.
   1.227 +     */
   1.228 +    const TransliterationRuleData* data;
   1.229 +
   1.230 +    /**
   1.231 +     * The segment number, 1-based, or 0 if not a segment.
   1.232 +     */
   1.233 +    int32_t segmentNumber;
   1.234 +
   1.235 +    /**
   1.236 +     * Start offset, in the match text, of the <em>rightmost</em>
   1.237 +     * match.  NOTE(review): the class comment says "leftmost"; confirm which is intended.
   1.238 +     */
   1.239 +    int32_t matchStart;
   1.240 +
   1.241 +    /**
   1.242 +     * Limit offset, in the match text, of the <em>rightmost</em>
   1.243 +     * match.
   1.244 +     */
   1.245 +    int32_t matchLimit;
   1.246 +
   1.247 +};
   1.248 +
   1.249 +U_NAMESPACE_END
   1.250 +
   1.251 +#endif /* #if !UCONFIG_NO_TRANSLITERATION */
   1.252 +
   1.253 +#endif

mercurial