intl/icu/source/i18n/strmatch.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

     1 /*
     2  * Copyright (C) 2001-2011, International Business Machines Corporation
     3  * and others. All Rights Reserved.
     4  **********************************************************************
     5  *   Date        Name        Description
     6  *   07/23/01    aliu        Creation.
     7  **********************************************************************
     8  */
     9 #ifndef STRMATCH_H
    10 #define STRMATCH_H
    12 #include "unicode/utypes.h"
    14 #if !UCONFIG_NO_TRANSLITERATION
    16 #include "unicode/unistr.h"
    17 #include "unicode/unifunct.h"
    18 #include "unicode/unimatch.h"
    19 #include "unicode/unirepl.h"
    21 U_NAMESPACE_BEGIN
    23 class TransliterationRuleData;
    25 /**
    26  * An object that matches a fixed input string, implementing the
    27  * UnicodeMatcher API.  This object also implements the
    28  * UnicodeReplacer API, allowing it to emit the matched text as
    29  * output.  Since the match text may contain flexible match elements,
    30  * such as UnicodeSets, the emitted text is not the match pattern, but
    31  * instead a substring of the actual matched text.  Following
    32  * convention, the output text is the leftmost match seen up to this
    33  * point.
    34  *
    35  * A StringMatcher may represent a segment, in which case it has a
    36  * positive segment number.  This affects how the matcher converts
    37  * itself to a pattern but does not otherwise affect its function.
    38  *
    39  * A StringMatcher that is not a segment should not be used as a
    40  * UnicodeReplacer.
    41  */
    42 class StringMatcher : public UnicodeFunctor, public UnicodeMatcher, public UnicodeReplacer {
    44  public:
    46     /**
    47      * Construct a matcher that matches the given pattern string.
    48      * @param string the pattern to be matched, possibly containing
    49      * stand-ins that represent nested UnicodeMatcher objects.
    50      * @param start inclusive start index of text to be replaced
    51      * @param limit exclusive end index of text to be replaced;
    52      * must be greater than or equal to start
    53      * @param segmentNum the segment number from 1..n, or 0 if this is
    54      * not a segment.
    55      * @param data context object mapping stand-ins to
    56      * UnicodeMatcher objects.
    57      */
    58     StringMatcher(const UnicodeString& string,
    59                   int32_t start,
    60                   int32_t limit,
    61                   int32_t segmentNum,
    62                   const TransliterationRuleData& data);
    64     /**
    65      * Copy constructor
    66      * @param o  the object to be copied.
    67      */
    68     StringMatcher(const StringMatcher& o);
    70     /**
    71      * Destructor
    72      */
    73     virtual ~StringMatcher();
    75     /**
    76      * Implement UnicodeFunctor
    77      * @return a copy of the object.
    78      */
    79     virtual UnicodeFunctor* clone() const;
    81     /**
    82      * UnicodeFunctor API.  Cast 'this' to a UnicodeMatcher* pointer
    83      * and return the pointer.
    84      * @return the UnicodeMatcher pointer.
    85      */
    86     virtual UnicodeMatcher* toMatcher() const;
    88     /**
    89      * UnicodeFunctor API.  Cast 'this' to a UnicodeReplacer* pointer
    90      * and return the pointer.
    91      * @return the UnicodeReplacer pointer.
    92      */
    93     virtual UnicodeReplacer* toReplacer() const;
    95     /**
    96      * Implement UnicodeMatcher
    97      * @param text the text to be matched
    98      * @param offset on input, the index into text at which to begin
    99      * matching.  On output, the limit of the matched text.  The
   100      * number of matched characters is the output value of offset
   101      * minus the input value.  Offset should always point to the
   102      * HIGH SURROGATE (leading code unit) of a pair of surrogates,
   103      * both on entry and upon return.
   104      * @param limit the limit index of text to be matched.  Greater
   105      * than offset for a forward direction match, less than offset for
   106      * a backward direction match.  The last character to be
   107      * considered for matching will be text.charAt(limit-1) in the
   108      * forward direction or text.charAt(limit+1) in the backward
   109      * direction.
   110      * @param incremental  if TRUE, then assume further characters may
   111      * be inserted at limit and check for partial matching.  Otherwise
   112      * assume the text as given is complete.
   113      * @return a match degree value indicating a full match, a partial
   114      * match, or a mismatch.  If incremental is FALSE then
   115      * U_PARTIAL_MATCH should never be returned.
   116      */
   117     virtual UMatchDegree matches(const Replaceable& text,
   118                                  int32_t& offset,
   119                                  int32_t limit,
   120                                  UBool incremental);
   122     /**
   123      * Implement UnicodeMatcher
   124      * @param result            Output param to receive the pattern.
   125      * @param escapeUnprintable if TRUE then escape the unprintable characters.
   126      * @return                  A reference to 'result'.
   127      */
   128     virtual UnicodeString& toPattern(UnicodeString& result,
   129                                      UBool escapeUnprintable = FALSE) const;
   131     /**
   132      * Implement UnicodeMatcher
   133      * Returns TRUE if this matcher will match a character c, where c
   134      * & 0xFF == v, at offset, in the forward direction (with limit >
   135      * offset).  This is used by <tt>RuleBasedTransliterator</tt> for
   136      * indexing.
   137      * @param v    the given value
   138      * @return     TRUE if this matcher will match a character c,
   139      *             where c & 0xFF == v
   140      */
   141     virtual UBool matchesIndexValue(uint8_t v) const;
   143     /**
   144      * Implement UnicodeMatcher: add this object's match set to toUnionTo.
   145      */
   146     virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
   148     /**
   149      * Implement UnicodeFunctor; takes the TransliterationRuleData context.
   150      */
   151     virtual void setData(const TransliterationRuleData*);
   153     /**
   154      * Replace characters in 'text' from 'start' to 'limit' with the
   155      * output text of this object.  Update the 'cursor' parameter to
   156      * give the cursor position and return the length of the
   157      * replacement text.
   158      *
   159      * @param text the text in which characters are replaced
   160      * @param start inclusive start index of text to be replaced
   161      * @param limit exclusive end index of text to be replaced;
   162      * must be greater than or equal to start
   163      * @param cursor output parameter for the cursor position.
   164      * Not all replacer objects will update this, but in a complete
   165      * tree of replacer objects, representing the entire output side
   166      * of a transliteration rule, at least one must update it.
   167      * @return the number of 16-bit code units in the text replacing
   168      * the characters at offsets start..(limit-1) in text
   169      */
   170     virtual int32_t replace(Replaceable& text,
   171                             int32_t start,
   172                             int32_t limit,
   173                             int32_t& cursor);
   175     /**
   176      * Returns a string representation of this replacer.  If the
   177      * result of calling this function is passed to the appropriate
   178      * parser, typically TransliteratorParser, it will produce another
   179      * replacer that is equal to this one.
   180      * @param result the string to receive the pattern.  Previous
   181      * contents will be deleted.
   182      * @param escapeUnprintable if TRUE then convert unprintable
   183      * characters to their hex escape representations, \\uxxxx or
   184      * \\Uxxxxxxxx.  Unprintable characters are defined by
   185      * Utility.isUnprintable().
   186      * @return a reference to 'result'.
   187      */
   188     virtual UnicodeString& toReplacerPattern(UnicodeString& result,
   189                                              UBool escapeUnprintable) const;
   191     /**
   192      * Remove any match data.  This must be called before performing a
   193      * set of matches with this segment.
   194      */
   195     void resetMatch();
   197     /**
   198      * ICU "poor man's RTTI", returns a UClassID for the actual class.
   199      */
   200     virtual UClassID getDynamicClassID() const;
   202     /**
   203      * ICU "poor man's RTTI", returns a UClassID for this class.
   204      */
   205     static UClassID U_EXPORT2 getStaticClassID();
   207     /**
   208      * Union the set of all characters that may be output by this object
   209      * into the given set.
   210      * @param toUnionTo the set into which to union the output characters
   211      */
   212     virtual void addReplacementSetTo(UnicodeSet& toUnionTo) const;
   214  private:
   216     /**
   217      * The text to be matched.
   218      */
   219     UnicodeString pattern;
   221     /**
   222      * Context object that maps stand-ins to matcher and replacer
   223      * objects.
   224      */
   225     const TransliterationRuleData* data;
   227     /**
   228      * The segment number, 1-based, or 0 if not a segment.
   229      */
   230     int32_t segmentNumber;
   232     /**
   233      * Start offset, in the match text, of the <em>rightmost</em>
   234      * match.
   235      */
   236     int32_t matchStart;
   238     /**
   239      * Limit offset, in the match text, of the <em>rightmost</em>
   240      * match.
   241      */
   242     int32_t matchLimit;
   244 };
   246 U_NAMESPACE_END
   248 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
   250 #endif

mercurial