intl/icu/source/common/unicode/uniset.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/unicode/uniset.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1691 @@
     1.4 +/*
     1.5 +***************************************************************************
     1.6 +* Copyright (C) 1999-2013, International Business Machines Corporation
     1.7 +* and others. All Rights Reserved.
     1.8 +***************************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   10/20/99    alan        Creation.
    1.11 +***************************************************************************
    1.12 +*/
    1.13 +
    1.14 +#ifndef UNICODESET_H
    1.15 +#define UNICODESET_H
    1.16 +
    1.17 +#include "unicode/unifilt.h"
    1.18 +#include "unicode/unistr.h"
    1.19 +#include "unicode/uset.h"
    1.20 +
    1.21 +/**
    1.22 + * \file
    1.23 + * \brief C++ API: Unicode Set
    1.24 + */
    1.25 +
    1.26 +U_NAMESPACE_BEGIN
    1.27 +
    1.28 +// Forward Declarations.
    1.29 +void UnicodeSet_initInclusion(int32_t src, UErrorCode &status); /**< @internal */
    1.30 +
    1.31 +class BMPSet;
    1.32 +class ParsePosition;
    1.33 +class RBBIRuleScanner;
    1.34 +class SymbolTable;
    1.35 +class UnicodeSetStringSpan;
    1.36 +class UVector;
    1.37 +class RuleCharacterIterator;
    1.38 +
    1.39 +/**
    1.40 + * A mutable set of Unicode characters and multicharacter strings.  Objects of this class
    1.41 + * represent <em>character classes</em> used in regular expressions.
    1.42 + * A character class specifies a subset of Unicode code points.  Legal
    1.43 + * code points are U+0000 to U+10FFFF, inclusive.
    1.44 + *
    1.45 + * <p>The UnicodeSet class is not designed to be subclassed.
    1.46 + *
    1.47 + * <p><code>UnicodeSet</code> supports two APIs. The first is the
    1.48 + * <em>operand</em> API that allows the caller to modify the value of
    1.49 + * a <code>UnicodeSet</code> object. It conforms to Java 2's
    1.50 + * <code>java.util.Set</code> interface, although
    1.51 + * <code>UnicodeSet</code> does not actually implement that
    1.52 + * interface. All methods of <code>Set</code> are supported, with the
    1.53 + * modification that they take a character range or single character
    1.54 + * instead of an <code>Object</code>, and they take a
    1.55 + * <code>UnicodeSet</code> instead of a <code>Collection</code>.  The
    1.56 + * operand API may be thought of in terms of boolean logic: a boolean
    1.57 + * OR is implemented by <code>add</code>, a boolean AND is implemented
    1.58 + * by <code>retain</code>, a boolean XOR is implemented by
    1.59 + * <code>complement</code> taking an argument, and a boolean NOT is
    1.60 + * implemented by <code>complement</code> with no argument.  In terms
    1.61 + * of traditional set theory function names, <code>add</code> is a
    1.62 + * union, <code>retain</code> is an intersection, <code>remove</code>
    1.63 + * is an asymmetric difference, and <code>complement</code> with no
    1.64 + * argument is a set complement with respect to the superset range
    1.65 + * <code>MIN_VALUE-MAX_VALUE</code>.
    1.66 + *
    1.67 + * <p>The second API is the
    1.68 + * <code>applyPattern()</code>/<code>toPattern()</code> API from the
    1.69 + * <code>java.text.Format</code>-derived classes.  Unlike the
    1.70 + * methods that add characters, add categories, and control the logic
    1.71 + * of the set, the method <code>applyPattern()</code> sets all
    1.72 + * attributes of a <code>UnicodeSet</code> at once, based on a
    1.73 + * string pattern.
    1.74 + *
    1.75 + * <p><b>Pattern syntax</b></p>
    1.76 + *
    1.77 + * Patterns are accepted by the constructors and the
    1.78 + * <code>applyPattern()</code> methods and returned by the
    1.79 + * <code>toPattern()</code> method.  These patterns follow a syntax
    1.80 + * similar to that employed by version 8 regular expression character
    1.81 + * classes.  Here are some simple examples:
    1.82 + *
    1.83 + * \htmlonly<blockquote>\endhtmlonly
    1.84 + *   <table>
    1.85 + *     <tr align="top">
    1.86 + *       <td nowrap valign="top" align="left"><code>[]</code></td>
    1.87 + *       <td valign="top">No characters</td>
    1.88 + *     </tr><tr align="top">
    1.89 + *       <td nowrap valign="top" align="left"><code>[a]</code></td>
    1.90 + *       <td valign="top">The character 'a'</td>
    1.91 + *     </tr><tr align="top">
    1.92 + *       <td nowrap valign="top" align="left"><code>[ae]</code></td>
    1.93 + *       <td valign="top">The characters 'a' and 'e'</td>
    1.94 + *     </tr>
    1.95 + *     <tr>
    1.96 + *       <td nowrap valign="top" align="left"><code>[a-e]</code></td>
    1.97 + *       <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
    1.98 + *       point order</td>
    1.99 + *     </tr>
   1.100 + *     <tr>
   1.101 + *       <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
   1.102 + *       <td valign="top">The character U+4E01</td>
   1.103 + *     </tr>
   1.104 + *     <tr>
   1.105 + *       <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
   1.106 + *       <td valign="top">The character 'a' and the multicharacter strings &quot;ab&quot; and
   1.107 + *       &quot;ac&quot;</td>
   1.108 + *     </tr>
   1.109 + *     <tr>
   1.110 + *       <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
   1.111 + *       <td valign="top">All characters in the general category Uppercase Letter</td>
   1.112 + *     </tr>
   1.113 + *   </table>
   1.114 + * \htmlonly</blockquote>\endhtmlonly
   1.115 + *
   1.116 + * Any character may be preceded by a backslash in order to remove any special
   1.117 + * meaning.  White space characters, as defined by UCharacter.isWhitespace(), are
   1.118 + * ignored, unless they are escaped.
   1.119 + *
   1.120 + * <p>Property patterns specify a set of characters having a certain
   1.121 + * property as defined by the Unicode standard.  Both the POSIX-like
   1.122 + * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized.  For a
   1.123 + * complete list of supported property patterns, see the User's Guide
   1.124 + * for UnicodeSet at
   1.125 + * <a href="http://icu-project.org/userguide/unicodeSet.html">
   1.126 + * http://icu-project.org/userguide/unicodeSet.html</a>.
   1.127 + * Actual determination of property data is defined by the underlying
   1.128 + * Unicode database as implemented by UCharacter.
   1.129 + *
   1.130 + * <p>Patterns specify individual characters, ranges of characters, and
   1.131 + * Unicode property sets.  When elements are concatenated, they
   1.132 + * specify their union.  To complement a set, place a '^' immediately
   1.133 + * after the opening '['.  Property patterns are inverted by modifying
   1.134 + * their delimiters; "[:^foo:]" and "\\P{foo}".  In any other location,
   1.135 + * '^' has no special meaning.
   1.136 + *
   1.137 + * <p>Ranges are indicated by placing a '-' between two
   1.138 + * characters, as in "a-z".  This specifies the range of all
   1.139 + * characters from the left to the right, in Unicode order.  If the
   1.140 + * left character is greater than or equal to the
   1.141 + * right character it is a syntax error.  If a '-' occurs as the first
   1.142 + * character after the opening '[' or '[^', or if it occurs as the
   1.143 + * last character before the closing ']', then it is taken as a
   1.144 + * literal.  Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
   1.145 + * set of three characters, 'a', 'b', and '-'.
   1.146 + *
   1.147 + * <p>Sets may be intersected using the '&' operator or the asymmetric
   1.148 + * set difference may be taken using the '-' operator, for example,
   1.149 + * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
   1.150 + * with values less than 4096.  Operators ('&' and '-') have equal
   1.151 + * precedence and bind left-to-right.  Thus
   1.152 + * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
   1.153 + * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]".  This only really matters for
   1.154 + * difference; intersection is commutative.
   1.155 + *
   1.156 + * <table>
   1.157 + * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
   1.158 + * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
   1.159 + * through 'z' and all letters in between, in Unicode order
   1.160 + * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
   1.161 + * all characters but 'a' through 'z',
   1.162 + * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
   1.163 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
   1.164 + * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
   1.165 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
   1.166 + * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
   1.167 + * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
   1.168 + * <td>The asymmetric difference of sets specified by <em>pat1</em> and
   1.169 + * <em>pat2</em>
   1.170 + * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
   1.171 + * <td>The set of characters having the specified
   1.172 + * Unicode property; in
   1.173 + * this case, Unicode uppercase letters
   1.174 + * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
   1.175 + * <td>The set of characters <em>not</em> having the given
   1.176 + * Unicode property
   1.177 + * </table>
   1.178 + *
   1.179 + * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
   1.180 + *
   1.181 + * <p><b>Formal syntax</b></p>
   1.182 + *
   1.183 + * \htmlonly<blockquote>\endhtmlonly
   1.184 + *   <table>
   1.185 + *     <tr align="top">
   1.186 + *       <td nowrap valign="top" align="right"><code>pattern :=&nbsp; </code></td>
   1.187 + *       <td valign="top"><code>('[' '^'? item* ']') |
   1.188 + *       property</code></td>
   1.189 + *     </tr>
   1.190 + *     <tr align="top">
   1.191 + *       <td nowrap valign="top" align="right"><code>item :=&nbsp; </code></td>
   1.192 + *       <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
   1.193 + *       </code></td>
   1.194 + *     </tr>
   1.195 + *     <tr align="top">
   1.196 + *       <td nowrap valign="top" align="right"><code>pattern-expr :=&nbsp; </code></td>
   1.197 + *       <td valign="top"><code>pattern | pattern-expr pattern |
   1.198 + *       pattern-expr op pattern<br>
   1.199 + *       </code></td>
   1.200 + *     </tr>
   1.201 + *     <tr align="top">
   1.202 + *       <td nowrap valign="top" align="right"><code>op :=&nbsp; </code></td>
   1.203 + *       <td valign="top"><code>'&amp;' | '-'<br>
   1.204 + *       </code></td>
   1.205 + *     </tr>
   1.206 + *     <tr align="top">
   1.207 + *       <td nowrap valign="top" align="right"><code>special :=&nbsp; </code></td>
   1.208 + *       <td valign="top"><code>'[' | ']' | '-'<br>
   1.209 + *       </code></td>
   1.210 + *     </tr>
   1.211 + *     <tr align="top">
   1.212 + *       <td nowrap valign="top" align="right"><code>char :=&nbsp; </code></td>
   1.213 + *       <td valign="top"><em>any character that is not</em><code> special<br>
   1.214 + *       | ('\' </code><em>any character</em><code>)<br>
   1.215 + *       | ('\\u' hex hex hex hex)<br>
   1.216 + *       </code></td>
   1.217 + *     </tr>
   1.218 + *     <tr align="top">
   1.219 + *       <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
   1.220 + *       <td valign="top"><em>any character for which
   1.221 + *       </em><code>Character.digit(c, 16)</code><em>
   1.222 + *       returns a non-negative result</em></td>
   1.223 + *     </tr>
   1.224 + *     <tr>
   1.225 + *       <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
   1.226 + *       <td valign="top"><em>a Unicode property set pattern</em></td>
   1.227 + *     </tr>
   1.228 + *   </table>
   1.229 + *   <br>
   1.230 + *   <table border="1">
   1.231 + *     <tr>
   1.232 + *       <td>Legend: <table>
   1.233 + *         <tr>
   1.234 + *           <td nowrap valign="top"><code>a := b</code></td>
   1.235 + *           <td width="20" valign="top">&nbsp; </td>
   1.236 + *           <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
   1.237 + *         </tr>
   1.238 + *         <tr>
   1.239 + *           <td nowrap valign="top"><code>a?</code></td>
   1.240 + *           <td valign="top"></td>
   1.241 + *           <td valign="top">zero or one instance of <code>a</code><br>
   1.242 + *           </td>
   1.243 + *         </tr>
   1.244 + *         <tr>
   1.245 + *           <td nowrap valign="top"><code>a*</code></td>
   1.246 + *           <td valign="top"></td>
   1.247 + *           <td valign="top">zero or more instances of <code>a</code><br>
   1.248 + *           </td>
   1.249 + *         </tr>
   1.250 + *         <tr>
   1.251 + *           <td nowrap valign="top"><code>a | b</code></td>
   1.252 + *           <td valign="top"></td>
   1.253 + *           <td valign="top">either <code>a</code> or <code>b</code><br>
   1.254 + *           </td>
   1.255 + *         </tr>
   1.256 + *         <tr>
   1.257 + *           <td nowrap valign="top"><code>'a'</code></td>
   1.258 + *           <td valign="top"></td>
   1.259 + *           <td valign="top">the literal string between the quotes </td>
   1.260 + *         </tr>
   1.261 + *       </table>
   1.262 + *       </td>
   1.263 + *     </tr>
   1.264 + *   </table>
   1.265 + * \htmlonly</blockquote>\endhtmlonly
   1.266 + * 
   1.267 + * <p>Note:
   1.268 + *  - Most UnicodeSet methods do not take a UErrorCode parameter because
   1.269 + *   there are usually very few opportunities for failure other than a shortage
   1.270 + *   of memory, error codes in low-level C++ string methods would be inconvenient,
   1.271 + *   and the error code as the last parameter (ICU convention) would prevent
   1.272 + *   the use of default parameter values.
   1.273 + *   Instead, such methods set the UnicodeSet into a "bogus" state
   1.274 + *   (see isBogus()) if an error occurs.
   1.275 + *
   1.276 + * @author Alan Liu
   1.277 + * @stable ICU 2.0
   1.278 + */
   1.279 +class U_COMMON_API UnicodeSet : public UnicodeFilter {
   1.280 +
   1.281 +    int32_t len; // length of list used; 0 <= len <= capacity
   1.282 +    int32_t capacity; // capacity of list
   1.283 +    UChar32* list; // MUST be terminated with HIGH
   1.284 +    BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
   1.285 +    UChar32* buffer; // internal buffer, may be NULL
   1.286 +    int32_t bufferCapacity; // capacity of buffer
   1.287 +    int32_t patLen;
   1.288 +
   1.289 +    /**
   1.290 +     * The pattern representation of this set.  This may not be the
   1.291 +     * most economical pattern.  It is the pattern supplied to
   1.292 +     * applyPattern(), with variables substituted and whitespace
   1.293 +     * removed.  For sets constructed without applyPattern(), or
   1.294 +     * modified using the non-pattern API, this string will be empty,
   1.295 +     * indicating that toPattern() must generate a pattern
   1.296 +     * representation from the inversion list.
   1.297 +     */
   1.298 +    UChar *pat;
   1.299 +    UVector* strings; // maintained in sorted order
   1.300 +    UnicodeSetStringSpan *stringSpan;
   1.301 +
   1.302 +private:
   1.303 +    enum { // constants
   1.304 +        kIsBogus = 1       // This set is bogus (i.e. not valid)
   1.305 +    };
   1.306 +    uint8_t fFlags;         // Bit flag (see constants above)
   1.307 +public:
   1.308 +    /**
   1.309 +     * Determine if this object contains a valid set.
   1.310 +     * A bogus set has no value. It is different from an empty set.
   1.311 +     * It can be used to indicate that no set value is available.
   1.312 +     *
   1.313 +     * @return TRUE if the set is bogus (invalid), FALSE otherwise
   1.314 +     * @see setToBogus()
   1.315 +     * @stable ICU 4.0
   1.316 +     */
   1.317 +    inline UBool isBogus(void) const;
   1.318 +    
   1.319 +    /**
   1.320 +     * Make this UnicodeSet object invalid.
   1.321 +     * The set will test TRUE with isBogus().
   1.322 +     *
   1.323 +     * A bogus set has no value. It is different from an empty set.
   1.324 +     * It can be used to indicate that no set value is available.
   1.325 +     *
   1.326 +     * This utility function is used throughout the UnicodeSet
   1.327 +     * implementation to indicate that a UnicodeSet operation failed,
   1.328 +     * and may be used in other functions,
   1.329 +     * especially but not exclusively when such functions do not
   1.330 +     * take a UErrorCode for simplicity.
   1.331 +     *
   1.332 +     * @see isBogus()
   1.333 +     * @stable ICU 4.0
   1.334 +     */
   1.335 +    void setToBogus();
   1.336 +
   1.337 +public:
   1.338 +
   1.339 +    enum {
   1.340 +        /**
   1.341 +         * Minimum value that can be stored in a UnicodeSet.
   1.342 +         * @stable ICU 2.4
   1.343 +         */
   1.344 +        MIN_VALUE = 0,
   1.345 +
   1.346 +        /**
   1.347 +         * Maximum value that can be stored in a UnicodeSet.
   1.348 +         * @stable ICU 2.4
   1.349 +         */
   1.350 +        MAX_VALUE = 0x10ffff
   1.351 +    };
   1.352 +
   1.353 +    //----------------------------------------------------------------
   1.354 +    // Constructors &c
   1.355 +    //----------------------------------------------------------------
   1.356 +
   1.357 +public:
   1.358 +
   1.359 +    /**
   1.360 +     * Constructs an empty set.
   1.361 +     * @stable ICU 2.0
   1.362 +     */
   1.363 +    UnicodeSet();
   1.364 +
   1.365 +    /**
   1.366 +     * Constructs a set containing the given range. If <code>start >
   1.367 +     * end</code> then an empty set is created.
   1.368 +     *
   1.369 +     * @param start first character, inclusive, of range
   1.370 +     * @param end last character, inclusive, of range
   1.371 +     * @stable ICU 2.4
   1.372 +     */
   1.373 +    UnicodeSet(UChar32 start, UChar32 end);
   1.374 +
   1.375 +    /**
   1.376 +     * Constructs a set from the given pattern.  See the class
   1.377 +     * description for the syntax of the pattern language.
   1.378 +     * @param pattern a string specifying what characters are in the set
   1.379 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.380 +     * contains a syntax error.
   1.381 +     * @stable ICU 2.0
   1.382 +     */
   1.383 +    UnicodeSet(const UnicodeString& pattern,
   1.384 +               UErrorCode& status);
   1.385 +
   1.386 +#ifndef U_HIDE_INTERNAL_API
   1.387 +    /**
   1.388 +     * Constructs a set from the given pattern.  See the class
   1.389 +     * description for the syntax of the pattern language.
   1.390 +     * @param pattern a string specifying what characters are in the set
   1.391 +     * @param options bitmask for options to apply to the pattern.
   1.392 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.393 +     * @param symbols a symbol table mapping variable names to values
   1.394 +     * and stand-in characters to UnicodeSets; may be NULL
   1.395 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.396 +     * contains a syntax error.
   1.397 +     * @internal
   1.398 +     */
   1.399 +    UnicodeSet(const UnicodeString& pattern,
   1.400 +               uint32_t options,
   1.401 +               const SymbolTable* symbols,
   1.402 +               UErrorCode& status);
   1.403 +#endif  /* U_HIDE_INTERNAL_API */
   1.404 +
   1.405 +    /**
   1.406 +     * Constructs a set from the given pattern.  See the class description
   1.407 +     * for the syntax of the pattern language.
   1.408 +     * @param pattern a string specifying what characters are in the set
   1.409 +     * @param pos on input, the position in pattern at which to start parsing.
   1.410 +     * On output, the position after the last character parsed.
   1.411 +     * @param options bitmask for options to apply to the pattern.
   1.412 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.413 +     * @param symbols a symbol table mapping variable names to values
   1.414 +     * and stand-in characters to UnicodeSets; may be NULL
   1.415 +     * @param status input-output error code
   1.416 +     * @stable ICU 2.8
   1.417 +     */
   1.418 +    UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
   1.419 +               uint32_t options,
   1.420 +               const SymbolTable* symbols,
   1.421 +               UErrorCode& status);
   1.422 +
   1.423 +    /**
   1.424 +     * Constructs a set that is identical to the given UnicodeSet.
   1.425 +     * @stable ICU 2.0
   1.426 +     */
   1.427 +    UnicodeSet(const UnicodeSet& o);
   1.428 +
   1.429 +    /**
   1.430 +     * Destructs the set.
   1.431 +     * @stable ICU 2.0
   1.432 +     */
   1.433 +    virtual ~UnicodeSet();
   1.434 +
   1.435 +    /**
   1.436 +     * Assigns this object to be a copy of another.
   1.437 +     * A frozen set will not be modified.
   1.438 +     * @stable ICU 2.0
   1.439 +     */
   1.440 +    UnicodeSet& operator=(const UnicodeSet& o);
   1.441 +
   1.442 +    /**
   1.443 +     * Compares the specified object with this set for equality.  Returns
   1.444 +     * <tt>true</tt> if the two sets
   1.445 +     * have the same size, and every member of the specified set is
   1.446 +     * contained in this set (or equivalently, every member of this set is
   1.447 +     * contained in the specified set).
   1.448 +     *
   1.449 +     * @param o set to be compared for equality with this set.
   1.450 +     * @return <tt>true</tt> if the specified set is equal to this set.
   1.451 +     * @stable ICU 2.0
   1.452 +     */
   1.453 +    virtual UBool operator==(const UnicodeSet& o) const;
   1.454 +
   1.455 +    /**
   1.456 +     * Compares the specified object with this set for inequality.  Returns
   1.457 +     * <tt>true</tt> if the specified set is not equal to this set.
   1.458 +     * @stable ICU 2.0
   1.459 +     */
   1.460 +    UBool operator!=(const UnicodeSet& o) const;
   1.461 +
   1.462 +    /**
   1.463 +     * Returns a copy of this object.  All UnicodeFunctor objects have
   1.464 +     * to support cloning in order to allow classes using
   1.465 +     * UnicodeFunctors, such as Transliterator, to implement cloning.
   1.466 +     * If this set is frozen, then the clone will be frozen as well.
   1.467 +     * Use cloneAsThawed() for a mutable clone of a frozen set.
   1.468 +     * @see cloneAsThawed
   1.469 +     * @stable ICU 2.0
   1.470 +     */
   1.471 +    virtual UnicodeFunctor* clone() const;
   1.472 +
   1.473 +    /**
   1.474 +     * Returns the hash code value for this set.
   1.475 +     *
   1.476 +     * @return the hash code value for this set.
   1.477 +     * @see Object#hashCode()
   1.478 +     * @stable ICU 2.0
   1.479 +     */
   1.480 +    virtual int32_t hashCode(void) const;
   1.481 +
   1.482 +    /**
   1.483 +     * Get a UnicodeSet pointer from a USet
   1.484 +     *
   1.485 +     * @param uset a USet (the ICU plain C type for UnicodeSet)
   1.486 +     * @return the corresponding UnicodeSet pointer.
   1.487 +     *
   1.488 +     * @stable ICU 4.2
   1.489 +     */
   1.490 +    inline static UnicodeSet *fromUSet(USet *uset);
   1.491 +
   1.492 +    /**
   1.493 +     * Get a UnicodeSet pointer from a const USet
   1.494 +     *
   1.495 +     * @param uset a const USet (the ICU plain C type for UnicodeSet)
   1.496 +     * @return the corresponding UnicodeSet pointer.
   1.497 +     *
   1.498 +     * @stable ICU 4.2
   1.499 +     */
   1.500 +    inline static const UnicodeSet *fromUSet(const USet *uset);
   1.501 +    
   1.502 +    /**
   1.503 +     * Produce a USet * pointer for this UnicodeSet.
   1.504 +     * USet is the plain C type for UnicodeSet
   1.505 +     *
   1.506 +     * @return a USet pointer for this UnicodeSet
   1.507 +     * @stable ICU 4.2
   1.508 +     */
   1.509 +    inline USet *toUSet();
   1.510 +
   1.511 +
   1.512 +    /**
   1.513 +     * Produce a const USet * pointer for this UnicodeSet.
   1.514 +     * USet is the plain C type for UnicodeSet
   1.515 +     *
   1.516 +     * @return a const USet pointer for this UnicodeSet
   1.517 +     * @stable ICU 4.2
   1.518 +     */
   1.519 +    inline const USet * toUSet() const;
   1.520 +
   1.521 +
   1.522 +    //----------------------------------------------------------------
   1.523 +    // Freezable API
   1.524 +    //----------------------------------------------------------------
   1.525 +
   1.526 +    /**
   1.527 +     * Determines whether the set has been frozen (made immutable) or not.
   1.528 +     * See the ICU4J Freezable interface for details.
   1.529 +     * @return TRUE/FALSE for whether the set has been frozen
   1.530 +     * @see freeze
   1.531 +     * @see cloneAsThawed
   1.532 +     * @stable ICU 3.8
   1.533 +     */
   1.534 +    inline UBool isFrozen() const;
   1.535 +
   1.536 +    /**
   1.537 +     * Freeze the set (make it immutable).
   1.538 +     * Once frozen, it cannot be unfrozen and is therefore thread-safe
   1.539 +     * until it is deleted.
   1.540 +     * See the ICU4J Freezable interface for details.
   1.541 +     * Freezing the set may also make some operations faster, for example
   1.542 +     * contains() and span().
   1.543 +     * A frozen set will not be modified. (It remains frozen.)
   1.544 +     * @return this set.
   1.545 +     * @see isFrozen
   1.546 +     * @see cloneAsThawed
   1.547 +     * @stable ICU 3.8
   1.548 +     */
   1.549 +    UnicodeFunctor *freeze();
   1.550 +
   1.551 +    /**
   1.552 +     * Clone the set and make the clone mutable.
   1.553 +     * See the ICU4J Freezable interface for details.
   1.554 +     * @return the mutable clone
   1.555 +     * @see freeze
   1.556 +     * @see isFrozen
   1.557 +     * @stable ICU 3.8
   1.558 +     */
   1.559 +    UnicodeFunctor *cloneAsThawed() const;
   1.560 +
   1.561 +    //----------------------------------------------------------------
   1.562 +    // Public API
   1.563 +    //----------------------------------------------------------------
   1.564 +
   1.565 +    /**
   1.566 +     * Make this object represent the range <code>start - end</code>.
   1.567 +     * If <code>start > end</code> then this object is set to
   1.568 +     * an empty range.
   1.569 +     * A frozen set will not be modified.
   1.570 +     *
   1.571 +     * @param start first character in the set, inclusive
   1.572 +     * @param end last character in the set, inclusive
   1.573 +     * @stable ICU 2.4
   1.574 +     */
   1.575 +    UnicodeSet& set(UChar32 start, UChar32 end);
   1.576 +
   1.577 +    /**
   1.578 +     * Return true if the given position, in the given pattern, appears
   1.579 +     * to be the start of a UnicodeSet pattern.
   1.580 +     * @stable ICU 2.4
   1.581 +     */
   1.582 +    static UBool resemblesPattern(const UnicodeString& pattern,
   1.583 +                                  int32_t pos);
   1.584 +
   1.585 +    /**
   1.586 +     * Modifies this set to represent the set specified by the given
   1.587 +     * pattern, ignoring Unicode Pattern_White_Space characters.
   1.588 +     * See the class description for the syntax of the pattern language.
   1.589 +     * A frozen set will not be modified.
   1.590 +     * @param pattern a string specifying what characters are in the set
   1.591 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.592 +     * contains a syntax error.
   1.593 +     * <em> Empties the set passed before applying the pattern.</em>
   1.594 +     * @return a reference to this
   1.595 +     * @stable ICU 2.0
   1.596 +     */
   1.597 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.598 +                             UErrorCode& status);
   1.599 +
   1.600 +#ifndef U_HIDE_INTERNAL_API
   1.601 +    /**
   1.602 +     * Modifies this set to represent the set specified by the given
   1.603 +     * pattern, optionally ignoring Unicode Pattern_White_Space characters.
   1.604 +     * See the class description for the syntax of the pattern language.
   1.605 +     * A frozen set will not be modified.
   1.606 +     * @param pattern a string specifying what characters are in the set
   1.607 +     * @param options bitmask for options to apply to the pattern.
   1.608 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.609 +     * @param symbols a symbol table mapping variable names to
   1.610 +     * values and stand-ins to UnicodeSets; may be NULL
   1.611 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.612 +     * contains a syntax error.
   1.613 +     *<em> Empties the set passed before applying the pattern.</em>
   1.614 +     * @return a reference to this
   1.615 +     * @internal
   1.616 +     */
   1.617 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.618 +                             uint32_t options,
   1.619 +                             const SymbolTable* symbols,
   1.620 +                             UErrorCode& status);
   1.621 +#endif  /* U_HIDE_INTERNAL_API */
   1.622 +
   1.623 +    /**
   1.624 +     * Parses the given pattern, starting at the given position.  The
   1.625 +     * character at pattern.charAt(pos.getIndex()) must be '[', or the
   1.626 +     * parse fails.  Parsing continues until the corresponding closing
   1.627 +     * ']'.  If a syntax error is encountered between the opening and
   1.628 +     * closing brace, the parse fails.  Upon return from a successful
   1.629 +     * parse, the ParsePosition is updated to point to the character
   1.630 +     * following the closing ']', and a reference to this set
   1.631 +     * is returned.  This method calls
   1.632 +     * itself recursively to parse embedded subpatterns.
   1.633 +     *<em> Empties the set passed before applying the pattern.</em>
   1.634 +     * A frozen set will not be modified.
   1.635 +     *
   1.636 +     * @param pattern the string containing the pattern to be parsed.
   1.637 +     * The portion of the string from pos.getIndex(), which must be a
   1.638 +     * '[', to the corresponding closing ']', is parsed.
   1.639 +     * @param pos upon entry, the position at which to begin parsing.
   1.640 +     * The character at pattern.charAt(pos.getIndex()) must be a '['.
   1.641 +     * Upon return from a successful parse, pos.getIndex() is either
   1.642 +     * the character after the closing ']' of the parsed pattern, or
   1.643 +     * pattern.length() if the closing ']' is the last character of
   1.644 +     * the pattern string.
   1.645 +     * @param options bitmask for options to apply to the pattern.
   1.646 +     * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
   1.647 +     * @param symbols a symbol table mapping variable names to
   1.648 +     * values and stand-ins to UnicodeSets; may be NULL
   1.649 +     * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
   1.650 +     * contains a syntax error.
   1.651 +     * @return a reference to this
   1.652 +     * @stable ICU 2.8
   1.653 +     */
   1.654 +    UnicodeSet& applyPattern(const UnicodeString& pattern,
   1.655 +                             ParsePosition& pos,
   1.656 +                             uint32_t options,
   1.657 +                             const SymbolTable* symbols,
   1.658 +                             UErrorCode& status);
   1.659 +
   1.660 +    /**
   1.661 +     * Returns a string representation of this set.  If the result of
   1.662 +     * calling this function is passed to a UnicodeSet constructor, it
   1.663 +     * will produce another set that is equal to this one.
   1.664 +     * A frozen set will not be modified.
   1.665 +     * @param result the string to receive the rules.  Previous
   1.666 +     * contents will be deleted.
   1.667 +     * @param escapeUnprintable if TRUE then convert unprintable
   1.668 +     * characters to their hex escape representations, \\uxxxx or
   1.669 +     * \\Uxxxxxxxx.  Unprintable characters are those other than
   1.670 +     * U+000A, U+0020..U+007E.
   1.671 +     * @stable ICU 2.0
   1.672 +     */
   1.673 +    virtual UnicodeString& toPattern(UnicodeString& result,
   1.674 +                             UBool escapeUnprintable = FALSE) const;
   1.675 +
   1.676 +    /**
   1.677 +     * Modifies this set to contain those code points which have the given value
   1.678 +     * for the given binary or enumerated property, as returned by
   1.679 +     * u_getIntPropertyValue.  Prior contents of this set are lost.
   1.680 +     * A frozen set will not be modified.
   1.681 +     *
   1.682 +     * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
   1.683 +     * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
   1.684 +     * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
   1.685 +     *
   1.686 +     * @param value a value in the range u_getIntPropertyMinValue(prop)..
   1.687 +     * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
   1.688 +     * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
   1.689 +     * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
   1.690 +     * categories such as [:L:] to be represented.
   1.691 +     *
   1.692 +     * @param ec error code input/output parameter
   1.693 +     *
   1.694 +     * @return a reference to this set
   1.695 +     *
   1.696 +     * @stable ICU 2.4
   1.697 +     */
   1.698 +    UnicodeSet& applyIntPropertyValue(UProperty prop,
   1.699 +                                      int32_t value,
   1.700 +                                      UErrorCode& ec);
   1.701 +
   1.702 +    /**
   1.703 +     * Modifies this set to contain those code points which have the
   1.704 +     * given value for the given property.  Prior contents of this
   1.705 +     * set are lost.
   1.706 +     * A frozen set will not be modified.
   1.707 +     *
   1.708 +     * @param prop a property alias, either short or long.  The name is matched
   1.709 +     * loosely.  See PropertyAliases.txt for names and a description of loose
   1.710 +     * matching.  If the value string is empty, then this string is interpreted
   1.711 +     * as either a General_Category value alias, a Script value alias, a binary
   1.712 +     * property alias, or a special ID.  Special IDs are matched loosely and
   1.713 +     * correspond to the following sets:
   1.714 +     *
   1.715 +     * "ANY" = [\\u0000-\\U0010FFFF],
   1.716 +     * "ASCII" = [\\u0000-\\u007F],
   1.717 +     * "Assigned" = [:^Cn:].
   1.718 +     *
   1.719 +     * @param value a value alias, either short or long.  The name is matched
   1.720 +     * loosely.  See PropertyValueAliases.txt for names and a description of
   1.721 +     * loose matching.  In addition to aliases listed, numeric values and
   1.722 +     * canonical combining classes may be expressed numerically, e.g., ("nv",
   1.723 +     * "0.5") or ("ccc", "220").  The value string may also be empty.
   1.724 +     *
   1.725 +     * @param ec error code input/output parameter
   1.726 +     *
   1.727 +     * @return a reference to this set
   1.728 +     *
   1.729 +     * @stable ICU 2.4
   1.730 +     */
   1.731 +    UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
   1.732 +                                   const UnicodeString& value,
   1.733 +                                   UErrorCode& ec);
   1.734 +
   1.735 +    /**
   1.736 +     * Returns the number of elements in this set (its cardinality).
   1.737 +     * Note that the elements of a set may include both individual
   1.738 +     * codepoints and strings.
   1.739 +     *
   1.740 +     * @return the number of elements in this set (its cardinality).
   1.741 +     * @stable ICU 2.0
   1.742 +     */
   1.743 +    virtual int32_t size(void) const;
   1.744 +
   1.745 +    /**
   1.746 +     * Returns <tt>true</tt> if this set contains no elements.
   1.747 +     *
   1.748 +     * @return <tt>true</tt> if this set contains no elements.
   1.749 +     * @stable ICU 2.0
   1.750 +     */
   1.751 +    virtual UBool isEmpty(void) const;
   1.752 +
   1.753 +    /**
   1.754 +     * Returns true if this set contains the given character.
   1.755 +     * This function works faster with a frozen set.
   1.756 +     * @param c character to be checked for containment
   1.757 +     * @return true if the test condition is met
   1.758 +     * @stable ICU 2.0
   1.759 +     */
   1.760 +    virtual UBool contains(UChar32 c) const;
   1.761 +
   1.762 +    /**
   1.763 +     * Returns true if this set contains every character
   1.764 +     * of the given range.
   1.765 +     * @param start first character, inclusive, of the range
   1.766 +     * @param end last character, inclusive, of the range
   1.767 +     * @return true if the test condition is met
   1.768 +     * @stable ICU 2.0
   1.769 +     */
   1.770 +    virtual UBool contains(UChar32 start, UChar32 end) const;
   1.771 +
   1.772 +    /**
   1.773 +     * Returns <tt>true</tt> if this set contains the given
   1.774 +     * multicharacter string.
   1.775 +     * @param s string to be checked for containment
   1.776 +     * @return <tt>true</tt> if this set contains the specified string
   1.777 +     * @stable ICU 2.4
   1.778 +     */
   1.779 +    UBool contains(const UnicodeString& s) const;
   1.780 +
   1.781 +    /**
   1.782 +     * Returns true if this set contains all the characters and strings
   1.783 +     * of the given set.
   1.784 +     * @param c set to be checked for containment
   1.785 +     * @return true if the test condition is met
   1.786 +     * @stable ICU 2.4
   1.787 +     */
   1.788 +    virtual UBool containsAll(const UnicodeSet& c) const;
   1.789 +
   1.790 +    /**
   1.791 +     * Returns true if this set contains all the characters
   1.792 +     * of the given string.
   1.793 +     * @param s string containing characters to be checked for containment
   1.794 +     * @return true if the test condition is met
   1.795 +     * @stable ICU 2.4
   1.796 +     */
   1.797 +    UBool containsAll(const UnicodeString& s) const;
   1.798 +
   1.799 +    /**
   1.800 +     * Returns true if this set contains none of the characters
   1.801 +     * of the given range.
   1.802 +     * @param start first character, inclusive, of the range
   1.803 +     * @param end last character, inclusive, of the range
   1.804 +     * @return true if the test condition is met
   1.805 +     * @stable ICU 2.4
   1.806 +     */
   1.807 +    UBool containsNone(UChar32 start, UChar32 end) const;
   1.808 +
   1.809 +    /**
   1.810 +     * Returns true if this set contains none of the characters and strings
   1.811 +     * of the given set.
   1.812 +     * @param c set to be checked for containment
   1.813 +     * @return true if the test condition is met
   1.814 +     * @stable ICU 2.4
   1.815 +     */
   1.816 +    UBool containsNone(const UnicodeSet& c) const;
   1.817 +
   1.818 +    /**
   1.819 +     * Returns true if this set contains none of the characters
   1.820 +     * of the given string.
   1.821 +     * @param s string containing characters to be checked for containment
   1.822 +     * @return true if the test condition is met
   1.823 +     * @stable ICU 2.4
   1.824 +     */
   1.825 +    UBool containsNone(const UnicodeString& s) const;
   1.826 +
   1.827 +    /**
   1.828 +     * Returns true if this set contains one or more of the characters
   1.829 +     * in the given range.
   1.830 +     * @param start first character, inclusive, of the range
   1.831 +     * @param end last character, inclusive, of the range
   1.832 +     * @return true if the condition is met
   1.833 +     * @stable ICU 2.4
   1.834 +     */
   1.835 +    inline UBool containsSome(UChar32 start, UChar32 end) const;
   1.836 +
   1.837 +    /**
   1.838 +     * Returns true if this set contains one or more of the characters
   1.839 +     * and strings of the given set.
   1.840 +     * @param s The set to be checked for containment
   1.841 +     * @return true if the condition is met
   1.842 +     * @stable ICU 2.4
   1.843 +     */
   1.844 +    inline UBool containsSome(const UnicodeSet& s) const;
   1.845 +
   1.846 +    /**
   1.847 +     * Returns true if this set contains one or more of the characters
   1.848 +     * of the given string.
   1.849 +     * @param s string containing characters to be checked for containment
   1.850 +     * @return true if the condition is met
   1.851 +     * @stable ICU 2.4
   1.852 +     */
   1.853 +    inline UBool containsSome(const UnicodeString& s) const;
   1.854 +
   1.855 +    /**
   1.856 +     * Returns the length of the initial substring of the input string which
   1.857 +     * consists only of characters and strings that are contained in this set
   1.858 +     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
   1.859 +     * or only of characters and strings that are not contained
   1.860 +     * in this set (USET_SPAN_NOT_CONTAINED).
   1.861 +     * See USetSpanCondition for details.
   1.862 +     * Similar to the strspn() C library function.
   1.863 +     * Unpaired surrogates are treated according to contains() of their surrogate code points.
   1.864 +     * This function works faster with a frozen set and with a non-negative string length argument.
   1.865 +     * @param s start of the string
   1.866 +     * @param length of the string; can be -1 for NUL-terminated
   1.867 +     * @param spanCondition specifies the containment condition
   1.868 +     * @return the length of the initial substring according to the spanCondition;
   1.869 +     *         0 if the start of the string does not fit the spanCondition
   1.870 +     * @stable ICU 3.8
   1.871 +     * @see USetSpanCondition
   1.872 +     */
   1.873 +    int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
   1.874 +
   1.875 +    /**
   1.876 +     * Returns the end of the substring of the input string according to the USetSpanCondition.
   1.877 +     * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
   1.878 +     * after pinning start to 0<=start<=s.length().
   1.879 +     * @param s the string
   1.880 +     * @param start the start index in the string for the span operation
   1.881 +     * @param spanCondition specifies the containment condition
   1.882 +     * @return the exclusive end of the substring according to the spanCondition;
   1.883 +     *         the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
   1.884 +     * @stable ICU 4.4
   1.885 +     * @see USetSpanCondition
   1.886 +     */
   1.887 +    inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
   1.888 +
   1.889 +    /**
   1.890 +     * Returns the start of the trailing substring of the input string which
   1.891 +     * consists only of characters and strings that are contained in this set
   1.892 +     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
   1.893 +     * or only of characters and strings that are not contained
   1.894 +     * in this set (USET_SPAN_NOT_CONTAINED).
   1.895 +     * See USetSpanCondition for details.
   1.896 +     * Unpaired surrogates are treated according to contains() of their surrogate code points.
   1.897 +     * This function works faster with a frozen set and with a non-negative string length argument.
   1.898 +     * @param s start of the string
   1.899 +     * @param length of the string; can be -1 for NUL-terminated
   1.900 +     * @param spanCondition specifies the containment condition
   1.901 +     * @return the start of the trailing substring according to the spanCondition;
   1.902 +     *         the string length if the end of the string does not fit the spanCondition
   1.903 +     * @stable ICU 3.8
   1.904 +     * @see USetSpanCondition
   1.905 +     */
   1.906 +    int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
   1.907 +
   1.908 +    /**
   1.909 +     * Returns the start of the substring of the input string according to the USetSpanCondition.
   1.910 +     * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
   1.911 +     * after pinning limit to 0<=limit<=s.length().
   1.912 +     * @param s the string
   1.913 +     * @param limit the exclusive-end index in the string for the span operation
   1.914 +     *              (use s.length() or INT32_MAX for spanning back from the end of the string)
   1.915 +     * @param spanCondition specifies the containment condition
   1.916 +     * @return the start of the substring according to the spanCondition;
   1.917 +     *         the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
   1.918 +     * @stable ICU 4.4
   1.919 +     * @see USetSpanCondition
   1.920 +     */
   1.921 +    inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
   1.922 +
   1.923 +    /**
   1.924 +     * Returns the length of the initial substring of the input string which
   1.925 +     * consists only of characters and strings that are contained in this set
   1.926 +     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
   1.927 +     * or only of characters and strings that are not contained
   1.928 +     * in this set (USET_SPAN_NOT_CONTAINED).
   1.929 +     * See USetSpanCondition for details.
   1.930 +     * Similar to the strspn() C library function.
   1.931 +     * Malformed byte sequences are treated according to contains(0xfffd).
   1.932 +     * This function works faster with a frozen set and with a non-negative string length argument.
   1.933 +     * @param s start of the string (UTF-8)
   1.934 +     * @param length of the string; can be -1 for NUL-terminated
   1.935 +     * @param spanCondition specifies the containment condition
   1.936 +     * @return the length of the initial substring according to the spanCondition;
   1.937 +     *         0 if the start of the string does not fit the spanCondition
   1.938 +     * @stable ICU 3.8
   1.939 +     * @see USetSpanCondition
   1.940 +     */
   1.941 +    int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
   1.942 +
   1.943 +    /**
   1.944 +     * Returns the start of the trailing substring of the input string which
   1.945 +     * consists only of characters and strings that are contained in this set
   1.946 +     * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
   1.947 +     * or only of characters and strings that are not contained
   1.948 +     * in this set (USET_SPAN_NOT_CONTAINED).
   1.949 +     * See USetSpanCondition for details.
   1.950 +     * Malformed byte sequences are treated according to contains(0xfffd).
   1.951 +     * This function works faster with a frozen set and with a non-negative string length argument.
   1.952 +     * @param s start of the string (UTF-8)
   1.953 +     * @param length of the string; can be -1 for NUL-terminated
   1.954 +     * @param spanCondition specifies the containment condition
   1.955 +     * @return the start of the trailing substring according to the spanCondition;
   1.956 +     *         the string length if the end of the string does not fit the spanCondition
   1.957 +     * @stable ICU 3.8
   1.958 +     * @see USetSpanCondition
   1.959 +     */
   1.960 +    int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
   1.961 +
   1.962 +    /**
   1.963 +     * Implement UnicodeMatcher::matches()
   1.964 +     * @stable ICU 2.4
   1.965 +     */
   1.966 +    virtual UMatchDegree matches(const Replaceable& text,
   1.967 +                         int32_t& offset,
   1.968 +                         int32_t limit,
   1.969 +                         UBool incremental);
   1.970 +
   1.971 +private:
   1.972 +    /**
   1.973 +     * Returns the longest match for s in text at the given position.
   1.974 +     * If limit > start then match forward from start+1 to limit
   1.975 +     * matching all characters except s.charAt(0).  If limit < start,
   1.976 +     * go backward starting from start-1 matching all characters
   1.977 +     * except s.charAt(s.length()-1).  This method assumes that the
   1.978 +     * first character, text.charAt(start), matches s, so it does not
   1.979 +     * check it.
   1.980 +     * @param text the text to match
   1.981 +     * @param start the first character to match.  In the forward
   1.982 +     * direction, text.charAt(start) is matched against s.charAt(0).
   1.983 +     * In the reverse direction, it is matched against
   1.984 +     * s.charAt(s.length()-1).
   1.985 +     * @param limit the limit offset for matching, either last+1 in
   1.986 +     * the forward direction, or last-1 in the reverse direction,
   1.987 +     * where last is the index of the last character to match.
   1.988 +     * @param s the string to match against text
   1.989 +     * @return If part of s matches up to the limit, return |limit -
   1.990 +     * start|.  If all of s matches before reaching the limit, return
   1.991 +     * s.length().  If there is a mismatch between s and text, return
   1.992 +     * 0
   1.993 +     */
   1.994 +    static int32_t matchRest(const Replaceable& text,
   1.995 +                             int32_t start, int32_t limit,
   1.996 +                             const UnicodeString& s);
   1.997 +
   1.998 +    /**
   1.999 +     * Returns the smallest value i such that c < list[i].  Caller
  1.1000 +     * must ensure that c is a legal value or this method will enter
  1.1001 +     * an infinite loop.  This method performs a binary search.
  1.1002 +     * @param c a character in the range MIN_VALUE..MAX_VALUE
  1.1003 +     * inclusive
  1.1004 +     * @return the smallest integer i in the range 0..len-1,
  1.1005 +     * inclusive, such that c < list[i]
  1.1006 +     */
  1.1007 +    int32_t findCodePoint(UChar32 c) const;
  1.1008 +
  1.1009 +public:
  1.1010 +
  1.1011 +    /**
  1.1012 +     * Implementation of UnicodeMatcher API.  Union the set of all
  1.1013 +     * characters that may be matched by this object into the given
  1.1014 +     * set.
  1.1015 +     * @param toUnionTo the set into which to union the source characters
  1.1016 +     * @stable ICU 2.4
  1.1017 +     */
  1.1018 +    virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
  1.1019 +
  1.1020 +    /**
  1.1021 +     * Returns the index of the given character within this set, where
  1.1022 +     * the set is ordered by ascending code point.  If the character
  1.1023 +     * is not in this set, return -1.  The inverse of this method is
  1.1024 +     * <code>charAt()</code>.
  1.1025 +     * @return an index from 0..size()-1, or -1
  1.1026 +     * @stable ICU 2.4
  1.1027 +     */
  1.1028 +    int32_t indexOf(UChar32 c) const;
  1.1029 +
  1.1030 +    /**
  1.1031 +     * Returns the character at the given index within this set, where
  1.1032 +     * the set is ordered by ascending code point.  If the index is
  1.1033 +     * out of range, return (UChar32)-1.  The inverse of this method is
  1.1034 +     * <code>indexOf()</code>.
  1.1035 +     * @param index an index from 0..size()-1
  1.1036 +     * @return the character at the given index, or (UChar32)-1.
  1.1037 +     * @stable ICU 2.4
  1.1038 +     */
  1.1039 +    UChar32 charAt(int32_t index) const;
  1.1040 +
  1.1041 +    /**
  1.1042 +     * Adds the specified range to this set if it is not already
  1.1043 +     * present.  If this set already contains the specified range,
  1.1044 +     * the call leaves this set unchanged.  If <code>start > end</code>
  1.1045 +     * then an empty range is added, leaving the set unchanged.
  1.1046 +     * This is equivalent to a boolean logic OR, or a set UNION.
  1.1047 +     * A frozen set will not be modified.
  1.1048 +     *
  1.1049 +     * @param start first character, inclusive, of range to be added
  1.1050 +     * to this set.
  1.1051 +     * @param end last character, inclusive, of range to be added
  1.1052 +     * to this set.
  1.1053 +     * @stable ICU 2.0
  1.1054 +     */
  1.1055 +    virtual UnicodeSet& add(UChar32 start, UChar32 end);
  1.1056 +
  1.1057 +    /**
  1.1058 +     * Adds the specified character to this set if it is not already
  1.1059 +     * present.  If this set already contains the specified character,
  1.1060 +     * the call leaves this set unchanged.
  1.1061 +     * A frozen set will not be modified.
  1.1062 +     * @stable ICU 2.0
  1.1063 +     */
  1.1064 +    UnicodeSet& add(UChar32 c);
  1.1065 +
  1.1066 +    /**
  1.1067 +     * Adds the specified multicharacter string to this set if it is not already
  1.1068 +     * present.  If this set already contains the multicharacter string,
  1.1069 +     * the call leaves this set unchanged.
  1.1070 +     * Thus "ch" => {"ch"}
  1.1071 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1072 +     * A frozen set will not be modified.
  1.1073 +     * @param s the source string
  1.1074 +     * @return this object, for chaining
  1.1075 +     * @stable ICU 2.4
  1.1076 +     */
  1.1077 +    UnicodeSet& add(const UnicodeString& s);
  1.1078 +
  1.1079 + private:
  1.1080 +    /**
  1.1081 +     * @return a code point IF the string consists of a single one.
  1.1082 +     * otherwise returns -1.
  1.1083 +     * @param s string to test
  1.1084 +     */
  1.1085 +    static int32_t getSingleCP(const UnicodeString& s);
  1.1086 +
  1.1087 +    void _add(const UnicodeString& s);
  1.1088 +
  1.1089 + public:
  1.1090 +    /**
  1.1091 +     * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
  1.1092 +     * If this set already contains any particular character, it has no effect on that character.
  1.1093 +     * A frozen set will not be modified.
  1.1094 +     * @param s the source string
  1.1095 +     * @return this object, for chaining
  1.1096 +     * @stable ICU 2.4
  1.1097 +     */
  1.1098 +    UnicodeSet& addAll(const UnicodeString& s);
  1.1099 +
  1.1100 +    /**
  1.1101 +     * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1102 +     * Elements of this set that are not characters of the string are removed.
  1.1103 +     * A frozen set will not be modified.
  1.1104 +     * @param s the source string
  1.1105 +     * @return this object, for chaining
  1.1106 +     * @stable ICU 2.4
  1.1107 +     */
  1.1108 +    UnicodeSet& retainAll(const UnicodeString& s);
  1.1109 +
  1.1110 +    /**
  1.1111 +     * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1112 +     * A character is removed if this set already contains it, and is added otherwise.
  1.1113 +     * A frozen set will not be modified.
  1.1114 +     * @param s the source string
  1.1115 +     * @return this object, for chaining
  1.1116 +     * @stable ICU 2.4
  1.1117 +     */
  1.1118 +    UnicodeSet& complementAll(const UnicodeString& s);
  1.1119 +
  1.1120 +    /**
  1.1121 +     * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1122 +     * If this set does not contain a particular character, it has no effect on that character.
  1.1123 +     * A frozen set will not be modified.
  1.1124 +     * @param s the source string
  1.1125 +     * @return this object, for chaining
  1.1126 +     * @stable ICU 2.4
  1.1127 +     */
  1.1128 +    UnicodeSet& removeAll(const UnicodeString& s);
  1.1129 +
  1.1130 +    /**
  1.1131 +     * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
  1.1132 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1133 +     * @param s the source string
  1.1134 +     * @return a newly created set containing the given string.
  1.1135 +     * The caller owns the return object and is responsible for deleting it.
  1.1136 +     * @stable ICU 2.4
  1.1137 +     */
  1.1138 +    static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
  1.1139 +
  1.1140 +
  1.1141 +    /**
  1.1142 +     * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
  1.1143 +     * @param s the source string
  1.1144 +     * @return a newly created set containing the given characters
  1.1145 +     * The caller owns the return object and is responsible for deleting it.
  1.1146 +     * @stable ICU 2.4
  1.1147 +     */
  1.1148 +    static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
  1.1149 +
  1.1150 +    /**
  1.1151 +     * Retain only the elements in this set that are contained in the
  1.1152 +     * specified range.  If <code>start > end</code> then an empty range is
  1.1153 +     * retained, leaving the set empty.  This is equivalent to
  1.1154 +     * a boolean logic AND, or a set INTERSECTION.
  1.1155 +     * A frozen set will not be modified.
  1.1156 +     *
  1.1157 +     * @param start first character, inclusive, of range to be retained
  1.1158 +     * in this set.
  1.1159 +     * @param end last character, inclusive, of range to be retained
  1.1160 +     * in this set.
  1.1161 +     * @stable ICU 2.0
  1.1162 +     */
  1.1163 +    virtual UnicodeSet& retain(UChar32 start, UChar32 end);
  1.1164 +
  1.1165 +
  1.1166 +    /**
  1.1167 +     * Retains only the specified character in this set; all other elements are removed.
  1.1168 +     * A frozen set will not be modified.
  1.1169 +     * @stable ICU 2.0
  1.1170 +     */
  1.1171 +    UnicodeSet& retain(UChar32 c);
  1.1172 +
  1.1173 +    /**
  1.1174 +     * Removes the specified range from this set if it is present.
  1.1175 +     * The set will not contain the specified range once the call
  1.1176 +     * returns.  If <code>start > end</code> then an empty range is
  1.1177 +     * removed, leaving the set unchanged.
  1.1178 +     * A frozen set will not be modified.
  1.1179 +     *
  1.1180 +     * @param start first character, inclusive, of range to be removed
  1.1181 +     * from this set.
  1.1182 +     * @param end last character, inclusive, of range to be removed
  1.1183 +     * from this set.
  1.1184 +     * @stable ICU 2.0
  1.1185 +     */
  1.1186 +    virtual UnicodeSet& remove(UChar32 start, UChar32 end);
  1.1187 +
  1.1188 +    /**
  1.1189 +     * Removes the specified character from this set if it is present.
  1.1190 +     * The set will not contain the specified character once the call
  1.1191 +     * returns.
  1.1192 +     * A frozen set will not be modified.
  1.1193 +     * @stable ICU 2.0
  1.1194 +     */
  1.1195 +    UnicodeSet& remove(UChar32 c);
  1.1196 +
  1.1197 +    /**
  1.1198 +     * Removes the specified string from this set if it is present.
  1.1199 +     * The set will not contain the specified string once the call
  1.1200 +     * returns.
  1.1201 +     * A frozen set will not be modified.
  1.1202 +     * @param s the source string
  1.1203 +     * @return this object, for chaining
  1.1204 +     * @stable ICU 2.4
  1.1205 +     */
  1.1206 +    UnicodeSet& remove(const UnicodeString& s);
  1.1207 +
  1.1208 +    /**
  1.1209 +     * Inverts this set.  This operation modifies this set so that
  1.1210 +     * its value is its complement.  This is equivalent to
  1.1211 +     * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
  1.1212 +     * A frozen set will not be modified.
  1.1213 +     * @stable ICU 2.0
  1.1214 +     */
  1.1215 +    virtual UnicodeSet& complement(void);
  1.1216 +
  1.1217 +    /**
  1.1218 +     * Complements the specified range in this set.  Any character in
  1.1219 +     * the range will be removed if it is in this set, or will be
  1.1220 +     * added if it is not in this set.  If <code>start > end</code>
  1.1221 +     * then an empty range is complemented, leaving the set unchanged.
  1.1222 +     * This is equivalent to a boolean logic XOR.
  1.1223 +     * A frozen set will not be modified.
  1.1224 +     *
  1.1225 +     * @param start first character, inclusive, of range to be complemented
  1.1226 +     * in this set.
  1.1227 +     * @param end last character, inclusive, of range to be complemented
  1.1228 +     * in this set.
  1.1229 +     * @stable ICU 2.0
  1.1230 +     */
  1.1231 +    virtual UnicodeSet& complement(UChar32 start, UChar32 end);
  1.1232 +
  1.1233 +    /**
  1.1234 +     * Complements the specified character in this set.  The character
  1.1235 +     * will be removed if it is in this set, or will be added if it is
  1.1236 +     * not in this set.
  1.1237 +     * A frozen set will not be modified.
  1.1238 +     * @stable ICU 2.0
  1.1239 +     */
  1.1240 +    UnicodeSet& complement(UChar32 c);
  1.1241 +
  1.1242 +    /**
  1.1243 +     * Complement the specified string in this set.
  1.1244 +     * The string will be removed if it is in this set, or will be
  1.1245 +     * added if it is not in this set.
  1.1246 +     * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1247 +     * A frozen set will not be modified.
  1.1248 +     * @param s the string to complement
  1.1249 +     * @return this object, for chaining
  1.1250 +     * @stable ICU 2.4
  1.1251 +     */
  1.1252 +    UnicodeSet& complement(const UnicodeString& s);
  1.1253 +
  1.1254 +    /**
  1.1255 +     * Adds all of the elements in the specified set to this set if
  1.1256 +     * they're not already present.  This operation effectively
  1.1257 +     * modifies this set so that its value is the <i>union</i> of the two
  1.1258 +     * sets.  The behavior of this operation is unspecified if the specified
  1.1259 +     * collection is modified while the operation is in progress.
  1.1260 +     * A frozen set will not be modified.
  1.1261 +     *
  1.1262 +     * @param c set whose elements are to be added to this set.
  1.1263 +     * @see #add(UChar32, UChar32)
  1.1264 +     * @stable ICU 2.0
  1.1265 +     */
  1.1266 +    virtual UnicodeSet& addAll(const UnicodeSet& c);
  1.1267 +
  1.1268 +    /**
  1.1269 +     * Retains only the elements in this set that are contained in the
  1.1270 +     * specified set.  In other words, removes from this set all of
  1.1271 +     * its elements that are not contained in the specified set.  This
  1.1272 +     * operation effectively modifies this set so that its value is
  1.1273 +     * the <i>intersection</i> of the two sets.
  1.1274 +     * A frozen set will not be modified.
  1.1275 +     *
  1.1276 +     * @param c set that defines which elements this set will retain.
  1.1277 +     * @stable ICU 2.0
  1.1278 +     */
  1.1279 +    virtual UnicodeSet& retainAll(const UnicodeSet& c);
  1.1280 +
  1.1281 +    /**
  1.1282 +     * Removes from this set all of its elements that are contained in the
  1.1283 +     * specified set.  This operation effectively modifies this
  1.1284 +     * set so that its value is the <i>asymmetric set difference</i> of
  1.1285 +     * the two sets.
  1.1286 +     * A frozen set will not be modified.
  1.1287 +     *
  1.1288 +     * @param c set that defines which elements will be removed from
  1.1289 +     *          this set.
  1.1290 +     * @stable ICU 2.0
  1.1291 +     */
  1.1292 +    virtual UnicodeSet& removeAll(const UnicodeSet& c);
  1.1293 +
  1.1294 +    /**
  1.1295 +     * Complements in this set all elements contained in the specified
  1.1296 +     * set.  Any character in the other set will be removed if it is
  1.1297 +     * in this set, or will be added if it is not in this set.
  1.1298 +     * A frozen set will not be modified.
  1.1299 +     *
  1.1300 +     * @param c set that defines which elements will be xor'ed with
  1.1301 +     *          this set.
  1.1302 +     * @stable ICU 2.4
  1.1303 +     */
  1.1304 +    virtual UnicodeSet& complementAll(const UnicodeSet& c);
  1.1305 +
  1.1306 +    /**
  1.1307 +     * Removes all of the elements from this set.  This set will be
  1.1308 +     * empty after this call returns.
  1.1309 +     * A frozen set will not be modified.
  1.1310 +     * @stable ICU 2.0
  1.1311 +     */
  1.1312 +    virtual UnicodeSet& clear(void);
  1.1313 +
  1.1314 +    /**
  1.1315 +     * Close this set over the given attribute.  For the attribute
  1.1316 +     * USET_CASE, the result is to modify this set so that:
  1.1317 +     *
  1.1318 +     * 1. For each character or string 'a' in this set, all strings or
  1.1319 +     * characters 'b' such that foldCase(a) == foldCase(b) are added
  1.1320 +     * to this set.
  1.1321 +     *
  1.1322 +     * 2. For each string 'e' in the resulting set, if e !=
  1.1323 +     * foldCase(e), 'e' will be removed.
  1.1324 +     *
  1.1325 +     * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
  1.1326 +     *
  1.1327 +     * (Here foldCase(x) refers to the operation u_strFoldCase, and a
  1.1328 +     * == b denotes that the contents are the same, not pointer
  1.1329 +     * comparison.)
  1.1330 +     *
  1.1331 +     * A frozen set will not be modified.
  1.1332 +     *
  1.1333 +     * @param attribute bitmask for attributes to close over.
  1.1334 +     * Currently only the USET_CASE bit is supported.  Any undefined bits
  1.1335 +     * are ignored.
  1.1336 +     * @return a reference to this set.
  1.1337 +     * @stable ICU 4.2
  1.1338 +     */
  1.1339 +    UnicodeSet& closeOver(int32_t attribute);
  1.1340 +
  1.1341 +    /**
  1.1342 +     * Remove all strings from this set.
  1.1343 +     *
  1.1344 +     * @return a reference to this set.
  1.1345 +     * @stable ICU 4.2
  1.1346 +     */
  1.1347 +    virtual UnicodeSet &removeAllStrings();
  1.1348 +
  1.1349 +    /**
  1.1350 +     * Iteration method that returns the number of ranges contained in
  1.1351 +     * this set.
  1.1352 +     * @see #getRangeStart
  1.1353 +     * @see #getRangeEnd
  1.1354 +     * @stable ICU 2.4
  1.1355 +     */
  1.1356 +    virtual int32_t getRangeCount(void) const;
  1.1357 +
  1.1358 +    /**
  1.1359 +     * Iteration method that returns the first character in the
  1.1360 +     * specified range of this set.
  1.1361 +     * @see #getRangeCount
  1.1362 +     * @see #getRangeEnd
  1.1363 +     * @stable ICU 2.4
  1.1364 +     */
  1.1365 +    virtual UChar32 getRangeStart(int32_t index) const;
  1.1366 +
  1.1367 +    /**
  1.1368 +     * Iteration method that returns the last character in the
  1.1369 +     * specified range of this set.
  1.1370 +     * @see #getRangeStart
  1.1371 +     * @see #getRangeCount
  1.1372 +     * @stable ICU 2.4
  1.1373 +     */
  1.1374 +    virtual UChar32 getRangeEnd(int32_t index) const;
  1.1375 +
  1.1376 +    /**
  1.1377 +     * Serializes this set into an array of 16-bit integers.  Serialization
  1.1378 +     * (currently) only records the characters in the set; multicharacter
  1.1379 +     * strings are ignored.
  1.1380 +     *
  1.1381 +     * The array has the following format (each line is one 16-bit
  1.1382 +     * integer):
  1.1383 +     *
  1.1384 +     *  length     = (n+2*m) | (m!=0?0x8000:0)
  1.1385 +     *  bmpLength  = n; present if m!=0
  1.1386 +     *  bmp[0]
  1.1387 +     *  bmp[1]
  1.1388 +     *  ...
  1.1389 +     *  bmp[n-1]
  1.1390 +     *  supp-high[0]
  1.1391 +     *  supp-low[0]
  1.1392 +     *  supp-high[1]
  1.1393 +     *  supp-low[1]
  1.1394 +     *  ...
  1.1395 +     *  supp-high[m-1]
  1.1396 +     *  supp-low[m-1]
  1.1397 +     *
  1.1398 +     * The array starts with a header.  After the header are n bmp
  1.1399 +     * code points, then m supplementary code points.  Either n or m
  1.1400 +     * or both may be zero.  n+2*m is always <= 0x7FFF.
  1.1401 +     *
  1.1402 +     * If there are no supplementary characters (if m==0) then the
  1.1403 +     * header is one 16-bit integer, 'length', with value n.
  1.1404 +     *
  1.1405 +     * If there are supplementary characters (if m!=0) then the header
  1.1406 +     * is two 16-bit integers.  The first, 'length', has value
  1.1407 +     * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
  1.1408 +     *
  1.1409 +     * After the header the code points are stored in ascending order.
  1.1410 +     * Supplementary code points are stored as most significant 16
  1.1411 +     * bits followed by least significant 16 bits.
  1.1412 +     *
  1.1413 +     * @param dest pointer to buffer of destCapacity 16-bit integers.
  1.1414 +     * May be NULL only if destCapacity is zero.
  1.1415 +     * @param destCapacity size of dest, or zero.  Must not be negative.
  1.1416 +     * @param ec error code.  Will be set to U_INDEX_OUTOFBOUNDS_ERROR
  1.1417 +     * if n+2*m > 0x7FFF.  Will be set to U_BUFFER_OVERFLOW_ERROR if
  1.1418 +     * n+2*m+(m!=0?2:1) > destCapacity.
  1.1419 +     * @return the total length of the serialized format, including
  1.1420 +     * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
  1.1421 +     * than U_BUFFER_OVERFLOW_ERROR.
  1.1422 +     * @stable ICU 2.4
  1.1423 +     */
  1.1424 +    int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
  1.1425 +
  1.1426 +    /**
  1.1427 +     * Reallocate this object's internal structures to take up the least
  1.1428 +     * possible space, without changing this object's value.
  1.1429 +     * A frozen set will not be modified.
  1.1430 +     * @stable ICU 2.4
  1.1431 +     */
  1.1432 +    virtual UnicodeSet& compact();
  1.1433 +
  1.1434 +    /**
  1.1435 +     * Return the class ID for this class.  This is useful only for
  1.1436 +     * comparing to a return value from getDynamicClassID().  For example:
  1.1437 +     * <pre>
  1.1438 +     * .      Base* polymorphic_pointer = createPolymorphicObject();
  1.1439 +     * .      if (polymorphic_pointer->getDynamicClassID() ==
  1.1440 +     * .          Derived::getStaticClassID()) ...
  1.1441 +     * </pre>
  1.1442 +     * @return          The class ID for all objects of this class.
  1.1443 +     * @stable ICU 2.0
  1.1444 +     */
  1.1445 +    static UClassID U_EXPORT2 getStaticClassID(void);
  1.1446 +
  1.1447 +    /**
  1.1448 +     * Implement UnicodeFunctor API.
  1.1449 +     *
  1.1450 +     * @return The class ID for this object. All objects of a given
  1.1451 +     * class have the same class ID.  Objects of other classes have
  1.1452 +     * different class IDs.
  1.1453 +     * @stable ICU 2.4
  1.1454 +     */
  1.1455 +    virtual UClassID getDynamicClassID(void) const;
  1.1456 +
  1.1457 +private:
  1.1458 +
  1.1459 +    // Private API for the USet API
  1.1460 +
  1.1461 +    friend class USetAccess;
  1.1462 +
  1.1463 +    int32_t getStringCount() const;
  1.1464 +
  1.1465 +    const UnicodeString* getString(int32_t index) const;
  1.1466 +
  1.1467 +    //----------------------------------------------------------------
  1.1468 +    // RuleBasedTransliterator support
  1.1469 +    //----------------------------------------------------------------
  1.1470 +
  1.1471 +private:
  1.1472 +
  1.1473 +    /**
  1.1474 +     * Returns <tt>true</tt> if this set contains any character whose low byte
  1.1475 +     * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
  1.1476 +     * indexing.
  1.1477 +     */
  1.1478 +    virtual UBool matchesIndexValue(uint8_t v) const;
  1.1479 +
  1.1480 +private:
  1.1481 +    friend class RBBIRuleScanner;
  1.1482 +
  1.1483 +    //----------------------------------------------------------------
  1.1484 +    // Implementation: Clone as thawed (see ICU4J Freezable)
  1.1485 +    //----------------------------------------------------------------
  1.1486 +
  1.1487 +    UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
  1.1488 +
  1.1489 +    //----------------------------------------------------------------
  1.1490 +    // Implementation: Pattern parsing
  1.1491 +    //----------------------------------------------------------------
  1.1492 +
  1.1493 +    void applyPatternIgnoreSpace(const UnicodeString& pattern,
  1.1494 +                                 ParsePosition& pos,
  1.1495 +                                 const SymbolTable* symbols,
  1.1496 +                                 UErrorCode& status);
  1.1497 +
  1.1498 +    void applyPattern(RuleCharacterIterator& chars,
  1.1499 +                      const SymbolTable* symbols,
  1.1500 +                      UnicodeString& rebuiltPat,
  1.1501 +                      uint32_t options,
  1.1502 +                      UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
  1.1503 +                      UErrorCode& ec);
  1.1504 +
  1.1505 +    //----------------------------------------------------------------
  1.1506 +    // Implementation: Utility methods
  1.1507 +    //----------------------------------------------------------------
  1.1508 +
  1.1509 +    void ensureCapacity(int32_t newLen, UErrorCode& ec);
  1.1510 +
  1.1511 +    void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
  1.1512 +
  1.1513 +    void swapBuffers(void);
  1.1514 +
  1.1515 +    UBool allocateStrings(UErrorCode &status);
  1.1516 +
  1.1517 +    UnicodeString& _toPattern(UnicodeString& result,
  1.1518 +                              UBool escapeUnprintable) const;
  1.1519 +
  1.1520 +    UnicodeString& _generatePattern(UnicodeString& result,
  1.1521 +                                    UBool escapeUnprintable) const;
  1.1522 +
  1.1523 +    static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
  1.1524 +
  1.1525 +    static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
  1.1526 +
  1.1527 +    //----------------------------------------------------------------
  1.1528 +    // Implementation: Fundamental operators
  1.1529 +    //----------------------------------------------------------------
  1.1530 +
  1.1531 +    void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1532 +
  1.1533 +    void add(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1534 +
  1.1535 +    void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
  1.1536 +
  1.1537 +    /**
  1.1538 +     * Return true if the given position, in the given pattern, appears
  1.1539 +     * to be the start of a property set pattern [:foo:], \\p{foo},
  1.1540 +     * \\P{foo}, or \\N{name}.
  1.1541 +     */
  1.1542 +    static UBool resemblesPropertyPattern(const UnicodeString& pattern,
  1.1543 +                                          int32_t pos);
  1.1544 +
  1.1545 +    static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
  1.1546 +                                          int32_t iterOpts);
  1.1547 +
  1.1548 +    /**
  1.1549 +     * Parse the given property pattern at the given parse position
  1.1550 +     * and set this UnicodeSet to the result.
  1.1551 +     *
  1.1552 +     * The original design document is out of date, but still useful.
  1.1553 +     * Ignore the property and value names:
  1.1554 +     * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/unicodeset_properties.html
  1.1555 +     *
  1.1556 +     * Recognized syntax:
  1.1557 +     *
  1.1558 +     * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
  1.1559 +     * \\p{foo} \\P{foo}  - white space not allowed within "\\p" or "\\P"
  1.1560 +     * \\N{name}         - white space not allowed within "\\N"
  1.1561 +     *
  1.1562 +     * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
  1.1563 +     * Case is ignored except in "\\p" and "\\P" and "\\N".  In 'name' leading
  1.1564 +     * and trailing space is deleted, and internal runs of whitespace
  1.1565 +     * are collapsed to a single space.
  1.1566 +     *
  1.1567 +     * We support binary properties, enumerated properties, and the
  1.1568 +     * following non-enumerated properties:
  1.1569 +     *
  1.1570 +     *  Numeric_Value
  1.1571 +     *  Name
  1.1572 +     *  Unicode_1_Name
  1.1573 +     *
  1.1574 +     * @param pattern the pattern string
  1.1575 +     * @param ppos on entry, the position at which to begin parsing.
  1.1576 +     * This should be one of the locations marked '^':
  1.1577 +     *
  1.1578 +     *   [:blah:]     \\p{blah}     \\P{blah}     \\N{name}
  1.1579 +     *   ^       %    ^       %    ^       %    ^       %
  1.1580 +     *
  1.1581 +     * On return, ppos is the position after the last character parsed, that is,
  1.1582 +     * the locations marked '%'.  If the parse fails, ppos is returned
  1.1583 +     * unchanged.
  1.1584 +     * @param ec status
  1.1585 +     * @return a reference to this.
  1.1586 +     */
  1.1587 +    UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
  1.1588 +                                     ParsePosition& ppos,
  1.1589 +                                     UErrorCode &ec);
  1.1590 +
  1.1591 +    void applyPropertyPattern(RuleCharacterIterator& chars,
  1.1592 +                              UnicodeString& rebuiltPat,
  1.1593 +                              UErrorCode& ec);
  1.1594 +
  1.1595 +    friend void UnicodeSet_initInclusion(int32_t src, UErrorCode &status);
  1.1596 +    static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
  1.1597 +
  1.1598 +    /**
  1.1599 +     * A filter that returns TRUE if the given code point should be
  1.1600 +     * included in the UnicodeSet being constructed.
  1.1601 +     */
  1.1602 +    typedef UBool (*Filter)(UChar32 codePoint, void* context);
  1.1603 +
  1.1604 +    /**
  1.1605 +     * Given a filter, set this UnicodeSet to the code points
  1.1606 +     * contained by that filter.  The filter MUST be
  1.1607 +     * property-conformant.  That is, if it returns value v for one
  1.1608 +     * code point, then it must return v for all affiliated code
  1.1609 +     * points, as defined by the inclusions list.  See
  1.1610 +     * getInclusions().
  1.1611 +     * src is a UPropertySource value.
  1.1612 +     */
  1.1613 +    void applyFilter(Filter filter,
  1.1614 +                     void* context,
  1.1615 +                     int32_t src,
  1.1616 +                     UErrorCode &status);
  1.1617 +
  1.1618 +    /**
  1.1619 +     * Set the new pattern to cache.
  1.1620 +     */
  1.1621 +    void setPattern(const UnicodeString& newPat);
  1.1622 +    /**
  1.1623 +     * Release existing cached pattern.
  1.1624 +     */
  1.1625 +    void releasePattern();
  1.1626 +
  1.1627 +    friend class UnicodeSetIterator;
  1.1628 +};
  1.1629 +
  1.1630 +
  1.1631 +
  1.1632 +inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
  1.1633 +    return !(this->operator==(o));  // inequality is exactly the negation of operator==
  1.1634 +}
  1.1635 +
  1.1636 +inline UBool UnicodeSet::isFrozen() const {
  1.1637 +    return (UBool)!(bmpSet==NULL && stringSpan==NULL);  // frozen iff bmpSet or stringSpan is non-NULL
  1.1638 +}
  1.1639 +
  1.1640 +inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
  1.1641 +    return (UBool)!containsNone(start, end);  // "some" is the negation of "none" for the range
  1.1642 +}
  1.1643 +
  1.1644 +inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
  1.1645 +    return (UBool)!containsNone(s);  // "some" is the negation of "none" for the other set
  1.1646 +}
  1.1647 +
  1.1648 +inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
  1.1649 +    return (UBool)!containsNone(s);  // "some" is the negation of "none" for the string
  1.1650 +}
  1.1651 +
  1.1652 +inline UBool UnicodeSet::isBogus() const {
  1.1653 +    return static_cast<UBool>(kIsBogus & fFlags);  // extract the kIsBogus bit(s) of fFlags
  1.1654 +}
  1.1655 +
  1.1656 +inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
  1.1657 +    return reinterpret_cast<UnicodeSet *>(uset);  // pointer is reinterpreted only; nothing is created or copied
  1.1658 +}
  1.1659 +
  1.1660 +inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
  1.1661 +    return reinterpret_cast<const UnicodeSet *>(uset);  // const overload: same reinterpretation, constness kept
  1.1662 +}
  1.1663 +
  1.1664 +inline USet *UnicodeSet::toUSet() {
  1.1665 +    return reinterpret_cast<USet *>(this);  // this object's pointer, reinterpreted as a USet pointer
  1.1666 +}
  1.1667 +
  1.1668 +inline const USet *UnicodeSet::toUSet() const {
  1.1669 +    return reinterpret_cast<const USet *>(this);  // const overload: same reinterpretation, constness kept
  1.1670 +}
  1.1671 +
  1.1672 +inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
  1.1673 +    // Pin start into [0, s.length()], then delegate to the buffer-based
  1.1674 +    // span() overload on the text from that offset onward.
  1.1675 +    const int32_t textLength=s.length();
  1.1676 +    const int32_t from= start<0 ? 0 : (start>textLength ? textLength : start);
  1.1677 +    // The overload is handed a pointer that already skips 'from' units,
  1.1678 +    // so 'from' is added back to its result before returning.
  1.1679 +    return from+span(s.getBuffer()+from, textLength-from, spanCondition);
  1.1680 +}
  1.1681 +
  1.1682 +inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
  1.1683 +    // Pin limit into [0, s.length()], then delegate to the buffer-based
  1.1684 +    // spanBack() overload, passing the pinned limit as the length.
  1.1685 +    const int32_t textLength=s.length();
  1.1686 +    const int32_t upTo= limit<0 ? 0 : (limit>textLength ? textLength : limit);
  1.1687 +    // The buffer is passed from its beginning, so the overload's result
  1.1688 +    // is returned as-is with no offset adjustment.
  1.1689 +    return spanBack(s.getBuffer(), upTo, spanCondition);
  1.1690 +}
  1.1691 +
  1.1692 +U_NAMESPACE_END
  1.1693 +
  1.1694 +#endif

mercurial