The Tor Browser: diff intl/icu/source/common/uniset.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/uniset.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2283 @@
     1.4 +/*
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 1999-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   Date        Name        Description
    1.10 +*   10/20/99    alan        Creation.
    1.11 +**********************************************************************
    1.12 +*/
    1.13 +
    1.14 +#include "unicode/utypes.h"
    1.15 +#include "unicode/parsepos.h"
    1.16 +#include "unicode/symtable.h"
    1.17 +#include "unicode/uniset.h"
    1.18 +#include "unicode/utf8.h"
    1.19 +#include "unicode/utf16.h"
    1.20 +#include "ruleiter.h"
    1.21 +#include "cmemory.h"
    1.22 +#include "cstring.h"
    1.23 +#include "patternprops.h"
    1.24 +#include "uelement.h"
    1.25 +#include "util.h"
    1.26 +#include "uvector.h"
    1.27 +#include "charstr.h"
    1.28 +#include "ustrfmt.h"
    1.29 +#include "uassert.h"
    1.30 +#include "bmpset.h"
    1.31 +#include "unisetspan.h"
    1.32 +
    1.33 +// Define UChar constants using hex for EBCDIC compatibility
    1.34 +// Used #define to reduce private static exports and memory access time.
    1.35 +#define SET_OPEN        ((UChar)0x005B) /*[*/
    1.36 +#define SET_CLOSE       ((UChar)0x005D) /*]*/
    1.37 +#define HYPHEN          ((UChar)0x002D) /*-*/
    1.38 +#define COMPLEMENT      ((UChar)0x005E) /*^*/
    1.39 +#define COLON           ((UChar)0x003A) /*:*/
    1.40 +#define BACKSLASH       ((UChar)0x005C) /*\*/
    1.41 +#define INTERSECTION    ((UChar)0x0026) /*&*/
    1.42 +#define UPPER_U         ((UChar)0x0055) /*U*/
    1.43 +#define LOWER_U         ((UChar)0x0075) /*u*/
    1.44 +#define OPEN_BRACE      ((UChar)123)    /*{*/
    1.45 +#define CLOSE_BRACE     ((UChar)125)    /*}*/
    1.46 +#define UPPER_P         ((UChar)0x0050) /*P*/
    1.47 +#define LOWER_P         ((UChar)0x0070) /*p*/
    1.48 +#define UPPER_N         ((UChar)78)     /*N*/
    1.49 +#define EQUALS          ((UChar)0x003D) /*=*/
    1.50 +
    1.51 +// HIGH_VALUE > all valid values. 110000 for codepoints
    1.52 +#define UNICODESET_HIGH 0x0110000
    1.53 +
    1.54 +// LOW <= all valid values. ZERO for codepoints
    1.55 +#define UNICODESET_LOW 0x000000
    1.56 +
    1.57 +// initial storage. Must be >= 0
    1.58 +#define START_EXTRA 16
    1.59 +
    1.60 +// extra amount for growth. Must be >= 0
    1.61 +#define GROW_EXTRA START_EXTRA
    1.62 +
    1.63 +U_NAMESPACE_BEGIN
    1.64 +
    1.65 +SymbolTable::~SymbolTable() {}
    1.66 +
    1.67 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
    1.68 +
    1.69 +/**
    1.70 + * Modify the given UChar32 variable so that it is in range, by
    1.71 + * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
    1.72 + * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
    1.73 + * It modifies its argument in-place and also returns it.
    1.74 + */
    1.75 +static inline UChar32 pinCodePoint(UChar32& c) {
    1.76 +    if (c < UNICODESET_LOW) {
    1.77 +        c = UNICODESET_LOW;
    1.78 +    } else if (c > (UNICODESET_HIGH-1)) {
    1.79 +        c = (UNICODESET_HIGH-1);
    1.80 +    }
    1.81 +    return c;
    1.82 +}
    1.83 +
    1.84 +//----------------------------------------------------------------
    1.85 +// Debugging
    1.86 +//----------------------------------------------------------------
    1.87 +
    1.88 +// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
    1.89 +// To enable the debugging, define the symbol DEBUG_MEM in the line
    1.90 +// below.  This will result in text being sent to stdout that looks
    1.91 +// like this:
    1.92 +//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
    1.93 +//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
    1.94 +// Each line lists a construction (ct) or destruction (dt) event, the
    1.95 +// object address, the number of outstanding objects after the event,
    1.96 +// and the pattern of the object in question.
    1.97 +
    1.98 +// #define DEBUG_MEM
    1.99 +
   1.100 +#ifdef DEBUG_MEM
   1.101 +#include <stdio.h>
   1.102 +static int32_t _dbgCount = 0;
   1.103 +
   1.104 +static inline void _dbgct(UnicodeSet* set) {
   1.105 +    UnicodeString str;
   1.106 +    set->toPattern(str, TRUE);
   1.107 +    char buf[40];
   1.108 +    str.extract(0, 39, buf, "");
   1.109 +    printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
   1.110 +}
   1.111 +
   1.112 +static inline void _dbgdt(UnicodeSet* set) {
   1.113 +    UnicodeString str;
   1.114 +    set->toPattern(str, TRUE);
   1.115 +    char buf[40];
   1.116 +    str.extract(0, 39, buf, "");
   1.117 +    printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
   1.118 +}
   1.119 +
   1.120 +#else
   1.121 +
   1.122 +#define _dbgct(set)
   1.123 +#define _dbgdt(set)
   1.124 +
   1.125 +#endif
   1.126 +
   1.127 +//----------------------------------------------------------------
   1.128 +// UnicodeString in UVector support
   1.129 +//----------------------------------------------------------------
   1.130 +
   1.131 +static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
   1.132 +    dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
   1.133 +}
   1.134 +
   1.135 +static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
   1.136 +    const UnicodeString &a = *(const UnicodeString*)t1.pointer;
   1.137 +    const UnicodeString &b = *(const UnicodeString*)t2.pointer;
   1.138 +    return a.compare(b);
   1.139 +}
   1.140 +
   1.141 +//----------------------------------------------------------------
   1.142 +// Constructors &c
   1.143 +//----------------------------------------------------------------
   1.144 +
   1.145 +/**
   1.146 + * Constructs an empty set.
   1.147 + */
   1.148 +UnicodeSet::UnicodeSet() :
   1.149 +    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
   1.150 +    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
   1.151 +    fFlags(0)
   1.152 +{
   1.153 +    UErrorCode status = U_ZERO_ERROR;
   1.154 +    allocateStrings(status);
   1.155 +    if (U_FAILURE(status)) {
   1.156 +        return;
   1.157 +    }
   1.158 +    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
   1.159 +    if(list!=NULL){
   1.160 +        list[0] = UNICODESET_HIGH;
   1.161 +    } else { // If memory allocation failed, set to bogus state.
   1.162 +        setToBogus();
   1.163 +        return;
   1.164 +    }
   1.165 +    _dbgct(this);
   1.166 +}
   1.167 +
   1.168 +/**
   1.169 + * Constructs a set containing the given range. If <code>end >
   1.170 + * start</code> then an empty set is created.
   1.171 + *
   1.172 + * @param start first character, inclusive, of range
   1.173 + * @param end last character, inclusive, of range
   1.174 + */
   1.175 +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
   1.176 +    len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
   1.177 +    bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
   1.178 +    fFlags(0)
   1.179 +{
   1.180 +    UErrorCode status = U_ZERO_ERROR;
   1.181 +    allocateStrings(status);
   1.182 +    if (U_FAILURE(status)) {
   1.183 +        return;
   1.184 +    }
   1.185 +    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
   1.186 +    if(list!=NULL){
   1.187 +        list[0] = UNICODESET_HIGH;
   1.188 +        complement(start, end);
   1.189 +    } else { // If memory allocation failed, set to bogus state.
   1.190 +        setToBogus();
   1.191 +        return;
   1.192 +    }
   1.193 +    _dbgct(this);
   1.194 +}
   1.195 +
   1.196 +/**
   1.197 + * Constructs a set that is identical to the given UnicodeSet.
   1.198 + */
   1.199 +UnicodeSet::UnicodeSet(const UnicodeSet& o) :
   1.200 +    UnicodeFilter(o),
   1.201 +    len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
   1.202 +    bmpSet(0),
   1.203 +    buffer(0), bufferCapacity(0),
   1.204 +    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
   1.205 +    fFlags(0)
   1.206 +{
   1.207 +    UErrorCode status = U_ZERO_ERROR;
   1.208 +    allocateStrings(status);
   1.209 +    if (U_FAILURE(status)) {
   1.210 +        return;
   1.211 +    }
   1.212 +    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
   1.213 +    if(list!=NULL){
   1.214 +        *this = o;
   1.215 +    } else { // If memory allocation failed, set to bogus state.
   1.216 +        setToBogus();
   1.217 +        return;
   1.218 +    }
   1.219 +    _dbgct(this);
   1.220 +}
   1.221 +
   1.222 +// Copy-construct as thawed.
   1.223 +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
   1.224 +    UnicodeFilter(o),
   1.225 +    len(0), capacity(o.len + GROW_EXTRA), list(0),
   1.226 +    bmpSet(0),
   1.227 +    buffer(0), bufferCapacity(0),
   1.228 +    patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
   1.229 +    fFlags(0)
   1.230 +{
   1.231 +    UErrorCode status = U_ZERO_ERROR;
   1.232 +    allocateStrings(status);
   1.233 +    if (U_FAILURE(status)) {
   1.234 +        return;
   1.235 +    }
   1.236 +    list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
   1.237 +    if(list!=NULL){
   1.238 +        // *this = o except for bmpSet and stringSpan
   1.239 +        len = o.len;
   1.240 +        uprv_memcpy(list, o.list, len*sizeof(UChar32));
   1.241 +        if (strings != NULL && o.strings != NULL) {
   1.242 +            strings->assign(*o.strings, cloneUnicodeString, status);
   1.243 +        } else { // Invalid strings.
   1.244 +            setToBogus();
   1.245 +            return;
   1.246 +        }
   1.247 +        if (o.pat) {
   1.248 +            setPattern(UnicodeString(o.pat, o.patLen));
   1.249 +        }
   1.250 +    } else { // If memory allocation failed, set to bogus state.
   1.251 +        setToBogus();
   1.252 +        return;
   1.253 +    }
   1.254 +    _dbgct(this);
   1.255 +}
   1.256 +
   1.257 +/**
   1.258 + * Destructs the set.
   1.259 + */
   1.260 +UnicodeSet::~UnicodeSet() {
   1.261 +    _dbgdt(this); // first!
   1.262 +    uprv_free(list);
   1.263 +    delete bmpSet;
   1.264 +    if (buffer) {
   1.265 +        uprv_free(buffer);
   1.266 +    }
   1.267 +    delete strings;
   1.268 +    delete stringSpan;
   1.269 +    releasePattern();
   1.270 +}
   1.271 +
   1.272 +/**
   1.273 + * Assigns this object to be a copy of another.
   1.274 + */
   1.275 +UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
   1.276 +    if (this == &o) {
   1.277 +        return *this;
   1.278 +    }
   1.279 +    if (isFrozen()) {
   1.280 +        return *this;
   1.281 +    }
   1.282 +    if (o.isBogus()) {
   1.283 +        setToBogus();
   1.284 +        return *this;
   1.285 +    }
   1.286 +    UErrorCode ec = U_ZERO_ERROR;
   1.287 +    ensureCapacity(o.len, ec);
   1.288 +    if (U_FAILURE(ec)) {
   1.289 +        return *this; // There is no way to report this error :-(
   1.290 +    }
   1.291 +    len = o.len;
   1.292 +    uprv_memcpy(list, o.list, len*sizeof(UChar32));
   1.293 +    if (o.bmpSet == NULL) {
   1.294 +        bmpSet = NULL;
   1.295 +    } else {
   1.296 +        bmpSet = new BMPSet(*o.bmpSet, list, len);
   1.297 +        if (bmpSet == NULL) { // Check for memory allocation error.
   1.298 +            setToBogus();
   1.299 +            return *this;
   1.300 +        }
   1.301 +    }
   1.302 +    if (strings != NULL && o.strings != NULL) {
   1.303 +        strings->assign(*o.strings, cloneUnicodeString, ec);
   1.304 +    } else { // Invalid strings.
   1.305 +        setToBogus();
   1.306 +        return *this;
   1.307 +    }
   1.308 +    if (o.stringSpan == NULL) {
   1.309 +        stringSpan = NULL;
   1.310 +    } else {
   1.311 +        stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
   1.312 +        if (stringSpan == NULL) { // Check for memory allocation error.
   1.313 +            setToBogus();
   1.314 +            return *this;
   1.315 +        }
   1.316 +    }
   1.317 +    releasePattern();
   1.318 +    if (o.pat) {
   1.319 +        setPattern(UnicodeString(o.pat, o.patLen));
   1.320 +    }
   1.321 +    return *this;
   1.322 +}
   1.323 +
   1.324 +/**
   1.325 + * Returns a copy of this object.  All UnicodeMatcher objects have
   1.326 + * to support cloning in order to allow classes using
   1.327 + * UnicodeMatchers, such as Transliterator, to implement cloning.
   1.328 + */
   1.329 +UnicodeFunctor* UnicodeSet::clone() const {
   1.330 +    return new UnicodeSet(*this);
   1.331 +}
   1.332 +
   1.333 +UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
   1.334 +    return new UnicodeSet(*this, TRUE);
   1.335 +}
   1.336 +
   1.337 +/**
   1.338 + * Compares the specified object with this set for equality.  Returns
   1.339 + * <tt>true</tt> if the two sets
   1.340 + * have the same size, and every member of the specified set is
   1.341 + * contained in this set (or equivalently, every member of this set is
   1.342 + * contained in the specified set).
   1.343 + *
   1.344 + * @param o set to be compared for equality with this set.
   1.345 + * @return <tt>true</tt> if the specified set is equal to this set.
   1.346 + */
   1.347 +UBool UnicodeSet::operator==(const UnicodeSet& o) const {
   1.348 +    if (len != o.len) return FALSE;
   1.349 +    for (int32_t i = 0; i < len; ++i) {
   1.350 +        if (list[i] != o.list[i]) return FALSE;
   1.351 +    }
   1.352 +    if (*strings != *o.strings) return FALSE;
   1.353 +    return TRUE;
   1.354 +}
   1.355 +
   1.356 +/**
   1.357 + * Returns the hash code value for this set.
   1.358 + *
   1.359 + * @return the hash code value for this set.
   1.360 + * @see Object#hashCode()
   1.361 + */
   1.362 +int32_t UnicodeSet::hashCode(void) const {
   1.363 +    int32_t result = len;
   1.364 +    for (int32_t i = 0; i < len; ++i) {
   1.365 +        result *= 1000003;
   1.366 +        result += list[i];
   1.367 +    }
   1.368 +    return result;
   1.369 +}
   1.370 +
   1.371 +//----------------------------------------------------------------
   1.372 +// Public API
   1.373 +//----------------------------------------------------------------
   1.374 +
   1.375 +/**
   1.376 + * Returns the number of elements in this set (its cardinality),
   1.377 + * Note than the elements of a set may include both individual
   1.378 + * codepoints and strings.
   1.379 + *
   1.380 + * @return the number of elements in this set (its cardinality).
   1.381 + */
   1.382 +int32_t UnicodeSet::size(void) const {
   1.383 +    int32_t n = 0;
   1.384 +    int32_t count = getRangeCount();
   1.385 +    for (int32_t i = 0; i < count; ++i) {
   1.386 +        n += getRangeEnd(i) - getRangeStart(i) + 1;
   1.387 +    }
   1.388 +    return n + strings->size();
   1.389 +}
   1.390 +
   1.391 +/**
   1.392 + * Returns <tt>true</tt> if this set contains no elements.
   1.393 + *
   1.394 + * @return <tt>true</tt> if this set contains no elements.
   1.395 + */
   1.396 +UBool UnicodeSet::isEmpty(void) const {
   1.397 +    return len == 1 && strings->size() == 0;
   1.398 +}
   1.399 +
   1.400 +/**
   1.401 + * Returns true if this set contains the given character.
   1.402 + * @param c character to be checked for containment
   1.403 + * @return true if the test condition is met
   1.404 + */
   1.405 +UBool UnicodeSet::contains(UChar32 c) const {
   1.406 +    // Set i to the index of the start item greater than ch
   1.407 +    // We know we will terminate without length test!
   1.408 +    // LATER: for large sets, add binary search
   1.409 +    //int32_t i = -1;
   1.410 +    //for (;;) {
   1.411 +    //    if (c < list[++i]) break;
   1.412 +    //}
   1.413 +    if (bmpSet != NULL) {
   1.414 +        return bmpSet->contains(c);
   1.415 +    }
   1.416 +    if (stringSpan != NULL) {
   1.417 +        return stringSpan->contains(c);
   1.418 +    }
   1.419 +    if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
   1.420 +        return FALSE;
   1.421 +    }
   1.422 +    int32_t i = findCodePoint(c);
   1.423 +    return (UBool)(i & 1); // return true if odd
   1.424 +}
   1.425 +
   1.426 +/**
   1.427 + * Returns the smallest value i such that c < list[i].  Caller
   1.428 + * must ensure that c is a legal value or this method will enter
   1.429 + * an infinite loop.  This method performs a binary search.
   1.430 + * @param c a character in the range MIN_VALUE..MAX_VALUE
   1.431 + * inclusive
   1.432 + * @return the smallest integer i in the range 0..len-1,
   1.433 + * inclusive, such that c < list[i]
   1.434 + */
   1.435 +int32_t UnicodeSet::findCodePoint(UChar32 c) const {
   1.436 +    /* Examples:
   1.437 +                                       findCodePoint(c)
   1.438 +       set              list[]         c=0 1 3 4 7 8
   1.439 +       ===              ==============   ===========
   1.440 +       []               [110000]         0 0 0 0 0 0
   1.441 +       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
   1.442 +       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
   1.443 +       [:Any:]          [0, 110000]      1 1 1 1 1 1
   1.444 +     */
   1.445 +
   1.446 +    // Return the smallest i such that c < list[i].  Assume
   1.447 +    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
   1.448 +    if (c < list[0])
   1.449 +        return 0;
   1.450 +    // High runner test.  c is often after the last range, so an
   1.451 +    // initial check for this condition pays off.
   1.452 +    int32_t lo = 0;
   1.453 +    int32_t hi = len - 1;
   1.454 +    if (lo >= hi || c >= list[hi-1])
   1.455 +        return hi;
   1.456 +    // invariant: c >= list[lo]
   1.457 +    // invariant: c < list[hi]
   1.458 +    for (;;) {
   1.459 +        int32_t i = (lo + hi) >> 1;
   1.460 +        if (i == lo) {
   1.461 +            break; // Found!
   1.462 +        } else if (c < list[i]) {
   1.463 +            hi = i;
   1.464 +        } else {
   1.465 +            lo = i;
   1.466 +        }
   1.467 +    }
   1.468 +    return hi;
   1.469 +}
   1.470 +
   1.471 +/**
   1.472 + * Returns true if this set contains every character
   1.473 + * of the given range.
   1.474 + * @param start first character, inclusive, of the range
   1.475 + * @param end last character, inclusive, of the range
   1.476 + * @return true if the test condition is met
   1.477 + */
   1.478 +UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
   1.479 +    //int32_t i = -1;
   1.480 +    //for (;;) {
   1.481 +    //    if (start < list[++i]) break;
   1.482 +    //}
   1.483 +    int32_t i = findCodePoint(start);
   1.484 +    return ((i & 1) != 0 && end < list[i]);
   1.485 +}
   1.486 +
   1.487 +/**
   1.488 + * Returns <tt>true</tt> if this set contains the given
   1.489 + * multicharacter string.
   1.490 + * @param s string to be checked for containment
   1.491 + * @return <tt>true</tt> if this set contains the specified string
   1.492 + */
   1.493 +UBool UnicodeSet::contains(const UnicodeString& s) const {
   1.494 +    if (s.length() == 0) return FALSE;
   1.495 +    int32_t cp = getSingleCP(s);
   1.496 +    if (cp < 0) {
   1.497 +        return strings->contains((void*) &s);
   1.498 +    } else {
   1.499 +        return contains((UChar32) cp);
   1.500 +    }
   1.501 +}
   1.502 +
   1.503 +/**
   1.504 + * Returns true if this set contains all the characters and strings
   1.505 + * of the given set.
   1.506 + * @param c set to be checked for containment
   1.507 + * @return true if the test condition is met
   1.508 + */
   1.509 +UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
   1.510 +    // The specified set is a subset if all of its pairs are contained in
   1.511 +    // this set.  It's possible to code this more efficiently in terms of
   1.512 +    // direct manipulation of the inversion lists if the need arises.
   1.513 +    int32_t n = c.getRangeCount();
   1.514 +    for (int i=0; i<n; ++i) {
   1.515 +        if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
   1.516 +            return FALSE;
   1.517 +        }
   1.518 +    }
   1.519 +    if (!strings->containsAll(*c.strings)) return FALSE;
   1.520 +    return TRUE;
   1.521 +}
   1.522 +
   1.523 +/**
   1.524 + * Returns true if this set contains all the characters
   1.525 + * of the given string.
   1.526 + * @param s string containing characters to be checked for containment
   1.527 + * @return true if the test condition is met
   1.528 + */
   1.529 +UBool UnicodeSet::containsAll(const UnicodeString& s) const {
   1.530 +    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
   1.531 +                   s.length());
   1.532 +}
   1.533 +
   1.534 +/**
   1.535 + * Returns true if this set contains none of the characters
   1.536 + * of the given range.
   1.537 + * @param start first character, inclusive, of the range
   1.538 + * @param end last character, inclusive, of the range
   1.539 + * @return true if the test condition is met
   1.540 + */
   1.541 +UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
   1.542 +    //int32_t i = -1;
   1.543 +    //for (;;) {
   1.544 +    //    if (start < list[++i]) break;
   1.545 +    //}
   1.546 +    int32_t i = findCodePoint(start);
   1.547 +    return ((i & 1) == 0 && end < list[i]);
   1.548 +}
   1.549 +
   1.550 +/**
   1.551 + * Returns true if this set contains none of the characters and strings
   1.552 + * of the given set.
   1.553 + * @param c set to be checked for containment
   1.554 + * @return true if the test condition is met
   1.555 + */
   1.556 +UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
   1.557 +    // The specified set is a subset if all of its pairs are contained in
   1.558 +    // this set.  It's possible to code this more efficiently in terms of
   1.559 +    // direct manipulation of the inversion lists if the need arises.
   1.560 +    int32_t n = c.getRangeCount();
   1.561 +    for (int32_t i=0; i<n; ++i) {
   1.562 +        if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
   1.563 +            return FALSE;
   1.564 +        }
   1.565 +    }
   1.566 +    if (!strings->containsNone(*c.strings)) return FALSE;
   1.567 +    return TRUE;
   1.568 +}
   1.569 +
   1.570 +/**
   1.571 + * Returns true if this set contains none of the characters
   1.572 + * of the given string.
   1.573 + * @param s string containing characters to be checked for containment
   1.574 + * @return true if the test condition is met
   1.575 + */
   1.576 +UBool UnicodeSet::containsNone(const UnicodeString& s) const {
   1.577 +    return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
   1.578 +                   s.length());
   1.579 +}
   1.580 +
   1.581 +/**
   1.582 + * Returns <tt>true</tt> if this set contains any character whose low byte
   1.583 + * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
   1.584 + * indexing.
   1.585 + */
   1.586 +UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
   1.587 +    /* The index value v, in the range [0,255], is contained in this set if
   1.588 +     * it is contained in any pair of this set.  Pairs either have the high
   1.589 +     * bytes equal, or unequal.  If the high bytes are equal, then we have
   1.590 +     * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
   1.591 +     * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
   1.592 +     * Then v is contained if xx <= v || v <= yy.  (This is identical to the
   1.593 +     * time zone month containment logic.)
   1.594 +     */
   1.595 +    int32_t i;
   1.596 +    int32_t rangeCount=getRangeCount();
   1.597 +    for (i=0; i<rangeCount; ++i) {
   1.598 +        UChar32 low = getRangeStart(i);
   1.599 +        UChar32 high = getRangeEnd(i);
   1.600 +        if ((low & ~0xFF) == (high & ~0xFF)) {
   1.601 +            if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
   1.602 +                return TRUE;
   1.603 +            }
   1.604 +        } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
   1.605 +            return TRUE;
   1.606 +        }
   1.607 +    }
   1.608 +    if (strings->size() != 0) {
   1.609 +        for (i=0; i<strings->size(); ++i) {
   1.610 +            const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
   1.611 +            //if (s.length() == 0) {
   1.612 +            //    // Empty strings match everything
   1.613 +            //    return TRUE;
   1.614 +            //}
   1.615 +            // assert(s.length() != 0); // We enforce this elsewhere
   1.616 +            UChar32 c = s.char32At(0);
   1.617 +            if ((c & 0xFF) == v) {
   1.618 +                return TRUE;
   1.619 +            }
   1.620 +        }
   1.621 +    }
   1.622 +    return FALSE;
   1.623 +}
   1.624 +
   1.625 +/**
   1.626 + * Implementation of UnicodeMatcher::matches().  Always matches the
   1.627 + * longest possible multichar string.
   1.628 + */
   1.629 +UMatchDegree UnicodeSet::matches(const Replaceable& text,
   1.630 +                                 int32_t& offset,
   1.631 +                                 int32_t limit,
   1.632 +                                 UBool incremental) {
   1.633 +    if (offset == limit) {
   1.634 +        // Strings, if any, have length != 0, so we don't worry
   1.635 +        // about them here.  If we ever allow zero-length strings
   1.636 +        // we much check for them here.
   1.637 +        if (contains(U_ETHER)) {
   1.638 +            return incremental ? U_PARTIAL_MATCH : U_MATCH;
   1.639 +        } else {
   1.640 +            return U_MISMATCH;
   1.641 +        }
   1.642 +    } else {
   1.643 +        if (strings->size() != 0) { // try strings first
   1.644 +
   1.645 +            // might separate forward and backward loops later
   1.646 +            // for now they are combined
   1.647 +
   1.648 +            // TODO Improve efficiency of this, at least in the forward
   1.649 +            // direction, if not in both.  In the forward direction we
   1.650 +            // can assume the strings are sorted.
   1.651 +
   1.652 +            int32_t i;
   1.653 +            UBool forward = offset < limit;
   1.654 +
   1.655 +            // firstChar is the leftmost char to match in the
   1.656 +            // forward direction or the rightmost char to match in
   1.657 +            // the reverse direction.
   1.658 +            UChar firstChar = text.charAt(offset);
   1.659 +
   1.660 +            // If there are multiple strings that can match we
   1.661 +            // return the longest match.
   1.662 +            int32_t highWaterLength = 0;
   1.663 +
   1.664 +            for (i=0; i<strings->size(); ++i) {
   1.665 +                const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
   1.666 +
   1.667 +                //if (trial.length() == 0) {
   1.668 +                //    return U_MATCH; // null-string always matches
   1.669 +                //}
   1.670 +                // assert(trial.length() != 0); // We ensure this elsewhere
   1.671 +
   1.672 +                UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
   1.673 +
   1.674 +                // Strings are sorted, so we can optimize in the
   1.675 +                // forward direction.
   1.676 +                if (forward && c > firstChar) break;
   1.677 +                if (c != firstChar) continue;
   1.678 +
   1.679 +                int32_t matchLen = matchRest(text, offset, limit, trial);
   1.680 +
   1.681 +                if (incremental) {
   1.682 +                    int32_t maxLen = forward ? limit-offset : offset-limit;
   1.683 +                    if (matchLen == maxLen) {
   1.684 +                        // We have successfully matched but only up to limit.
   1.685 +                        return U_PARTIAL_MATCH;
   1.686 +                    }
   1.687 +                }
   1.688 +
   1.689 +                if (matchLen == trial.length()) {
   1.690 +                    // We have successfully matched the whole string.
   1.691 +                    if (matchLen > highWaterLength) {
   1.692 +                        highWaterLength = matchLen;
   1.693 +                    }
   1.694 +                    // In the forward direction we know strings
   1.695 +                    // are sorted so we can bail early.
   1.696 +                    if (forward && matchLen < highWaterLength) {
   1.697 +                        break;
   1.698 +                    }
   1.699 +                    continue;
   1.700 +                }
   1.701 +            }
   1.702 +
   1.703 +            // We've checked all strings without a partial match.
   1.704 +            // If we have full matches, return the longest one.
   1.705 +            if (highWaterLength != 0) {
   1.706 +                offset += forward ? highWaterLength : -highWaterLength;
   1.707 +                return U_MATCH;
   1.708 +            }
   1.709 +        }
   1.710 +        return UnicodeFilter::matches(text, offset, limit, incremental);
   1.711 +    }
   1.712 +}
   1.713 +
   1.714 +/**
   1.715 + * Returns the longest match for s in text at the given position.
   1.716 + * If limit > start then match forward from start+1 to limit
   1.717 + * matching all characters except s.charAt(0).  If limit < start,
   1.718 + * go backward starting from start-1 matching all characters
   1.719 + * except s.charAt(s.length()-1).  This method assumes that the
   1.720 + * first character, text.charAt(start), matches s, so it does not
   1.721 + * check it.
   1.722 + * @param text the text to match
   1.723 + * @param start the first character to match.  In the forward
   1.724 + * direction, text.charAt(start) is matched against s.charAt(0).
   1.725 + * In the reverse direction, it is matched against
   1.726 + * s.charAt(s.length()-1).
   1.727 + * @param limit the limit offset for matching, either last+1 in
   1.728 + * the forward direction, or last-1 in the reverse direction,
   1.729 + * where last is the index of the last character to match.
   1.730 + * @return If part of s matches up to the limit, return |limit -
   1.731 + * start|.  If all of s matches before reaching the limit, return
   1.732 + * s.length().  If there is a mismatch between s and text, return
   1.733 + * 0
   1.734 + */
   1.735 +int32_t UnicodeSet::matchRest(const Replaceable& text,
   1.736 +                              int32_t start, int32_t limit,
   1.737 +                              const UnicodeString& s) {
   1.738 +    int32_t i;
   1.739 +    int32_t maxLen;
   1.740 +    int32_t slen = s.length();
   1.741 +    if (start < limit) {
   1.742 +        maxLen = limit - start;
   1.743 +        if (maxLen > slen) maxLen = slen;
   1.744 +        for (i = 1; i < maxLen; ++i) {
   1.745 +            if (text.charAt(start + i) != s.charAt(i)) return 0;
   1.746 +        }
   1.747 +    } else {
   1.748 +        maxLen = start - limit;
   1.749 +        if (maxLen > slen) maxLen = slen;
   1.750 +        --slen; // <=> slen = s.length() - 1;
   1.751 +        for (i = 1; i < maxLen; ++i) {
   1.752 +            if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
   1.753 +        }
   1.754 +    }
   1.755 +    return maxLen;
   1.756 +}
   1.757 +
   1.758 +/**
   1.759 + * Implement of UnicodeMatcher
   1.760 + */
   1.761 +void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
   1.762 +    toUnionTo.addAll(*this);
   1.763 +}
   1.764 +
   1.765 +/**
   1.766 + * Returns the index of the given character within this set, where
   1.767 + * the set is ordered by ascending code point.  If the character
   1.768 + * is not in this set, return -1.  The inverse of this method is
   1.769 + * <code>charAt()</code>.
   1.770 + * @return an index from 0..size()-1, or -1
   1.771 + */
   1.772 +int32_t UnicodeSet::indexOf(UChar32 c) const {
   1.773 +    if (c < MIN_VALUE || c > MAX_VALUE) {
   1.774 +        return -1;
   1.775 +    }
   1.776 +    int32_t i = 0;
   1.777 +    int32_t n = 0;
   1.778 +    for (;;) {
   1.779 +        UChar32 start = list[i++];
   1.780 +        if (c < start) {
   1.781 +            return -1;
   1.782 +        }
   1.783 +        UChar32 limit = list[i++];
   1.784 +        if (c < limit) {
   1.785 +            return n + c - start;
   1.786 +        }
   1.787 +        n += limit - start;
   1.788 +    }
   1.789 +}
   1.790 +
   1.791 +/**
   1.792 + * Returns the character at the given index within this set, where
   1.793 + * the set is ordered by ascending code point.  If the index is
   1.794 + * out of range, return (UChar32)-1.  The inverse of this method is
   1.795 + * <code>indexOf()</code>.
   1.796 + * @param index an index from 0..size()-1
   1.797 + * @return the character at the given index, or (UChar32)-1.
   1.798 + */
   1.799 +UChar32 UnicodeSet::charAt(int32_t index) const {
   1.800 +    if (index >= 0) {
   1.801 +        // len2 is the largest even integer <= len, that is, it is len
   1.802 +        // for even values and len-1 for odd values.  With odd values
   1.803 +        // the last entry is UNICODESET_HIGH.
   1.804 +        int32_t len2 = len & ~1;
   1.805 +        for (int32_t i=0; i < len2;) {
   1.806 +            UChar32 start = list[i++];
   1.807 +            int32_t count = list[i++] - start;
   1.808 +            if (index < count) {
   1.809 +                return (UChar32)(start + index);
   1.810 +            }
   1.811 +            index -= count;
   1.812 +        }
   1.813 +    }
   1.814 +    return (UChar32)-1;
   1.815 +}
   1.816 +
   1.817 +/**
   1.818 + * Make this object represent the range <code>start - end</code>.
   1.819 + * If <code>end > start</code> then this object is set to an
   1.820 + * an empty range.
   1.821 + *
   1.822 + * @param start first character in the set, inclusive
   1.823 + * @rparam end last character in the set, inclusive
   1.824 + */
   1.825 +UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
   1.826 +    clear();
   1.827 +    complement(start, end);
   1.828 +    return *this;
   1.829 +}
   1.830 +
   1.831 +/**
   1.832 + * Adds the specified range to this set if it is not already
   1.833 + * present.  If this set already contains the specified range,
   1.834 + * the call leaves this set unchanged.  If <code>end > start</code>
   1.835 + * then an empty range is added, leaving the set unchanged.
   1.836 + *
   1.837 + * @param start first character, inclusive, of range to be added
   1.838 + * to this set.
   1.839 + * @param end last character, inclusive, of range to be added
   1.840 + * to this set.
   1.841 + */
   1.842 +UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
   1.843 +    if (pinCodePoint(start) < pinCodePoint(end)) {
   1.844 +        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
   1.845 +        add(range, 2, 0);
   1.846 +    } else if (start == end) {
   1.847 +        add(start);
   1.848 +    }
   1.849 +    return *this;
   1.850 +}
   1.851 +
   1.852 +// #define DEBUG_US_ADD
   1.853 +
   1.854 +#ifdef DEBUG_US_ADD
   1.855 +#include <stdio.h>
   1.856 +void dump(UChar32 c) {
   1.857 +    if (c <= 0xFF) {
   1.858 +        printf("%c", (char)c);
   1.859 +    } else {
   1.860 +        printf("U+%04X", c);
   1.861 +    }
   1.862 +}
   1.863 +void dump(const UChar32* list, int32_t len) {
   1.864 +    printf("[");
   1.865 +    for (int32_t i=0; i<len; ++i) {
   1.866 +        if (i != 0) printf(", ");
   1.867 +        dump(list[i]);
   1.868 +    }
   1.869 +    printf("]");
   1.870 +}
   1.871 +#endif
   1.872 +
   1.873 +/**
   1.874 + * Adds the specified character to this set if it is not already
   1.875 + * present.  If this set already contains the specified character,
   1.876 + * the call leaves this set unchanged.
   1.877 + */
   1.878 +UnicodeSet& UnicodeSet::add(UChar32 c) {
   1.879 +    // find smallest i such that c < list[i]
   1.880 +    // if odd, then it is IN the set
   1.881 +    // if even, then it is OUT of the set
   1.882 +    int32_t i = findCodePoint(pinCodePoint(c));
   1.883 +
   1.884 +    // already in set?
   1.885 +    if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
   1.886 +
   1.887 +    // HIGH is 0x110000
   1.888 +    // assert(list[len-1] == HIGH);
   1.889 +
   1.890 +    // empty = [HIGH]
   1.891 +    // [start_0, limit_0, start_1, limit_1, HIGH]
   1.892 +
   1.893 +    // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
   1.894 +    //                             ^
   1.895 +    //                             list[i]
   1.896 +
   1.897 +    // i == 0 means c is before the first range
   1.898 +
   1.899 +#ifdef DEBUG_US_ADD
   1.900 +    printf("Add of ");
   1.901 +    dump(c);
   1.902 +    printf(" found at %d", i);
   1.903 +    printf(": ");
   1.904 +    dump(list, len);
   1.905 +    printf(" => ");
   1.906 +#endif
   1.907 +
   1.908 +    if (c == list[i]-1) {
   1.909 +        // c is before start of next range
   1.910 +        list[i] = c;
   1.911 +        // if we touched the HIGH mark, then add a new one
   1.912 +        if (c == (UNICODESET_HIGH - 1)) {
   1.913 +            UErrorCode status = U_ZERO_ERROR;
   1.914 +            ensureCapacity(len+1, status);
   1.915 +            if (U_FAILURE(status)) {
   1.916 +                return *this; // There is no way to report this error :-(
   1.917 +            }
   1.918 +            list[len++] = UNICODESET_HIGH;
   1.919 +        }
   1.920 +        if (i > 0 && c == list[i-1]) {
   1.921 +            // collapse adjacent ranges
   1.922 +
   1.923 +            // [..., start_k-1, c, c, limit_k, ..., HIGH]
   1.924 +            //                     ^
   1.925 +            //                     list[i]
   1.926 +
   1.927 +            //for (int32_t k=i-1; k<len-2; ++k) {
   1.928 +            //    list[k] = list[k+2];
   1.929 +            //}
   1.930 +            UChar32* dst = list + i - 1;
   1.931 +            UChar32* src = dst + 2;
   1.932 +            UChar32* srclimit = list + len;
   1.933 +            while (src < srclimit) *(dst++) = *(src++);
   1.934 +
   1.935 +            len -= 2;
   1.936 +        }
   1.937 +    }
   1.938 +
   1.939 +    else if (i > 0 && c == list[i-1]) {
   1.940 +        // c is after end of prior range
   1.941 +        list[i-1]++;
   1.942 +        // no need to check for collapse here
   1.943 +    }
   1.944 +
   1.945 +    else {
   1.946 +        // At this point we know the new char is not adjacent to
   1.947 +        // any existing ranges, and it is not 10FFFF.
   1.948 +
   1.949 +
   1.950 +        // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
   1.951 +        //                             ^
   1.952 +        //                             list[i]
   1.953 +
   1.954 +        // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
   1.955 +        //                             ^
   1.956 +        //                             list[i]
   1.957 +
   1.958 +        UErrorCode status = U_ZERO_ERROR;
   1.959 +        ensureCapacity(len+2, status);
   1.960 +        if (U_FAILURE(status)) {
   1.961 +            return *this; // There is no way to report this error :-(
   1.962 +        }
   1.963 +
   1.964 +        //for (int32_t k=len-1; k>=i; --k) {
   1.965 +        //    list[k+2] = list[k];
   1.966 +        //}
   1.967 +        UChar32* src = list + len;
   1.968 +        UChar32* dst = src + 2;
   1.969 +        UChar32* srclimit = list + i;
   1.970 +        while (src > srclimit) *(--dst) = *(--src);
   1.971 +
   1.972 +        list[i] = c;
   1.973 +        list[i+1] = c+1;
   1.974 +        len += 2;
   1.975 +    }
   1.976 +
   1.977 +#ifdef DEBUG_US_ADD
   1.978 +    dump(list, len);
   1.979 +    printf("\n");
   1.980 +
   1.981 +    for (i=1; i<len; ++i) {
   1.982 +        if (list[i] <= list[i-1]) {
   1.983 +            // Corrupt array!
   1.984 +            printf("ERROR: list has been corrupted\n");
   1.985 +            exit(1);
   1.986 +        }
   1.987 +    }
   1.988 +#endif
   1.989 +
   1.990 +    releasePattern();
   1.991 +    return *this;
   1.992 +}
   1.993 +
   1.994 +/**
   1.995 + * Adds the specified multicharacter to this set if it is not already
   1.996 + * present.  If this set already contains the multicharacter,
   1.997 + * the call leaves this set unchanged.
   1.998 + * Thus "ch" => {"ch"}
   1.999 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1000 + * @param s the source string
  1.1001 + * @return the modified set, for chaining
  1.1002 + */
  1.1003 +UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
  1.1004 +    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
  1.1005 +    int32_t cp = getSingleCP(s);
  1.1006 +    if (cp < 0) {
  1.1007 +        if (!strings->contains((void*) &s)) {
  1.1008 +            _add(s);
  1.1009 +            releasePattern();
  1.1010 +        }
  1.1011 +    } else {
  1.1012 +        add((UChar32)cp);
  1.1013 +    }
  1.1014 +    return *this;
  1.1015 +}
  1.1016 +
  1.1017 +/**
  1.1018 + * Adds the given string, in order, to 'strings'.  The given string
  1.1019 + * must have been checked by the caller to not be empty and to not
  1.1020 + * already be in 'strings'.
  1.1021 + */
  1.1022 +void UnicodeSet::_add(const UnicodeString& s) {
  1.1023 +    if (isFrozen() || isBogus()) {
  1.1024 +        return;
  1.1025 +    }
  1.1026 +    UnicodeString* t = new UnicodeString(s);
  1.1027 +    if (t == NULL) { // Check for memory allocation error.
  1.1028 +        setToBogus();
  1.1029 +        return;
  1.1030 +    }
  1.1031 +    UErrorCode ec = U_ZERO_ERROR;
  1.1032 +    strings->sortedInsert(t, compareUnicodeString, ec);
  1.1033 +    if (U_FAILURE(ec)) {
  1.1034 +        setToBogus();
  1.1035 +        delete t;
  1.1036 +    }
  1.1037 +}
  1.1038 +
  1.1039 +/**
  1.1040 + * @return a code point IF the string consists of a single one.
  1.1041 + * otherwise returns -1.
  1.1042 + * @param string to test
  1.1043 + */
  1.1044 +int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
  1.1045 +    //if (s.length() < 1) {
  1.1046 +    //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
  1.1047 +    //}
  1.1048 +    if (s.length() > 2) return -1;
  1.1049 +    if (s.length() == 1) return s.charAt(0);
  1.1050 +
  1.1051 +    // at this point, len = 2
  1.1052 +    UChar32 cp = s.char32At(0);
  1.1053 +    if (cp > 0xFFFF) { // is surrogate pair
  1.1054 +        return cp;
  1.1055 +    }
  1.1056 +    return -1;
  1.1057 +}
  1.1058 +
  1.1059 +/**
  1.1060 + * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
  1.1061 + * If this set already any particular character, it has no effect on that character.
  1.1062 + * @param the source string
  1.1063 + * @return the modified set, for chaining
  1.1064 + */
  1.1065 +UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
  1.1066 +    UChar32 cp;
  1.1067 +    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
  1.1068 +        cp = s.char32At(i);
  1.1069 +        add(cp);
  1.1070 +    }
  1.1071 +    return *this;
  1.1072 +}
  1.1073 +
  1.1074 +/**
  1.1075 + * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1076 + * If this set already any particular character, it has no effect on that character.
  1.1077 + * @param the source string
  1.1078 + * @return the modified set, for chaining
  1.1079 + */
  1.1080 +UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
  1.1081 +    UnicodeSet set;
  1.1082 +    set.addAll(s);
  1.1083 +    retainAll(set);
  1.1084 +    return *this;
  1.1085 +}
  1.1086 +
  1.1087 +/**
  1.1088 + * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1089 + * If this set already any particular character, it has no effect on that character.
  1.1090 + * @param the source string
  1.1091 + * @return the modified set, for chaining
  1.1092 + */
  1.1093 +UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
  1.1094 +    UnicodeSet set;
  1.1095 +    set.addAll(s);
  1.1096 +    complementAll(set);
  1.1097 +    return *this;
  1.1098 +}
  1.1099 +
  1.1100 +/**
  1.1101 + * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
  1.1102 + * If this set already any particular character, it has no effect on that character.
  1.1103 + * @param the source string
  1.1104 + * @return the modified set, for chaining
  1.1105 + */
  1.1106 +UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
  1.1107 +    UnicodeSet set;
  1.1108 +    set.addAll(s);
  1.1109 +    removeAll(set);
  1.1110 +    return *this;
  1.1111 +}
  1.1112 +
  1.1113 +UnicodeSet& UnicodeSet::removeAllStrings() {
  1.1114 +    strings->removeAllElements();
  1.1115 +    return *this;
  1.1116 +}
  1.1117 +
  1.1118 +
  1.1119 +/**
  1.1120 + * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
  1.1121 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1122 + * @param the source string
  1.1123 + * @return a newly created set containing the given string
  1.1124 + */
  1.1125 +UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
  1.1126 +    UnicodeSet *set = new UnicodeSet();
  1.1127 +    if (set != NULL) { // Check for memory allocation error.
  1.1128 +        set->add(s);
  1.1129 +    }
  1.1130 +    return set;
  1.1131 +}
  1.1132 +
  1.1133 +
  1.1134 +/**
  1.1135 + * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
  1.1136 + * @param the source string
  1.1137 + * @return a newly created set containing the given characters
  1.1138 + */
  1.1139 +UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
  1.1140 +    UnicodeSet *set = new UnicodeSet();
  1.1141 +    if (set != NULL) { // Check for memory allocation error.
  1.1142 +        set->addAll(s);
  1.1143 +    }
  1.1144 +    return set;
  1.1145 +}
  1.1146 +
  1.1147 +/**
  1.1148 + * Retain only the elements in this set that are contained in the
  1.1149 + * specified range.  If <code>end > start</code> then an empty range is
  1.1150 + * retained, leaving the set empty.
  1.1151 + *
  1.1152 + * @param start first character, inclusive, of range to be retained
  1.1153 + * to this set.
  1.1154 + * @param end last character, inclusive, of range to be retained
  1.1155 + * to this set.
  1.1156 + */
  1.1157 +UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
  1.1158 +    if (pinCodePoint(start) <= pinCodePoint(end)) {
  1.1159 +        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
  1.1160 +        retain(range, 2, 0);
  1.1161 +    } else {
  1.1162 +        clear();
  1.1163 +    }
  1.1164 +    return *this;
  1.1165 +}
  1.1166 +
  1.1167 +UnicodeSet& UnicodeSet::retain(UChar32 c) {
  1.1168 +    return retain(c, c);
  1.1169 +}
  1.1170 +
  1.1171 +/**
  1.1172 + * Removes the specified range from this set if it is present.
  1.1173 + * The set will not contain the specified range once the call
  1.1174 + * returns.  If <code>end > start</code> then an empty range is
  1.1175 + * removed, leaving the set unchanged.
  1.1176 + *
  1.1177 + * @param start first character, inclusive, of range to be removed
  1.1178 + * from this set.
  1.1179 + * @param end last character, inclusive, of range to be removed
  1.1180 + * from this set.
  1.1181 + */
  1.1182 +UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
  1.1183 +    if (pinCodePoint(start) <= pinCodePoint(end)) {
  1.1184 +        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
  1.1185 +        retain(range, 2, 2);
  1.1186 +    }
  1.1187 +    return *this;
  1.1188 +}
  1.1189 +
  1.1190 +/**
  1.1191 + * Removes the specified character from this set if it is present.
  1.1192 + * The set will not contain the specified range once the call
  1.1193 + * returns.
  1.1194 + */
  1.1195 +UnicodeSet& UnicodeSet::remove(UChar32 c) {
  1.1196 +    return remove(c, c);
  1.1197 +}
  1.1198 +
  1.1199 +/**
  1.1200 + * Removes the specified string from this set if it is present.
  1.1201 + * The set will not contain the specified character once the call
  1.1202 + * returns.
  1.1203 + * @param the source string
  1.1204 + * @return the modified set, for chaining
  1.1205 + */
  1.1206 +UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
  1.1207 +    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
  1.1208 +    int32_t cp = getSingleCP(s);
  1.1209 +    if (cp < 0) {
  1.1210 +        strings->removeElement((void*) &s);
  1.1211 +        releasePattern();
  1.1212 +    } else {
  1.1213 +        remove((UChar32)cp, (UChar32)cp);
  1.1214 +    }
  1.1215 +    return *this;
  1.1216 +}
  1.1217 +
  1.1218 +/**
  1.1219 + * Complements the specified range in this set.  Any character in
  1.1220 + * the range will be removed if it is in this set, or will be
  1.1221 + * added if it is not in this set.  If <code>end > start</code>
  1.1222 + * then an empty range is xor'ed, leaving the set unchanged.
  1.1223 + *
  1.1224 + * @param start first character, inclusive, of range to be removed
  1.1225 + * from this set.
  1.1226 + * @param end last character, inclusive, of range to be removed
  1.1227 + * from this set.
  1.1228 + */
  1.1229 +UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
  1.1230 +    if (isFrozen() || isBogus()) {
  1.1231 +        return *this;
  1.1232 +    }
  1.1233 +    if (pinCodePoint(start) <= pinCodePoint(end)) {
  1.1234 +        UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
  1.1235 +        exclusiveOr(range, 2, 0);
  1.1236 +    }
  1.1237 +    releasePattern();
  1.1238 +    return *this;
  1.1239 +}
  1.1240 +
  1.1241 +UnicodeSet& UnicodeSet::complement(UChar32 c) {
  1.1242 +    return complement(c, c);
  1.1243 +}
  1.1244 +
  1.1245 +/**
  1.1246 + * This is equivalent to
  1.1247 + * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
  1.1248 + */
  1.1249 +UnicodeSet& UnicodeSet::complement(void) {
  1.1250 +    if (isFrozen() || isBogus()) {
  1.1251 +        return *this;
  1.1252 +    }
  1.1253 +    UErrorCode status = U_ZERO_ERROR;
  1.1254 +    if (list[0] == UNICODESET_LOW) {
  1.1255 +        ensureBufferCapacity(len-1, status);
  1.1256 +        if (U_FAILURE(status)) {
  1.1257 +            return *this;
  1.1258 +        }
  1.1259 +        uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
  1.1260 +        --len;
  1.1261 +    } else {
  1.1262 +        ensureBufferCapacity(len+1, status);
  1.1263 +        if (U_FAILURE(status)) {
  1.1264 +            return *this;
  1.1265 +        }
  1.1266 +        uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
  1.1267 +        buffer[0] = UNICODESET_LOW;
  1.1268 +        ++len;
  1.1269 +    }
  1.1270 +    swapBuffers();
  1.1271 +    releasePattern();
  1.1272 +    return *this;
  1.1273 +}
  1.1274 +
  1.1275 +/**
  1.1276 + * Complement the specified string in this set.
  1.1277 + * The set will not contain the specified string once the call
  1.1278 + * returns.
  1.1279 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
  1.1280 + * @param s the string to complement
  1.1281 + * @return this object, for chaining
  1.1282 + */
  1.1283 +UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
  1.1284 +    if (s.length() == 0 || isFrozen() || isBogus()) return *this;
  1.1285 +    int32_t cp = getSingleCP(s);
  1.1286 +    if (cp < 0) {
  1.1287 +        if (strings->contains((void*) &s)) {
  1.1288 +            strings->removeElement((void*) &s);
  1.1289 +        } else {
  1.1290 +            _add(s);
  1.1291 +        }
  1.1292 +        releasePattern();
  1.1293 +    } else {
  1.1294 +        complement((UChar32)cp, (UChar32)cp);
  1.1295 +    }
  1.1296 +    return *this;
  1.1297 +}
  1.1298 +
  1.1299 +/**
  1.1300 + * Adds all of the elements in the specified set to this set if
  1.1301 + * they're not already present.  This operation effectively
  1.1302 + * modifies this set so that its value is the <i>union</i> of the two
  1.1303 + * sets.  The behavior of this operation is unspecified if the specified
  1.1304 + * collection is modified while the operation is in progress.
  1.1305 + *
  1.1306 + * @param c set whose elements are to be added to this set.
  1.1307 + * @see #add(char, char)
  1.1308 + */
  1.1309 +UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
  1.1310 +    if ( c.len>0 && c.list!=NULL ) {
  1.1311 +        add(c.list, c.len, 0);
  1.1312 +    }
  1.1313 +
  1.1314 +    // Add strings in order
  1.1315 +    if ( c.strings!=NULL ) {
  1.1316 +        for (int32_t i=0; i<c.strings->size(); ++i) {
  1.1317 +            const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
  1.1318 +            if (!strings->contains((void*) s)) {
  1.1319 +                _add(*s);
  1.1320 +            }
  1.1321 +        }
  1.1322 +    }
  1.1323 +    return *this;
  1.1324 +}
  1.1325 +
  1.1326 +/**
  1.1327 + * Retains only the elements in this set that are contained in the
  1.1328 + * specified set.  In other words, removes from this set all of
  1.1329 + * its elements that are not contained in the specified set.  This
  1.1330 + * operation effectively modifies this set so that its value is
  1.1331 + * the <i>intersection</i> of the two sets.
  1.1332 + *
  1.1333 + * @param c set that defines which elements this set will retain.
  1.1334 + */
  1.1335 +UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
  1.1336 +    if (isFrozen() || isBogus()) {
  1.1337 +        return *this;
  1.1338 +    }
  1.1339 +    retain(c.list, c.len, 0);
  1.1340 +    strings->retainAll(*c.strings);
  1.1341 +    return *this;
  1.1342 +}
  1.1343 +
  1.1344 +/**
  1.1345 + * Removes from this set all of its elements that are contained in the
  1.1346 + * specified set.  This operation effectively modifies this
  1.1347 + * set so that its value is the <i>asymmetric set difference</i> of
  1.1348 + * the two sets.
  1.1349 + *
  1.1350 + * @param c set that defines which elements will be removed from
  1.1351 + *          this set.
  1.1352 + */
  1.1353 +UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
  1.1354 +    if (isFrozen() || isBogus()) {
  1.1355 +        return *this;
  1.1356 +    }
  1.1357 +    retain(c.list, c.len, 2);
  1.1358 +    strings->removeAll(*c.strings);
  1.1359 +    return *this;
  1.1360 +}
  1.1361 +
  1.1362 +/**
  1.1363 + * Complements in this set all elements contained in the specified
  1.1364 + * set.  Any character in the other set will be removed if it is
  1.1365 + * in this set, or will be added if it is not in this set.
  1.1366 + *
  1.1367 + * @param c set that defines which elements will be xor'ed from
  1.1368 + *          this set.
  1.1369 + */
  1.1370 +UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
  1.1371 +    if (isFrozen() || isBogus()) {
  1.1372 +        return *this;
  1.1373 +    }
  1.1374 +    exclusiveOr(c.list, c.len, 0);
  1.1375 +
  1.1376 +    for (int32_t i=0; i<c.strings->size(); ++i) {
  1.1377 +        void* e = c.strings->elementAt(i);
  1.1378 +        if (!strings->removeElement(e)) {
  1.1379 +            _add(*(const UnicodeString*)e);
  1.1380 +        }
  1.1381 +    }
  1.1382 +    return *this;
  1.1383 +}
  1.1384 +
  1.1385 +/**
  1.1386 + * Removes all of the elements from this set.  This set will be
  1.1387 + * empty after this call returns.
  1.1388 + */
  1.1389 +UnicodeSet& UnicodeSet::clear(void) {
  1.1390 +    if (isFrozen()) {
  1.1391 +        return *this;
  1.1392 +    }
  1.1393 +    if (list != NULL) {
  1.1394 +        list[0] = UNICODESET_HIGH;
  1.1395 +    }
  1.1396 +    len = 1;
  1.1397 +    releasePattern();
  1.1398 +    if (strings != NULL) {
  1.1399 +        strings->removeAllElements();
  1.1400 +    }
  1.1401 +    if (list != NULL && strings != NULL) {
  1.1402 +        // Remove bogus
  1.1403 +        fFlags = 0;
  1.1404 +    }
  1.1405 +    return *this;
  1.1406 +}
  1.1407 +
  1.1408 +/**
  1.1409 + * Iteration method that returns the number of ranges contained in
  1.1410 + * this set.
  1.1411 + * @see #getRangeStart
  1.1412 + * @see #getRangeEnd
  1.1413 + */
  1.1414 +int32_t UnicodeSet::getRangeCount() const {
  1.1415 +    return len/2;
  1.1416 +}
  1.1417 +
  1.1418 +/**
  1.1419 + * Iteration method that returns the first character in the
  1.1420 + * specified range of this set.
  1.1421 + * @see #getRangeCount
  1.1422 + * @see #getRangeEnd
  1.1423 + */
  1.1424 +UChar32 UnicodeSet::getRangeStart(int32_t index) const {
  1.1425 +    return list[index*2];
  1.1426 +}
  1.1427 +
  1.1428 +/**
  1.1429 + * Iteration method that returns the last character in the
  1.1430 + * specified range of this set.
  1.1431 + * @see #getRangeStart
  1.1432 + * @see #getRangeEnd
  1.1433 + */
  1.1434 +UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
  1.1435 +    return list[index*2 + 1] - 1;
  1.1436 +}
  1.1437 +
  1.1438 +int32_t UnicodeSet::getStringCount() const {
  1.1439 +    return strings->size();
  1.1440 +}
  1.1441 +
  1.1442 +const UnicodeString* UnicodeSet::getString(int32_t index) const {
  1.1443 +    return (const UnicodeString*) strings->elementAt(index);
  1.1444 +}
  1.1445 +
  1.1446 +/**
  1.1447 + * Reallocate this objects internal structures to take up the least
  1.1448 + * possible space, without changing this object's value.
  1.1449 + */
  1.1450 +UnicodeSet& UnicodeSet::compact() {
  1.1451 +    if (isFrozen() || isBogus()) {
  1.1452 +        return *this;
  1.1453 +    }
  1.1454 +    // Delete buffer first to defragment memory less.
  1.1455 +    if (buffer != NULL) {
  1.1456 +        uprv_free(buffer);
  1.1457 +        buffer = NULL;
  1.1458 +    }
  1.1459 +    if (len < capacity) {
  1.1460 +        // Make the capacity equal to len or 1.
  1.1461 +        // We don't want to realloc of 0 size.
  1.1462 +        int32_t newCapacity = len + (len == 0);
  1.1463 +        UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
  1.1464 +        if (temp) {
  1.1465 +            list = temp;
  1.1466 +            capacity = newCapacity;
  1.1467 +        }
  1.1468 +        // else what the heck happened?! We allocated less memory!
  1.1469 +        // Oh well. We'll keep our original array.
  1.1470 +    }
  1.1471 +    return *this;
  1.1472 +}
  1.1473 +
  1.1474 +int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
  1.1475 +    int32_t bmpLength, length, destLength;
  1.1476 +
  1.1477 +    if (U_FAILURE(ec)) {
  1.1478 +        return 0;
  1.1479 +    }
  1.1480 +
  1.1481 +    if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
  1.1482 +        ec=U_ILLEGAL_ARGUMENT_ERROR;
  1.1483 +        return 0;
  1.1484 +    }
  1.1485 +
  1.1486 +    /* count necessary 16-bit units */
  1.1487 +    length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
  1.1488 +    // assert(length>=0);
  1.1489 +    if (length==0) {
  1.1490 +        /* empty set */
  1.1491 +        if (destCapacity>0) {
  1.1492 +            *dest=0;
  1.1493 +        } else {
  1.1494 +            ec=U_BUFFER_OVERFLOW_ERROR;
  1.1495 +        }
  1.1496 +        return 1;
  1.1497 +    }
  1.1498 +    /* now length>0 */
  1.1499 +
  1.1500 +    if (this->list[length-1]<=0xffff) {
  1.1501 +        /* all BMP */
  1.1502 +        bmpLength=length;
  1.1503 +    } else if (this->list[0]>=0x10000) {
  1.1504 +        /* all supplementary */
  1.1505 +        bmpLength=0;
  1.1506 +        length*=2;
  1.1507 +    } else {
  1.1508 +        /* some BMP, some supplementary */
  1.1509 +        for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
  1.1510 +        length=bmpLength+2*(length-bmpLength);
  1.1511 +    }
  1.1512 +
  1.1513 +    /* length: number of 16-bit array units */
  1.1514 +    if (length>0x7fff) {
  1.1515 +        /* there are only 15 bits for the length in the first serialized word */
  1.1516 +        ec=U_INDEX_OUTOFBOUNDS_ERROR;
  1.1517 +        return 0;
  1.1518 +    }
  1.1519 +
  1.1520 +    /*
  1.1521 +     * total serialized length:
  1.1522 +     * number of 16-bit array units (length) +
  1.1523 +     * 1 length unit (always) +
  1.1524 +     * 1 bmpLength unit (if there are supplementary values)
  1.1525 +     */
  1.1526 +    destLength=length+((length>bmpLength)?2:1);
  1.1527 +    if (destLength<=destCapacity) {
  1.1528 +        const UChar32 *p;
  1.1529 +        int32_t i;
  1.1530 +
  1.1531 +        *dest=(uint16_t)length;
  1.1532 +        if (length>bmpLength) {
  1.1533 +            *dest|=0x8000;
  1.1534 +            *++dest=(uint16_t)bmpLength;
  1.1535 +        }
  1.1536 +        ++dest;
  1.1537 +
  1.1538 +        /* write the BMP part of the array */
  1.1539 +        p=this->list;
  1.1540 +        for (i=0; i<bmpLength; ++i) {
  1.1541 +            *dest++=(uint16_t)*p++;
  1.1542 +        }
  1.1543 +
  1.1544 +        /* write the supplementary part of the array */
  1.1545 +        for (; i<length; i+=2) {
  1.1546 +            *dest++=(uint16_t)(*p>>16);
  1.1547 +            *dest++=(uint16_t)*p++;
  1.1548 +        }
  1.1549 +    } else {
  1.1550 +        ec=U_BUFFER_OVERFLOW_ERROR;
  1.1551 +    }
  1.1552 +    return destLength;
  1.1553 +}
  1.1554 +
  1.1555 +//----------------------------------------------------------------
  1.1556 +// Implementation: Utility methods
  1.1557 +//----------------------------------------------------------------
  1.1558 +
  1.1559 +/**
  1.1560 + * Allocate our strings vector and return TRUE if successful.
  1.1561 + */
  1.1562 +UBool UnicodeSet::allocateStrings(UErrorCode &status) {
  1.1563 +    if (U_FAILURE(status)) {
  1.1564 +        return FALSE;
  1.1565 +    }
  1.1566 +    strings = new UVector(uprv_deleteUObject,
  1.1567 +                          uhash_compareUnicodeString, 1, status);
  1.1568 +    if (strings == NULL) { // Check for memory allocation error.
  1.1569 +        status = U_MEMORY_ALLOCATION_ERROR;
  1.1570 +        return FALSE;
  1.1571 +    }
  1.1572 +    if (U_FAILURE(status)) {
  1.1573 +        delete strings;
  1.1574 +        strings = NULL;
  1.1575 +        return FALSE;
  1.1576 +    } 
  1.1577 +    return TRUE;
  1.1578 +}
  1.1579 +
  1.1580 +void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
  1.1581 +    if (newLen <= capacity)
  1.1582 +        return;
  1.1583 +    UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
  1.1584 +    if (temp == NULL) {
  1.1585 +        ec = U_MEMORY_ALLOCATION_ERROR;
  1.1586 +        setToBogus();
  1.1587 +        return;
  1.1588 +    }
  1.1589 +    list = temp;
  1.1590 +    capacity = newLen + GROW_EXTRA;
  1.1591 +    // else we keep the original contents on the memory failure.
  1.1592 +}
  1.1593 +
  1.1594 +void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
  1.1595 +    if (buffer != NULL && newLen <= bufferCapacity)
  1.1596 +        return;
  1.1597 +    UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
  1.1598 +    if (temp == NULL) {
  1.1599 +        ec = U_MEMORY_ALLOCATION_ERROR;
  1.1600 +        setToBogus();
  1.1601 +        return;
  1.1602 +    }
  1.1603 +    buffer = temp;
  1.1604 +    bufferCapacity = newLen + GROW_EXTRA;
  1.1605 +    // else we keep the original contents on the memory failure.
  1.1606 +}
  1.1607 +
  1.1608 +/**
  1.1609 + * Swap list and buffer.
  1.1610 + */
  1.1611 +void UnicodeSet::swapBuffers(void) {
  1.1612 +    // swap list and buffer
  1.1613 +    UChar32* temp = list;
  1.1614 +    list = buffer;
  1.1615 +    buffer = temp;
  1.1616 +
  1.1617 +    int32_t c = capacity;
  1.1618 +    capacity = bufferCapacity;
  1.1619 +    bufferCapacity = c;
  1.1620 +}
  1.1621 +
  1.1622 +void UnicodeSet::setToBogus() {
  1.1623 +    clear(); // Remove everything in the set.
  1.1624 +    fFlags = kIsBogus;
  1.1625 +}
  1.1626 +
  1.1627 +//----------------------------------------------------------------
  1.1628 +// Implementation: Fundamental operators
  1.1629 +//----------------------------------------------------------------
  1.1630 +
  1.1631 +static inline UChar32 max(UChar32 a, UChar32 b) {
  1.1632 +    return (a > b) ? a : b;
  1.1633 +}
  1.1634 +
  1.1635 +// polarity = 0, 3 is normal: x xor y
  1.1636 +// polarity = 1, 2: x xor ~y == x === y
  1.1637 +
  1.1638 +void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
  1.1639 +    if (isFrozen() || isBogus()) {
  1.1640 +        return;
  1.1641 +    }
  1.1642 +    UErrorCode status = U_ZERO_ERROR;
  1.1643 +    ensureBufferCapacity(len + otherLen, status);
  1.1644 +    if (U_FAILURE(status)) {
  1.1645 +        return;
  1.1646 +    }
  1.1647 +
  1.1648 +    int32_t i = 0, j = 0, k = 0;
  1.1649 +    UChar32 a = list[i++];
  1.1650 +    UChar32 b;
  1.1651 +    if (polarity == 1 || polarity == 2) {
  1.1652 +        b = UNICODESET_LOW;
  1.1653 +        if (other[j] == UNICODESET_LOW) { // skip base if already LOW
  1.1654 +            ++j;
  1.1655 +            b = other[j];
  1.1656 +        }
  1.1657 +    } else {
  1.1658 +        b = other[j++];
  1.1659 +    }
  1.1660 +    // simplest of all the routines
  1.1661 +    // sort the values, discarding identicals!
  1.1662 +    for (;;) {
  1.1663 +        if (a < b) {
  1.1664 +            buffer[k++] = a;
  1.1665 +            a = list[i++];
  1.1666 +        } else if (b < a) {
  1.1667 +            buffer[k++] = b;
  1.1668 +            b = other[j++];
  1.1669 +        } else if (a != UNICODESET_HIGH) { // at this point, a == b
  1.1670 +            // discard both values!
  1.1671 +            a = list[i++];
  1.1672 +            b = other[j++];
  1.1673 +        } else { // DONE!
  1.1674 +            buffer[k++] = UNICODESET_HIGH;
  1.1675 +            len = k;
  1.1676 +            break;
  1.1677 +        }
  1.1678 +    }
  1.1679 +    swapBuffers();
  1.1680 +    releasePattern();
  1.1681 +}
  1.1682 +
  1.1683 +// polarity = 0 is normal: x union y
  1.1684 +// polarity = 2: x union ~y
  1.1685 +// polarity = 1: ~x union y
  1.1686 +// polarity = 3: ~x union ~y
  1.1687 +
  1.1688 +void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
  1.1689 +    if (isFrozen() || isBogus() || other==NULL) {
  1.1690 +        return;
  1.1691 +    }
  1.1692 +    UErrorCode status = U_ZERO_ERROR;
  1.1693 +    ensureBufferCapacity(len + otherLen, status);
  1.1694 +    if (U_FAILURE(status)) {
  1.1695 +        return;
  1.1696 +    }
  1.1697 +
  1.1698 +    int32_t i = 0, j = 0, k = 0;
  1.1699 +    UChar32 a = list[i++];
  1.1700 +    UChar32 b = other[j++];
  1.1701 +    // change from xor is that we have to check overlapping pairs
  1.1702 +    // polarity bit 1 means a is second, bit 2 means b is.
  1.1703 +    for (;;) {
  1.1704 +        switch (polarity) {
  1.1705 +          case 0: // both first; take lower if unequal
  1.1706 +            if (a < b) { // take a
  1.1707 +                // Back up over overlapping ranges in buffer[]
  1.1708 +                if (k > 0 && a <= buffer[k-1]) {
  1.1709 +                    // Pick latter end value in buffer[] vs. list[]
  1.1710 +                    a = max(list[i], buffer[--k]);
  1.1711 +                } else {
  1.1712 +                    // No overlap
  1.1713 +                    buffer[k++] = a;
  1.1714 +                    a = list[i];
  1.1715 +                }
  1.1716 +                i++; // Common if/else code factored out
  1.1717 +                polarity ^= 1;
  1.1718 +            } else if (b < a) { // take b
  1.1719 +                if (k > 0 && b <= buffer[k-1]) {
  1.1720 +                    b = max(other[j], buffer[--k]);
  1.1721 +                } else {
  1.1722 +                    buffer[k++] = b;
  1.1723 +                    b = other[j];
  1.1724 +                }
  1.1725 +                j++;
  1.1726 +                polarity ^= 2;
  1.1727 +            } else { // a == b, take a, drop b
  1.1728 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1729 +                // This is symmetrical; it doesn't matter if
  1.1730 +                // we backtrack with a or b. - liu
  1.1731 +                if (k > 0 && a <= buffer[k-1]) {
  1.1732 +                    a = max(list[i], buffer[--k]);
  1.1733 +                } else {
  1.1734 +                    // No overlap
  1.1735 +                    buffer[k++] = a;
  1.1736 +                    a = list[i];
  1.1737 +                }
  1.1738 +                i++;
  1.1739 +                polarity ^= 1;
  1.1740 +                b = other[j++];
  1.1741 +                polarity ^= 2;
  1.1742 +            }
  1.1743 +            break;
  1.1744 +          case 3: // both second; take higher if unequal, and drop other
  1.1745 +            if (b <= a) { // take a
  1.1746 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1747 +                buffer[k++] = a;
  1.1748 +            } else { // take b
  1.1749 +                if (b == UNICODESET_HIGH) goto loop_end;
  1.1750 +                buffer[k++] = b;
  1.1751 +            }
  1.1752 +            a = list[i++];
  1.1753 +            polarity ^= 1;   // factored common code
  1.1754 +            b = other[j++];
  1.1755 +            polarity ^= 2;
  1.1756 +            break;
  1.1757 +          case 1: // a second, b first; if b < a, overlap
  1.1758 +            if (a < b) { // no overlap, take a
  1.1759 +                buffer[k++] = a; a = list[i++]; polarity ^= 1;
  1.1760 +            } else if (b < a) { // OVERLAP, drop b
  1.1761 +                b = other[j++];
  1.1762 +                polarity ^= 2;
  1.1763 +            } else { // a == b, drop both!
  1.1764 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1765 +                a = list[i++];
  1.1766 +                polarity ^= 1;
  1.1767 +                b = other[j++];
  1.1768 +                polarity ^= 2;
  1.1769 +            }
  1.1770 +            break;
  1.1771 +          case 2: // a first, b second; if a < b, overlap
  1.1772 +            if (b < a) { // no overlap, take b
  1.1773 +                buffer[k++] = b;
  1.1774 +                b = other[j++];
  1.1775 +                polarity ^= 2;
  1.1776 +            } else  if (a < b) { // OVERLAP, drop a
  1.1777 +                a = list[i++];
  1.1778 +                polarity ^= 1;
  1.1779 +            } else { // a == b, drop both!
  1.1780 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1781 +                a = list[i++];
  1.1782 +                polarity ^= 1;
  1.1783 +                b = other[j++];
  1.1784 +                polarity ^= 2;
  1.1785 +            }
  1.1786 +            break;
  1.1787 +        }
  1.1788 +    }
  1.1789 + loop_end:
  1.1790 +    buffer[k++] = UNICODESET_HIGH;    // terminate
  1.1791 +    len = k;
  1.1792 +    swapBuffers();
  1.1793 +    releasePattern();
  1.1794 +}
  1.1795 +
  1.1796 +// polarity = 0 is normal: x intersect y
  1.1797 +// polarity = 2: x intersect ~y == set-minus
  1.1798 +// polarity = 1: ~x intersect y
  1.1799 +// polarity = 3: ~x intersect ~y
  1.1800 +
  1.1801 +void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
  1.1802 +    if (isFrozen() || isBogus()) {
  1.1803 +        return;
  1.1804 +    }
  1.1805 +    UErrorCode status = U_ZERO_ERROR;
  1.1806 +    ensureBufferCapacity(len + otherLen, status);
  1.1807 +    if (U_FAILURE(status)) {
  1.1808 +        return;
  1.1809 +    }
  1.1810 +
  1.1811 +    int32_t i = 0, j = 0, k = 0;
  1.1812 +    UChar32 a = list[i++];
  1.1813 +    UChar32 b = other[j++];
  1.1814 +    // change from xor is that we have to check overlapping pairs
  1.1815 +    // polarity bit 1 means a is second, bit 2 means b is.
  1.1816 +    for (;;) {
  1.1817 +        switch (polarity) {
  1.1818 +          case 0: // both first; drop the smaller
  1.1819 +            if (a < b) { // drop a
  1.1820 +                a = list[i++];
  1.1821 +                polarity ^= 1;
  1.1822 +            } else if (b < a) { // drop b
  1.1823 +                b = other[j++];
  1.1824 +                polarity ^= 2;
  1.1825 +            } else { // a == b, take one, drop other
  1.1826 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1827 +                buffer[k++] = a;
  1.1828 +                a = list[i++];
  1.1829 +                polarity ^= 1;
  1.1830 +                b = other[j++];
  1.1831 +                polarity ^= 2;
  1.1832 +            }
  1.1833 +            break;
  1.1834 +          case 3: // both second; take lower if unequal
  1.1835 +            if (a < b) { // take a
  1.1836 +                buffer[k++] = a;
  1.1837 +                a = list[i++];
  1.1838 +                polarity ^= 1;
  1.1839 +            } else if (b < a) { // take b
  1.1840 +                buffer[k++] = b;
  1.1841 +                b = other[j++];
  1.1842 +                polarity ^= 2;
  1.1843 +            } else { // a == b, take one, drop other
  1.1844 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1845 +                buffer[k++] = a;
  1.1846 +                a = list[i++];
  1.1847 +                polarity ^= 1;
  1.1848 +                b = other[j++];
  1.1849 +                polarity ^= 2;
  1.1850 +            }
  1.1851 +            break;
  1.1852 +          case 1: // a second, b first;
  1.1853 +            if (a < b) { // NO OVERLAP, drop a
  1.1854 +                a = list[i++];
  1.1855 +                polarity ^= 1;
  1.1856 +            } else if (b < a) { // OVERLAP, take b
  1.1857 +                buffer[k++] = b;
  1.1858 +                b = other[j++];
  1.1859 +                polarity ^= 2;
  1.1860 +            } else { // a == b, drop both!
  1.1861 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1862 +                a = list[i++];
  1.1863 +                polarity ^= 1;
  1.1864 +                b = other[j++];
  1.1865 +                polarity ^= 2;
  1.1866 +            }
  1.1867 +            break;
  1.1868 +          case 2: // a first, b second; if a < b, overlap
  1.1869 +            if (b < a) { // no overlap, drop b
  1.1870 +                b = other[j++];
  1.1871 +                polarity ^= 2;
  1.1872 +            } else  if (a < b) { // OVERLAP, take a
  1.1873 +                buffer[k++] = a;
  1.1874 +                a = list[i++];
  1.1875 +                polarity ^= 1;
  1.1876 +            } else { // a == b, drop both!
  1.1877 +                if (a == UNICODESET_HIGH) goto loop_end;
  1.1878 +                a = list[i++];
  1.1879 +                polarity ^= 1;
  1.1880 +                b = other[j++];
  1.1881 +                polarity ^= 2;
  1.1882 +            }
  1.1883 +            break;
  1.1884 +        }
  1.1885 +    }
  1.1886 + loop_end:
  1.1887 +    buffer[k++] = UNICODESET_HIGH;    // terminate
  1.1888 +    len = k;
  1.1889 +    swapBuffers();
  1.1890 +    releasePattern();
  1.1891 +}
  1.1892 +
  1.1893 +/**
  1.1894 + * Append the <code>toPattern()</code> representation of a
  1.1895 + * string to the given <code>StringBuffer</code>.
  1.1896 + */
  1.1897 +void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
  1.1898 +escapeUnprintable) {
  1.1899 +    UChar32 cp;
  1.1900 +    for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
  1.1901 +        _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
  1.1902 +    }
  1.1903 +}
  1.1904 +
  1.1905 +/**
  1.1906 + * Append the <code>toPattern()</code> representation of a
  1.1907 + * character to the given <code>StringBuffer</code>.
  1.1908 + */
  1.1909 +void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
  1.1910 +escapeUnprintable) {
  1.1911 +    if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
  1.1912 +        // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
  1.1913 +        // unprintable
  1.1914 +        if (ICU_Utility::escapeUnprintable(buf, c)) {
  1.1915 +            return;
  1.1916 +        }
  1.1917 +    }
  1.1918 +    // Okay to let ':' pass through
  1.1919 +    switch (c) {
  1.1920 +    case SET_OPEN:
  1.1921 +    case SET_CLOSE:
  1.1922 +    case HYPHEN:
  1.1923 +    case COMPLEMENT:
  1.1924 +    case INTERSECTION:
  1.1925 +    case BACKSLASH:
  1.1926 +    case OPEN_BRACE:
  1.1927 +    case CLOSE_BRACE:
  1.1928 +    case COLON:
  1.1929 +    case SymbolTable::SYMBOL_REF:
  1.1930 +        buf.append(BACKSLASH);
  1.1931 +        break;
  1.1932 +    default:
  1.1933 +        // Escape whitespace
  1.1934 +        if (PatternProps::isWhiteSpace(c)) {
  1.1935 +            buf.append(BACKSLASH);
  1.1936 +        }
  1.1937 +        break;
  1.1938 +    }
  1.1939 +    buf.append(c);
  1.1940 +}
  1.1941 +
  1.1942 +/**
  1.1943 + * Append a string representation of this set to result.  This will be
  1.1944 + * a cleaned version of the string passed to applyPattern(), if there
  1.1945 + * is one.  Otherwise it will be generated.
  1.1946 + */
  1.1947 +UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
  1.1948 +                                      UBool escapeUnprintable) const
  1.1949 +{
  1.1950 +    if (pat != NULL) {
  1.1951 +        int32_t i;
  1.1952 +        int32_t backslashCount = 0;
  1.1953 +        for (i=0; i<patLen; ) {
  1.1954 +            UChar32 c;
  1.1955 +            U16_NEXT(pat, i, patLen, c);
  1.1956 +            if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
  1.1957 +                // If the unprintable character is preceded by an odd
  1.1958 +                // number of backslashes, then it has been escaped.
  1.1959 +                // Before unescaping it, we delete the final
  1.1960 +                // backslash.
  1.1961 +                if ((backslashCount % 2) == 1) {
  1.1962 +                    result.truncate(result.length() - 1);
  1.1963 +                }
  1.1964 +                ICU_Utility::escapeUnprintable(result, c);
  1.1965 +                backslashCount = 0;
  1.1966 +            } else {
  1.1967 +                result.append(c);
  1.1968 +                if (c == BACKSLASH) {
  1.1969 +                    ++backslashCount;
  1.1970 +                } else {
  1.1971 +                    backslashCount = 0;
  1.1972 +                }
  1.1973 +            }
  1.1974 +        }
  1.1975 +        return result;
  1.1976 +    }
  1.1977 +
  1.1978 +    return _generatePattern(result, escapeUnprintable);
  1.1979 +}
  1.1980 +
  1.1981 +/**
  1.1982 + * Returns a string representation of this set.  If the result of
  1.1983 + * calling this function is passed to a UnicodeSet constructor, it
  1.1984 + * will produce another set that is equal to this one.
  1.1985 + */
  1.1986 +UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
  1.1987 +                                     UBool escapeUnprintable) const
  1.1988 +{
  1.1989 +    result.truncate(0);
  1.1990 +    return _toPattern(result, escapeUnprintable);
  1.1991 +}
  1.1992 +
  1.1993 +/**
  1.1994 + * Generate and append a string representation of this set to result.
  1.1995 + * This does not use this.pat, the cleaned up copy of the string
  1.1996 + * passed to applyPattern().
  1.1997 + */
  1.1998 +UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
  1.1999 +                                            UBool escapeUnprintable) const
  1.2000 +{
  1.2001 +    result.append(SET_OPEN);
  1.2002 +
  1.2003 +//  // Check against the predefined categories.  We implicitly build
  1.2004 +//  // up ALL category sets the first time toPattern() is called.
  1.2005 +//  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
  1.2006 +//      if (*this == getCategorySet(cat)) {
  1.2007 +//          result.append(COLON);
  1.2008 +//          result.append(CATEGORY_NAMES, cat*2, 2);
  1.2009 +//          return result.append(CATEGORY_CLOSE);
  1.2010 +//      }
  1.2011 +//  }
  1.2012 +
  1.2013 +    int32_t count = getRangeCount();
  1.2014 +
  1.2015 +    // If the set contains at least 2 intervals and includes both
  1.2016 +    // MIN_VALUE and MAX_VALUE, then the inverse representation will
  1.2017 +    // be more economical.
  1.2018 +    if (count > 1 &&
  1.2019 +        getRangeStart(0) == MIN_VALUE &&
  1.2020 +        getRangeEnd(count-1) == MAX_VALUE) {
  1.2021 +
  1.2022 +        // Emit the inverse
  1.2023 +        result.append(COMPLEMENT);
  1.2024 +
  1.2025 +        for (int32_t i = 1; i < count; ++i) {
  1.2026 +            UChar32 start = getRangeEnd(i-1)+1;
  1.2027 +            UChar32 end = getRangeStart(i)-1;
  1.2028 +            _appendToPat(result, start, escapeUnprintable);
  1.2029 +            if (start != end) {
  1.2030 +                if ((start+1) != end) {
  1.2031 +                    result.append(HYPHEN);
  1.2032 +                }
  1.2033 +                _appendToPat(result, end, escapeUnprintable);
  1.2034 +            }
  1.2035 +        }
  1.2036 +    }
  1.2037 +
  1.2038 +    // Default; emit the ranges as pairs
  1.2039 +    else {
  1.2040 +        for (int32_t i = 0; i < count; ++i) {
  1.2041 +            UChar32 start = getRangeStart(i);
  1.2042 +            UChar32 end = getRangeEnd(i);
  1.2043 +            _appendToPat(result, start, escapeUnprintable);
  1.2044 +            if (start != end) {
  1.2045 +                if ((start+1) != end) {
  1.2046 +                    result.append(HYPHEN);
  1.2047 +                }
  1.2048 +                _appendToPat(result, end, escapeUnprintable);
  1.2049 +            }
  1.2050 +        }
  1.2051 +    }
  1.2052 +
  1.2053 +    for (int32_t i = 0; i<strings->size(); ++i) {
  1.2054 +        result.append(OPEN_BRACE);
  1.2055 +        _appendToPat(result,
  1.2056 +                     *(const UnicodeString*) strings->elementAt(i),
  1.2057 +                     escapeUnprintable);
  1.2058 +        result.append(CLOSE_BRACE);
  1.2059 +    }
  1.2060 +    return result.append(SET_CLOSE);
  1.2061 +}
  1.2062 +
  1.2063 +/**
  1.2064 +* Release existing cached pattern
  1.2065 +*/
  1.2066 +void UnicodeSet::releasePattern() {
  1.2067 +    if (pat) {
  1.2068 +        uprv_free(pat);
  1.2069 +        pat = NULL;
  1.2070 +        patLen = 0;
  1.2071 +    }
  1.2072 +}
  1.2073 +
  1.2074 +/**
  1.2075 +* Set the new pattern to cache.
  1.2076 +*/
  1.2077 +void UnicodeSet::setPattern(const UnicodeString& newPat) {
  1.2078 +    releasePattern();
  1.2079 +    int32_t newPatLen = newPat.length();
  1.2080 +    pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
  1.2081 +    if (pat) {
  1.2082 +        patLen = newPatLen;
  1.2083 +        newPat.extractBetween(0, patLen, pat);
  1.2084 +        pat[patLen] = 0;
  1.2085 +    }
  1.2086 +    // else we don't care if malloc failed. This was just a nice cache.
  1.2087 +    // We can regenerate an equivalent pattern later when requested.
  1.2088 +}
  1.2089 +
  1.2090 +UnicodeFunctor *UnicodeSet::freeze() {
  1.2091 +    if(!isFrozen() && !isBogus()) {
  1.2092 +        // Do most of what compact() does before freezing because
  1.2093 +        // compact() will not work when the set is frozen.
  1.2094 +        // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
  1.2095 +
  1.2096 +        // Delete buffer first to defragment memory less.
  1.2097 +        if (buffer != NULL) {
  1.2098 +            uprv_free(buffer);
  1.2099 +            buffer = NULL;
  1.2100 +        }
  1.2101 +        if (capacity > (len + GROW_EXTRA)) {
  1.2102 +            // Make the capacity equal to len or 1.
  1.2103 +            // We don't want to realloc of 0 size.
  1.2104 +            capacity = len + (len == 0);
  1.2105 +            list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
  1.2106 +            if (list == NULL) { // Check for memory allocation error.
  1.2107 +                setToBogus();
  1.2108 +                return this;
  1.2109 +            }
  1.2110 +        }
  1.2111 +
  1.2112 +        // Optimize contains() and span() and similar functions.
  1.2113 +        if (!strings->isEmpty()) {
  1.2114 +            stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
  1.2115 +            if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
  1.2116 +                // All strings are irrelevant for span() etc. because
  1.2117 +                // all of each string's code points are contained in this set.
  1.2118 +                // Do not check needsStringSpanUTF8() because UTF-8 has at most as
  1.2119 +                // many relevant strings as UTF-16.
  1.2120 +                // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
  1.2121 +                delete stringSpan;
  1.2122 +                stringSpan = NULL;
  1.2123 +            }
  1.2124 +        }
  1.2125 +        if (stringSpan == NULL) {
  1.2126 +            // No span-relevant strings: Optimize for code point spans.
  1.2127 +            bmpSet=new BMPSet(list, len);
  1.2128 +            if (bmpSet == NULL) { // Check for memory allocation error.
  1.2129 +                setToBogus();
  1.2130 +            }
  1.2131 +        }
  1.2132 +    }
  1.2133 +    return this;
  1.2134 +}
  1.2135 +
  1.2136 +int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
  1.2137 +    if(length>0 && bmpSet!=NULL) {
  1.2138 +        return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
  1.2139 +    }
  1.2140 +    if(length<0) {
  1.2141 +        length=u_strlen(s);
  1.2142 +    }
  1.2143 +    if(length==0) {
  1.2144 +        return 0;
  1.2145 +    }
  1.2146 +    if(stringSpan!=NULL) {
  1.2147 +        return stringSpan->span(s, length, spanCondition);
  1.2148 +    } else if(!strings->isEmpty()) {
  1.2149 +        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
  1.2150 +                            UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
  1.2151 +                            UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
  1.2152 +        UnicodeSetStringSpan strSpan(*this, *strings, which);
  1.2153 +        if(strSpan.needsStringSpanUTF16()) {
  1.2154 +            return strSpan.span(s, length, spanCondition);
  1.2155 +        }
  1.2156 +    }
  1.2157 +
  1.2158 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  1.2159 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
  1.2160 +    }
  1.2161 +
  1.2162 +    UChar32 c;
  1.2163 +    int32_t start=0, prev=0;
  1.2164 +    do {
  1.2165 +        U16_NEXT(s, start, length, c);
  1.2166 +        if(spanCondition!=contains(c)) {
  1.2167 +            break;
  1.2168 +        }
  1.2169 +    } while((prev=start)<length);
  1.2170 +    return prev;
  1.2171 +}
  1.2172 +
  1.2173 +int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
  1.2174 +    if(length>0 && bmpSet!=NULL) {
  1.2175 +        return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
  1.2176 +    }
  1.2177 +    if(length<0) {
  1.2178 +        length=u_strlen(s);
  1.2179 +    }
  1.2180 +    if(length==0) {
  1.2181 +        return 0;
  1.2182 +    }
  1.2183 +    if(stringSpan!=NULL) {
  1.2184 +        return stringSpan->spanBack(s, length, spanCondition);
  1.2185 +    } else if(!strings->isEmpty()) {
  1.2186 +        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
  1.2187 +                            UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
  1.2188 +                            UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
  1.2189 +        UnicodeSetStringSpan strSpan(*this, *strings, which);
  1.2190 +        if(strSpan.needsStringSpanUTF16()) {
  1.2191 +            return strSpan.spanBack(s, length, spanCondition);
  1.2192 +        }
  1.2193 +    }
  1.2194 +
  1.2195 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  1.2196 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
  1.2197 +    }
  1.2198 +
  1.2199 +    UChar32 c;
  1.2200 +    int32_t prev=length;
  1.2201 +    do {
  1.2202 +        U16_PREV(s, 0, length, c);
  1.2203 +        if(spanCondition!=contains(c)) {
  1.2204 +            break;
  1.2205 +        }
  1.2206 +    } while((prev=length)>0);
  1.2207 +    return prev;
  1.2208 +}
  1.2209 +
  1.2210 +int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
  1.2211 +    if(length>0 && bmpSet!=NULL) {
  1.2212 +        const uint8_t *s0=(const uint8_t *)s;
  1.2213 +        return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
  1.2214 +    }
  1.2215 +    if(length<0) {
  1.2216 +        length=(int32_t)uprv_strlen(s);
  1.2217 +    }
  1.2218 +    if(length==0) {
  1.2219 +        return 0;
  1.2220 +    }
  1.2221 +    if(stringSpan!=NULL) {
  1.2222 +        return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
  1.2223 +    } else if(!strings->isEmpty()) {
  1.2224 +        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
  1.2225 +                            UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
  1.2226 +                            UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
  1.2227 +        UnicodeSetStringSpan strSpan(*this, *strings, which);
  1.2228 +        if(strSpan.needsStringSpanUTF8()) {
  1.2229 +            return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
  1.2230 +        }
  1.2231 +    }
  1.2232 +
  1.2233 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  1.2234 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
  1.2235 +    }
  1.2236 +
  1.2237 +    UChar32 c;
  1.2238 +    int32_t start=0, prev=0;
  1.2239 +    do {
  1.2240 +        U8_NEXT_OR_FFFD(s, start, length, c);
  1.2241 +        if(spanCondition!=contains(c)) {
  1.2242 +            break;
  1.2243 +        }
  1.2244 +    } while((prev=start)<length);
  1.2245 +    return prev;
  1.2246 +}
  1.2247 +
  1.2248 +int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
  1.2249 +    if(length>0 && bmpSet!=NULL) {
  1.2250 +        const uint8_t *s0=(const uint8_t *)s;
  1.2251 +        return bmpSet->spanBackUTF8(s0, length, spanCondition);
  1.2252 +    }
  1.2253 +    if(length<0) {
  1.2254 +        length=(int32_t)uprv_strlen(s);
  1.2255 +    }
  1.2256 +    if(length==0) {
  1.2257 +        return 0;
  1.2258 +    }
  1.2259 +    if(stringSpan!=NULL) {
  1.2260 +        return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
  1.2261 +    } else if(!strings->isEmpty()) {
  1.2262 +        uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
  1.2263 +                            UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
  1.2264 +                            UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
  1.2265 +        UnicodeSetStringSpan strSpan(*this, *strings, which);
  1.2266 +        if(strSpan.needsStringSpanUTF8()) {
  1.2267 +            return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
  1.2268 +        }
  1.2269 +    }
  1.2270 +
  1.2271 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
  1.2272 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
  1.2273 +    }
  1.2274 +
  1.2275 +    UChar32 c;
  1.2276 +    int32_t prev=length;
  1.2277 +    do {
  1.2278 +        U8_PREV_OR_FFFD(s, 0, length, c);
  1.2279 +        if(spanCondition!=contains(c)) {
  1.2280 +            break;
  1.2281 +        }
  1.2282 +    } while((prev=length)>0);
  1.2283 +    return prev;
  1.2284 +}
  1.2285 +
  1.2286 +U_NAMESPACE_END
The Tor Browser / file diff

diff: intl/icu/source/common/uniset.cpp

intl/icu/source/common/uniset.cpp