1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/uniset.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2283 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 1999-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* Date Name Description 1.10 +* 10/20/99 alan Creation. 1.11 +********************************************************************** 1.12 +*/ 1.13 + 1.14 +#include "unicode/utypes.h" 1.15 +#include "unicode/parsepos.h" 1.16 +#include "unicode/symtable.h" 1.17 +#include "unicode/uniset.h" 1.18 +#include "unicode/utf8.h" 1.19 +#include "unicode/utf16.h" 1.20 +#include "ruleiter.h" 1.21 +#include "cmemory.h" 1.22 +#include "cstring.h" 1.23 +#include "patternprops.h" 1.24 +#include "uelement.h" 1.25 +#include "util.h" 1.26 +#include "uvector.h" 1.27 +#include "charstr.h" 1.28 +#include "ustrfmt.h" 1.29 +#include "uassert.h" 1.30 +#include "bmpset.h" 1.31 +#include "unisetspan.h" 1.32 + 1.33 +// Define UChar constants using hex for EBCDIC compatibility 1.34 +// Used #define to reduce private static exports and memory access time. 1.35 +#define SET_OPEN ((UChar)0x005B) /*[*/ 1.36 +#define SET_CLOSE ((UChar)0x005D) /*]*/ 1.37 +#define HYPHEN ((UChar)0x002D) /*-*/ 1.38 +#define COMPLEMENT ((UChar)0x005E) /*^*/ 1.39 +#define COLON ((UChar)0x003A) /*:*/ 1.40 +#define BACKSLASH ((UChar)0x005C) /*\*/ 1.41 +#define INTERSECTION ((UChar)0x0026) /*&*/ 1.42 +#define UPPER_U ((UChar)0x0055) /*U*/ 1.43 +#define LOWER_U ((UChar)0x0075) /*u*/ 1.44 +#define OPEN_BRACE ((UChar)123) /*{*/ 1.45 +#define CLOSE_BRACE ((UChar)125) /*}*/ 1.46 +#define UPPER_P ((UChar)0x0050) /*P*/ 1.47 +#define LOWER_P ((UChar)0x0070) /*p*/ 1.48 +#define UPPER_N ((UChar)78) /*N*/ 1.49 +#define EQUALS ((UChar)0x003D) /*=*/ 1.50 + 1.51 +// HIGH_VALUE > all valid values. 110000 for codepoints 1.52 +#define UNICODESET_HIGH 0x0110000 1.53 + 1.54 +// LOW <= all valid values. ZERO for codepoints 1.55 +#define UNICODESET_LOW 0x000000 1.56 + 1.57 +// initial storage. Must be >= 0 1.58 +#define START_EXTRA 16 1.59 + 1.60 +// extra amount for growth. Must be >= 0 1.61 +#define GROW_EXTRA START_EXTRA 1.62 + 1.63 +U_NAMESPACE_BEGIN 1.64 + 1.65 +SymbolTable::~SymbolTable() {} 1.66 + 1.67 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) 1.68 + 1.69 +/** 1.70 + * Modify the given UChar32 variable so that it is in range, by 1.71 + * pinning values < UNICODESET_LOW to UNICODESET_LOW, and 1.72 + * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. 1.73 + * It modifies its argument in-place and also returns it. 1.74 + */ 1.75 +static inline UChar32 pinCodePoint(UChar32& c) { 1.76 + if (c < UNICODESET_LOW) { 1.77 + c = UNICODESET_LOW; 1.78 + } else if (c > (UNICODESET_HIGH-1)) { 1.79 + c = (UNICODESET_HIGH-1); 1.80 + } 1.81 + return c; 1.82 +} 1.83 + 1.84 +//---------------------------------------------------------------- 1.85 +// Debugging 1.86 +//---------------------------------------------------------------- 1.87 + 1.88 +// DO NOT DELETE THIS CODE. This code is used to debug memory leaks. 1.89 +// To enable the debugging, define the symbol DEBUG_MEM in the line 1.90 +// below. This will result in text being sent to stdout that looks 1.91 +// like this: 1.92 +// DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- 1.93 +// DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- 1.94 +// Each line lists a construction (ct) or destruction (dt) event, the 1.95 +// object address, the number of outstanding objects after the event, 1.96 +// and the pattern of the object in question. 1.97 + 1.98 +// #define DEBUG_MEM 1.99 + 1.100 +#ifdef DEBUG_MEM 1.101 +#include <stdio.h> 1.102 +static int32_t _dbgCount = 0; 1.103 + 1.104 +static inline void _dbgct(UnicodeSet* set) { 1.105 + UnicodeString str; 1.106 + set->toPattern(str, TRUE); 1.107 + char buf[40]; 1.108 + str.extract(0, 39, buf, ""); 1.109 + printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); 1.110 +} 1.111 + 1.112 +static inline void _dbgdt(UnicodeSet* set) { 1.113 + UnicodeString str; 1.114 + set->toPattern(str, TRUE); 1.115 + char buf[40]; 1.116 + str.extract(0, 39, buf, ""); 1.117 + printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); 1.118 +} 1.119 + 1.120 +#else 1.121 + 1.122 +#define _dbgct(set) 1.123 +#define _dbgdt(set) 1.124 + 1.125 +#endif 1.126 + 1.127 +//---------------------------------------------------------------- 1.128 +// UnicodeString in UVector support 1.129 +//---------------------------------------------------------------- 1.130 + 1.131 +static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { 1.132 + dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); 1.133 +} 1.134 + 1.135 +static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 1.136 + const UnicodeString &a = *(const UnicodeString*)t1.pointer; 1.137 + const UnicodeString &b = *(const UnicodeString*)t2.pointer; 1.138 + return a.compare(b); 1.139 +} 1.140 + 1.141 +//---------------------------------------------------------------- 1.142 +// Constructors &c 1.143 +//---------------------------------------------------------------- 1.144 + 1.145 +/** 1.146 + * Constructs an empty set. 1.147 + */ 1.148 +UnicodeSet::UnicodeSet() : 1.149 + len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 1.150 + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.151 + fFlags(0) 1.152 +{ 1.153 + UErrorCode status = U_ZERO_ERROR; 1.154 + allocateStrings(status); 1.155 + if (U_FAILURE(status)) { 1.156 + return; 1.157 + } 1.158 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.159 + if(list!=NULL){ 1.160 + list[0] = UNICODESET_HIGH; 1.161 + } else { // If memory allocation failed, set to bogus state. 1.162 + setToBogus(); 1.163 + return; 1.164 + } 1.165 + _dbgct(this); 1.166 +} 1.167 + 1.168 +/** 1.169 + * Constructs a set containing the given range. If <code>end > 1.170 + * start</code> then an empty set is created. 1.171 + * 1.172 + * @param start first character, inclusive, of range 1.173 + * @param end last character, inclusive, of range 1.174 + */ 1.175 +UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : 1.176 + len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 1.177 + bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.178 + fFlags(0) 1.179 +{ 1.180 + UErrorCode status = U_ZERO_ERROR; 1.181 + allocateStrings(status); 1.182 + if (U_FAILURE(status)) { 1.183 + return; 1.184 + } 1.185 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.186 + if(list!=NULL){ 1.187 + list[0] = UNICODESET_HIGH; 1.188 + complement(start, end); 1.189 + } else { // If memory allocation failed, set to bogus state. 1.190 + setToBogus(); 1.191 + return; 1.192 + } 1.193 + _dbgct(this); 1.194 +} 1.195 + 1.196 +/** 1.197 + * Constructs a set that is identical to the given UnicodeSet. 1.198 + */ 1.199 +UnicodeSet::UnicodeSet(const UnicodeSet& o) : 1.200 + UnicodeFilter(o), 1.201 + len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), 1.202 + bmpSet(0), 1.203 + buffer(0), bufferCapacity(0), 1.204 + patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.205 + fFlags(0) 1.206 +{ 1.207 + UErrorCode status = U_ZERO_ERROR; 1.208 + allocateStrings(status); 1.209 + if (U_FAILURE(status)) { 1.210 + return; 1.211 + } 1.212 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.213 + if(list!=NULL){ 1.214 + *this = o; 1.215 + } else { // If memory allocation failed, set to bogus state. 1.216 + setToBogus(); 1.217 + return; 1.218 + } 1.219 + _dbgct(this); 1.220 +} 1.221 + 1.222 +// Copy-construct as thawed. 1.223 +UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 1.224 + UnicodeFilter(o), 1.225 + len(0), capacity(o.len + GROW_EXTRA), list(0), 1.226 + bmpSet(0), 1.227 + buffer(0), bufferCapacity(0), 1.228 + patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1.229 + fFlags(0) 1.230 +{ 1.231 + UErrorCode status = U_ZERO_ERROR; 1.232 + allocateStrings(status); 1.233 + if (U_FAILURE(status)) { 1.234 + return; 1.235 + } 1.236 + list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1.237 + if(list!=NULL){ 1.238 + // *this = o except for bmpSet and stringSpan 1.239 + len = o.len; 1.240 + uprv_memcpy(list, o.list, len*sizeof(UChar32)); 1.241 + if (strings != NULL && o.strings != NULL) { 1.242 + strings->assign(*o.strings, cloneUnicodeString, status); 1.243 + } else { // Invalid strings. 1.244 + setToBogus(); 1.245 + return; 1.246 + } 1.247 + if (o.pat) { 1.248 + setPattern(UnicodeString(o.pat, o.patLen)); 1.249 + } 1.250 + } else { // If memory allocation failed, set to bogus state. 1.251 + setToBogus(); 1.252 + return; 1.253 + } 1.254 + _dbgct(this); 1.255 +} 1.256 + 1.257 +/** 1.258 + * Destructs the set. 1.259 + */ 1.260 +UnicodeSet::~UnicodeSet() { 1.261 + _dbgdt(this); // first! 1.262 + uprv_free(list); 1.263 + delete bmpSet; 1.264 + if (buffer) { 1.265 + uprv_free(buffer); 1.266 + } 1.267 + delete strings; 1.268 + delete stringSpan; 1.269 + releasePattern(); 1.270 +} 1.271 + 1.272 +/** 1.273 + * Assigns this object to be a copy of another. 1.274 + */ 1.275 +UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { 1.276 + if (this == &o) { 1.277 + return *this; 1.278 + } 1.279 + if (isFrozen()) { 1.280 + return *this; 1.281 + } 1.282 + if (o.isBogus()) { 1.283 + setToBogus(); 1.284 + return *this; 1.285 + } 1.286 + UErrorCode ec = U_ZERO_ERROR; 1.287 + ensureCapacity(o.len, ec); 1.288 + if (U_FAILURE(ec)) { 1.289 + return *this; // There is no way to report this error :-( 1.290 + } 1.291 + len = o.len; 1.292 + uprv_memcpy(list, o.list, len*sizeof(UChar32)); 1.293 + if (o.bmpSet == NULL) { 1.294 + bmpSet = NULL; 1.295 + } else { 1.296 + bmpSet = new BMPSet(*o.bmpSet, list, len); 1.297 + if (bmpSet == NULL) { // Check for memory allocation error. 1.298 + setToBogus(); 1.299 + return *this; 1.300 + } 1.301 + } 1.302 + if (strings != NULL && o.strings != NULL) { 1.303 + strings->assign(*o.strings, cloneUnicodeString, ec); 1.304 + } else { // Invalid strings. 1.305 + setToBogus(); 1.306 + return *this; 1.307 + } 1.308 + if (o.stringSpan == NULL) { 1.309 + stringSpan = NULL; 1.310 + } else { 1.311 + stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); 1.312 + if (stringSpan == NULL) { // Check for memory allocation error. 1.313 + setToBogus(); 1.314 + return *this; 1.315 + } 1.316 + } 1.317 + releasePattern(); 1.318 + if (o.pat) { 1.319 + setPattern(UnicodeString(o.pat, o.patLen)); 1.320 + } 1.321 + return *this; 1.322 +} 1.323 + 1.324 +/** 1.325 + * Returns a copy of this object. All UnicodeMatcher objects have 1.326 + * to support cloning in order to allow classes using 1.327 + * UnicodeMatchers, such as Transliterator, to implement cloning. 1.328 + */ 1.329 +UnicodeFunctor* UnicodeSet::clone() const { 1.330 + return new UnicodeSet(*this); 1.331 +} 1.332 + 1.333 +UnicodeFunctor *UnicodeSet::cloneAsThawed() const { 1.334 + return new UnicodeSet(*this, TRUE); 1.335 +} 1.336 + 1.337 +/** 1.338 + * Compares the specified object with this set for equality. Returns 1.339 + * <tt>true</tt> if the two sets 1.340 + * have the same size, and every member of the specified set is 1.341 + * contained in this set (or equivalently, every member of this set is 1.342 + * contained in the specified set). 1.343 + * 1.344 + * @param o set to be compared for equality with this set. 1.345 + * @return <tt>true</tt> if the specified set is equal to this set. 1.346 + */ 1.347 +UBool UnicodeSet::operator==(const UnicodeSet& o) const { 1.348 + if (len != o.len) return FALSE; 1.349 + for (int32_t i = 0; i < len; ++i) { 1.350 + if (list[i] != o.list[i]) return FALSE; 1.351 + } 1.352 + if (*strings != *o.strings) return FALSE; 1.353 + return TRUE; 1.354 +} 1.355 + 1.356 +/** 1.357 + * Returns the hash code value for this set. 1.358 + * 1.359 + * @return the hash code value for this set. 1.360 + * @see Object#hashCode() 1.361 + */ 1.362 +int32_t UnicodeSet::hashCode(void) const { 1.363 + int32_t result = len; 1.364 + for (int32_t i = 0; i < len; ++i) { 1.365 + result *= 1000003; 1.366 + result += list[i]; 1.367 + } 1.368 + return result; 1.369 +} 1.370 + 1.371 +//---------------------------------------------------------------- 1.372 +// Public API 1.373 +//---------------------------------------------------------------- 1.374 + 1.375 +/** 1.376 + * Returns the number of elements in this set (its cardinality), 1.377 + * Note than the elements of a set may include both individual 1.378 + * codepoints and strings. 1.379 + * 1.380 + * @return the number of elements in this set (its cardinality). 1.381 + */ 1.382 +int32_t UnicodeSet::size(void) const { 1.383 + int32_t n = 0; 1.384 + int32_t count = getRangeCount(); 1.385 + for (int32_t i = 0; i < count; ++i) { 1.386 + n += getRangeEnd(i) - getRangeStart(i) + 1; 1.387 + } 1.388 + return n + strings->size(); 1.389 +} 1.390 + 1.391 +/** 1.392 + * Returns <tt>true</tt> if this set contains no elements. 1.393 + * 1.394 + * @return <tt>true</tt> if this set contains no elements. 1.395 + */ 1.396 +UBool UnicodeSet::isEmpty(void) const { 1.397 + return len == 1 && strings->size() == 0; 1.398 +} 1.399 + 1.400 +/** 1.401 + * Returns true if this set contains the given character. 1.402 + * @param c character to be checked for containment 1.403 + * @return true if the test condition is met 1.404 + */ 1.405 +UBool UnicodeSet::contains(UChar32 c) const { 1.406 + // Set i to the index of the start item greater than ch 1.407 + // We know we will terminate without length test! 1.408 + // LATER: for large sets, add binary search 1.409 + //int32_t i = -1; 1.410 + //for (;;) { 1.411 + // if (c < list[++i]) break; 1.412 + //} 1.413 + if (bmpSet != NULL) { 1.414 + return bmpSet->contains(c); 1.415 + } 1.416 + if (stringSpan != NULL) { 1.417 + return stringSpan->contains(c); 1.418 + } 1.419 + if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound 1.420 + return FALSE; 1.421 + } 1.422 + int32_t i = findCodePoint(c); 1.423 + return (UBool)(i & 1); // return true if odd 1.424 +} 1.425 + 1.426 +/** 1.427 + * Returns the smallest value i such that c < list[i]. Caller 1.428 + * must ensure that c is a legal value or this method will enter 1.429 + * an infinite loop. This method performs a binary search. 1.430 + * @param c a character in the range MIN_VALUE..MAX_VALUE 1.431 + * inclusive 1.432 + * @return the smallest integer i in the range 0..len-1, 1.433 + * inclusive, such that c < list[i] 1.434 + */ 1.435 +int32_t UnicodeSet::findCodePoint(UChar32 c) const { 1.436 + /* Examples: 1.437 + findCodePoint(c) 1.438 + set list[] c=0 1 3 4 7 8 1.439 + === ============== =========== 1.440 + [] [110000] 0 0 0 0 0 0 1.441 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 1.442 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 1.443 + [:Any:] [0, 110000] 1 1 1 1 1 1 1.444 + */ 1.445 + 1.446 + // Return the smallest i such that c < list[i]. Assume 1.447 + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 1.448 + if (c < list[0]) 1.449 + return 0; 1.450 + // High runner test. c is often after the last range, so an 1.451 + // initial check for this condition pays off. 1.452 + int32_t lo = 0; 1.453 + int32_t hi = len - 1; 1.454 + if (lo >= hi || c >= list[hi-1]) 1.455 + return hi; 1.456 + // invariant: c >= list[lo] 1.457 + // invariant: c < list[hi] 1.458 + for (;;) { 1.459 + int32_t i = (lo + hi) >> 1; 1.460 + if (i == lo) { 1.461 + break; // Found! 1.462 + } else if (c < list[i]) { 1.463 + hi = i; 1.464 + } else { 1.465 + lo = i; 1.466 + } 1.467 + } 1.468 + return hi; 1.469 +} 1.470 + 1.471 +/** 1.472 + * Returns true if this set contains every character 1.473 + * of the given range. 1.474 + * @param start first character, inclusive, of the range 1.475 + * @param end last character, inclusive, of the range 1.476 + * @return true if the test condition is met 1.477 + */ 1.478 +UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { 1.479 + //int32_t i = -1; 1.480 + //for (;;) { 1.481 + // if (start < list[++i]) break; 1.482 + //} 1.483 + int32_t i = findCodePoint(start); 1.484 + return ((i & 1) != 0 && end < list[i]); 1.485 +} 1.486 + 1.487 +/** 1.488 + * Returns <tt>true</tt> if this set contains the given 1.489 + * multicharacter string. 1.490 + * @param s string to be checked for containment 1.491 + * @return <tt>true</tt> if this set contains the specified string 1.492 + */ 1.493 +UBool UnicodeSet::contains(const UnicodeString& s) const { 1.494 + if (s.length() == 0) return FALSE; 1.495 + int32_t cp = getSingleCP(s); 1.496 + if (cp < 0) { 1.497 + return strings->contains((void*) &s); 1.498 + } else { 1.499 + return contains((UChar32) cp); 1.500 + } 1.501 +} 1.502 + 1.503 +/** 1.504 + * Returns true if this set contains all the characters and strings 1.505 + * of the given set. 1.506 + * @param c set to be checked for containment 1.507 + * @return true if the test condition is met 1.508 + */ 1.509 +UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 1.510 + // The specified set is a subset if all of its pairs are contained in 1.511 + // this set. It's possible to code this more efficiently in terms of 1.512 + // direct manipulation of the inversion lists if the need arises. 1.513 + int32_t n = c.getRangeCount(); 1.514 + for (int i=0; i<n; ++i) { 1.515 + if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { 1.516 + return FALSE; 1.517 + } 1.518 + } 1.519 + if (!strings->containsAll(*c.strings)) return FALSE; 1.520 + return TRUE; 1.521 +} 1.522 + 1.523 +/** 1.524 + * Returns true if this set contains all the characters 1.525 + * of the given string. 1.526 + * @param s string containing characters to be checked for containment 1.527 + * @return true if the test condition is met 1.528 + */ 1.529 +UBool UnicodeSet::containsAll(const UnicodeString& s) const { 1.530 + return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == 1.531 + s.length()); 1.532 +} 1.533 + 1.534 +/** 1.535 + * Returns true if this set contains none of the characters 1.536 + * of the given range. 1.537 + * @param start first character, inclusive, of the range 1.538 + * @param end last character, inclusive, of the range 1.539 + * @return true if the test condition is met 1.540 + */ 1.541 +UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { 1.542 + //int32_t i = -1; 1.543 + //for (;;) { 1.544 + // if (start < list[++i]) break; 1.545 + //} 1.546 + int32_t i = findCodePoint(start); 1.547 + return ((i & 1) == 0 && end < list[i]); 1.548 +} 1.549 + 1.550 +/** 1.551 + * Returns true if this set contains none of the characters and strings 1.552 + * of the given set. 1.553 + * @param c set to be checked for containment 1.554 + * @return true if the test condition is met 1.555 + */ 1.556 +UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 1.557 + // The specified set is a subset if all of its pairs are contained in 1.558 + // this set. It's possible to code this more efficiently in terms of 1.559 + // direct manipulation of the inversion lists if the need arises. 1.560 + int32_t n = c.getRangeCount(); 1.561 + for (int32_t i=0; i<n; ++i) { 1.562 + if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { 1.563 + return FALSE; 1.564 + } 1.565 + } 1.566 + if (!strings->containsNone(*c.strings)) return FALSE; 1.567 + return TRUE; 1.568 +} 1.569 + 1.570 +/** 1.571 + * Returns true if this set contains none of the characters 1.572 + * of the given string. 1.573 + * @param s string containing characters to be checked for containment 1.574 + * @return true if the test condition is met 1.575 + */ 1.576 +UBool UnicodeSet::containsNone(const UnicodeString& s) const { 1.577 + return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == 1.578 + s.length()); 1.579 +} 1.580 + 1.581 +/** 1.582 + * Returns <tt>true</tt> if this set contains any character whose low byte 1.583 + * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for 1.584 + * indexing. 1.585 + */ 1.586 +UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 1.587 + /* The index value v, in the range [0,255], is contained in this set if 1.588 + * it is contained in any pair of this set. Pairs either have the high 1.589 + * bytes equal, or unequal. If the high bytes are equal, then we have 1.590 + * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= 1.591 + * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. 1.592 + * Then v is contained if xx <= v || v <= yy. (This is identical to the 1.593 + * time zone month containment logic.) 1.594 + */ 1.595 + int32_t i; 1.596 + int32_t rangeCount=getRangeCount(); 1.597 + for (i=0; i<rangeCount; ++i) { 1.598 + UChar32 low = getRangeStart(i); 1.599 + UChar32 high = getRangeEnd(i); 1.600 + if ((low & ~0xFF) == (high & ~0xFF)) { 1.601 + if ((low & 0xFF) <= v && v <= (high & 0xFF)) { 1.602 + return TRUE; 1.603 + } 1.604 + } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { 1.605 + return TRUE; 1.606 + } 1.607 + } 1.608 + if (strings->size() != 0) { 1.609 + for (i=0; i<strings->size(); ++i) { 1.610 + const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); 1.611 + //if (s.length() == 0) { 1.612 + // // Empty strings match everything 1.613 + // return TRUE; 1.614 + //} 1.615 + // assert(s.length() != 0); // We enforce this elsewhere 1.616 + UChar32 c = s.char32At(0); 1.617 + if ((c & 0xFF) == v) { 1.618 + return TRUE; 1.619 + } 1.620 + } 1.621 + } 1.622 + return FALSE; 1.623 +} 1.624 + 1.625 +/** 1.626 + * Implementation of UnicodeMatcher::matches(). Always matches the 1.627 + * longest possible multichar string. 1.628 + */ 1.629 +UMatchDegree UnicodeSet::matches(const Replaceable& text, 1.630 + int32_t& offset, 1.631 + int32_t limit, 1.632 + UBool incremental) { 1.633 + if (offset == limit) { 1.634 + // Strings, if any, have length != 0, so we don't worry 1.635 + // about them here. If we ever allow zero-length strings 1.636 + // we much check for them here. 1.637 + if (contains(U_ETHER)) { 1.638 + return incremental ? U_PARTIAL_MATCH : U_MATCH; 1.639 + } else { 1.640 + return U_MISMATCH; 1.641 + } 1.642 + } else { 1.643 + if (strings->size() != 0) { // try strings first 1.644 + 1.645 + // might separate forward and backward loops later 1.646 + // for now they are combined 1.647 + 1.648 + // TODO Improve efficiency of this, at least in the forward 1.649 + // direction, if not in both. In the forward direction we 1.650 + // can assume the strings are sorted. 1.651 + 1.652 + int32_t i; 1.653 + UBool forward = offset < limit; 1.654 + 1.655 + // firstChar is the leftmost char to match in the 1.656 + // forward direction or the rightmost char to match in 1.657 + // the reverse direction. 1.658 + UChar firstChar = text.charAt(offset); 1.659 + 1.660 + // If there are multiple strings that can match we 1.661 + // return the longest match. 1.662 + int32_t highWaterLength = 0; 1.663 + 1.664 + for (i=0; i<strings->size(); ++i) { 1.665 + const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); 1.666 + 1.667 + //if (trial.length() == 0) { 1.668 + // return U_MATCH; // null-string always matches 1.669 + //} 1.670 + // assert(trial.length() != 0); // We ensure this elsewhere 1.671 + 1.672 + UChar c = trial.charAt(forward ? 0 : trial.length() - 1); 1.673 + 1.674 + // Strings are sorted, so we can optimize in the 1.675 + // forward direction. 1.676 + if (forward && c > firstChar) break; 1.677 + if (c != firstChar) continue; 1.678 + 1.679 + int32_t matchLen = matchRest(text, offset, limit, trial); 1.680 + 1.681 + if (incremental) { 1.682 + int32_t maxLen = forward ? limit-offset : offset-limit; 1.683 + if (matchLen == maxLen) { 1.684 + // We have successfully matched but only up to limit. 1.685 + return U_PARTIAL_MATCH; 1.686 + } 1.687 + } 1.688 + 1.689 + if (matchLen == trial.length()) { 1.690 + // We have successfully matched the whole string. 1.691 + if (matchLen > highWaterLength) { 1.692 + highWaterLength = matchLen; 1.693 + } 1.694 + // In the forward direction we know strings 1.695 + // are sorted so we can bail early. 1.696 + if (forward && matchLen < highWaterLength) { 1.697 + break; 1.698 + } 1.699 + continue; 1.700 + } 1.701 + } 1.702 + 1.703 + // We've checked all strings without a partial match. 1.704 + // If we have full matches, return the longest one. 1.705 + if (highWaterLength != 0) { 1.706 + offset += forward ? highWaterLength : -highWaterLength; 1.707 + return U_MATCH; 1.708 + } 1.709 + } 1.710 + return UnicodeFilter::matches(text, offset, limit, incremental); 1.711 + } 1.712 +} 1.713 + 1.714 +/** 1.715 + * Returns the longest match for s in text at the given position. 1.716 + * If limit > start then match forward from start+1 to limit 1.717 + * matching all characters except s.charAt(0). If limit < start, 1.718 + * go backward starting from start-1 matching all characters 1.719 + * except s.charAt(s.length()-1). This method assumes that the 1.720 + * first character, text.charAt(start), matches s, so it does not 1.721 + * check it. 1.722 + * @param text the text to match 1.723 + * @param start the first character to match. In the forward 1.724 + * direction, text.charAt(start) is matched against s.charAt(0). 1.725 + * In the reverse direction, it is matched against 1.726 + * s.charAt(s.length()-1). 1.727 + * @param limit the limit offset for matching, either last+1 in 1.728 + * the forward direction, or last-1 in the reverse direction, 1.729 + * where last is the index of the last character to match. 1.730 + * @return If part of s matches up to the limit, return |limit - 1.731 + * start|. If all of s matches before reaching the limit, return 1.732 + * s.length(). If there is a mismatch between s and text, return 1.733 + * 0 1.734 + */ 1.735 +int32_t UnicodeSet::matchRest(const Replaceable& text, 1.736 + int32_t start, int32_t limit, 1.737 + const UnicodeString& s) { 1.738 + int32_t i; 1.739 + int32_t maxLen; 1.740 + int32_t slen = s.length(); 1.741 + if (start < limit) { 1.742 + maxLen = limit - start; 1.743 + if (maxLen > slen) maxLen = slen; 1.744 + for (i = 1; i < maxLen; ++i) { 1.745 + if (text.charAt(start + i) != s.charAt(i)) return 0; 1.746 + } 1.747 + } else { 1.748 + maxLen = start - limit; 1.749 + if (maxLen > slen) maxLen = slen; 1.750 + --slen; // <=> slen = s.length() - 1; 1.751 + for (i = 1; i < maxLen; ++i) { 1.752 + if (text.charAt(start - i) != s.charAt(slen - i)) return 0; 1.753 + } 1.754 + } 1.755 + return maxLen; 1.756 +} 1.757 + 1.758 +/** 1.759 + * Implement of UnicodeMatcher 1.760 + */ 1.761 +void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { 1.762 + toUnionTo.addAll(*this); 1.763 +} 1.764 + 1.765 +/** 1.766 + * Returns the index of the given character within this set, where 1.767 + * the set is ordered by ascending code point. If the character 1.768 + * is not in this set, return -1. The inverse of this method is 1.769 + * <code>charAt()</code>. 1.770 + * @return an index from 0..size()-1, or -1 1.771 + */ 1.772 +int32_t UnicodeSet::indexOf(UChar32 c) const { 1.773 + if (c < MIN_VALUE || c > MAX_VALUE) { 1.774 + return -1; 1.775 + } 1.776 + int32_t i = 0; 1.777 + int32_t n = 0; 1.778 + for (;;) { 1.779 + UChar32 start = list[i++]; 1.780 + if (c < start) { 1.781 + return -1; 1.782 + } 1.783 + UChar32 limit = list[i++]; 1.784 + if (c < limit) { 1.785 + return n + c - start; 1.786 + } 1.787 + n += limit - start; 1.788 + } 1.789 +} 1.790 + 1.791 +/** 1.792 + * Returns the character at the given index within this set, where 1.793 + * the set is ordered by ascending code point. If the index is 1.794 + * out of range, return (UChar32)-1. The inverse of this method is 1.795 + * <code>indexOf()</code>. 1.796 + * @param index an index from 0..size()-1 1.797 + * @return the character at the given index, or (UChar32)-1. 1.798 + */ 1.799 +UChar32 UnicodeSet::charAt(int32_t index) const { 1.800 + if (index >= 0) { 1.801 + // len2 is the largest even integer <= len, that is, it is len 1.802 + // for even values and len-1 for odd values. With odd values 1.803 + // the last entry is UNICODESET_HIGH. 1.804 + int32_t len2 = len & ~1; 1.805 + for (int32_t i=0; i < len2;) { 1.806 + UChar32 start = list[i++]; 1.807 + int32_t count = list[i++] - start; 1.808 + if (index < count) { 1.809 + return (UChar32)(start + index); 1.810 + } 1.811 + index -= count; 1.812 + } 1.813 + } 1.814 + return (UChar32)-1; 1.815 +} 1.816 + 1.817 +/** 1.818 + * Make this object represent the range <code>start - end</code>. 1.819 + * If <code>end > start</code> then this object is set to an 1.820 + * an empty range. 1.821 + * 1.822 + * @param start first character in the set, inclusive 1.823 + * @rparam end last character in the set, inclusive 1.824 + */ 1.825 +UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 1.826 + clear(); 1.827 + complement(start, end); 1.828 + return *this; 1.829 +} 1.830 + 1.831 +/** 1.832 + * Adds the specified range to this set if it is not already 1.833 + * present. If this set already contains the specified range, 1.834 + * the call leaves this set unchanged. If <code>end > start</code> 1.835 + * then an empty range is added, leaving the set unchanged. 1.836 + * 1.837 + * @param start first character, inclusive, of range to be added 1.838 + * to this set. 1.839 + * @param end last character, inclusive, of range to be added 1.840 + * to this set. 1.841 + */ 1.842 +UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 1.843 + if (pinCodePoint(start) < pinCodePoint(end)) { 1.844 + UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1.845 + add(range, 2, 0); 1.846 + } else if (start == end) { 1.847 + add(start); 1.848 + } 1.849 + return *this; 1.850 +} 1.851 + 1.852 +// #define DEBUG_US_ADD 1.853 + 1.854 +#ifdef DEBUG_US_ADD 1.855 +#include <stdio.h> 1.856 +void dump(UChar32 c) { 1.857 + if (c <= 0xFF) { 1.858 + printf("%c", (char)c); 1.859 + } else { 1.860 + printf("U+%04X", c); 1.861 + } 1.862 +} 1.863 +void dump(const UChar32* list, int32_t len) { 1.864 + printf("["); 1.865 + for (int32_t i=0; i<len; ++i) { 1.866 + if (i != 0) printf(", "); 1.867 + dump(list[i]); 1.868 + } 1.869 + printf("]"); 1.870 +} 1.871 +#endif 1.872 + 1.873 +/** 1.874 + * Adds the specified character to this set if it is not already 1.875 + * present. If this set already contains the specified character, 1.876 + * the call leaves this set unchanged. 1.877 + */ 1.878 +UnicodeSet& UnicodeSet::add(UChar32 c) { 1.879 + // find smallest i such that c < list[i] 1.880 + // if odd, then it is IN the set 1.881 + // if even, then it is OUT of the set 1.882 + int32_t i = findCodePoint(pinCodePoint(c)); 1.883 + 1.884 + // already in set? 1.885 + if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; 1.886 + 1.887 + // HIGH is 0x110000 1.888 + // assert(list[len-1] == HIGH); 1.889 + 1.890 + // empty = [HIGH] 1.891 + // [start_0, limit_0, start_1, limit_1, HIGH] 1.892 + 1.893 + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1.894 + // ^ 1.895 + // list[i] 1.896 + 1.897 + // i == 0 means c is before the first range 1.898 + 1.899 +#ifdef DEBUG_US_ADD 1.900 + printf("Add of "); 1.901 + dump(c); 1.902 + printf(" found at %d", i); 1.903 + printf(": "); 1.904 + dump(list, len); 1.905 + printf(" => "); 1.906 +#endif 1.907 + 1.908 + if (c == list[i]-1) { 1.909 + // c is before start of next range 1.910 + list[i] = c; 1.911 + // if we touched the HIGH mark, then add a new one 1.912 + if (c == (UNICODESET_HIGH - 1)) { 1.913 + UErrorCode status = U_ZERO_ERROR; 1.914 + ensureCapacity(len+1, status); 1.915 + if (U_FAILURE(status)) { 1.916 + return *this; // There is no way to report this error :-( 1.917 + } 1.918 + list[len++] = UNICODESET_HIGH; 1.919 + } 1.920 + if (i > 0 && c == list[i-1]) { 1.921 + // collapse adjacent ranges 1.922 + 1.923 + // [..., start_k-1, c, c, limit_k, ..., HIGH] 1.924 + // ^ 1.925 + // list[i] 1.926 + 1.927 + //for (int32_t k=i-1; k<len-2; ++k) { 1.928 + // list[k] = list[k+2]; 1.929 + //} 1.930 + UChar32* dst = list + i - 1; 1.931 + UChar32* src = dst + 2; 1.932 + UChar32* srclimit = list + len; 1.933 + while (src < srclimit) *(dst++) = *(src++); 1.934 + 1.935 + len -= 2; 1.936 + } 1.937 + } 1.938 + 1.939 + else if (i > 0 && c == list[i-1]) { 1.940 + // c is after end of prior range 1.941 + list[i-1]++; 1.942 + // no need to check for collapse here 1.943 + } 1.944 + 1.945 + else { 1.946 + // At this point we know the new char is not adjacent to 1.947 + // any existing ranges, and it is not 10FFFF. 1.948 + 1.949 + 1.950 + // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] 1.951 + // ^ 1.952 + // list[i] 1.953 + 1.954 + // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] 1.955 + // ^ 1.956 + // list[i] 1.957 + 1.958 + UErrorCode status = U_ZERO_ERROR; 1.959 + ensureCapacity(len+2, status); 1.960 + if (U_FAILURE(status)) { 1.961 + return *this; // There is no way to report this error :-( 1.962 + } 1.963 + 1.964 + //for (int32_t k=len-1; k>=i; --k) { 1.965 + // list[k+2] = list[k]; 1.966 + //} 1.967 + UChar32* src = list + len; 1.968 + UChar32* dst = src + 2; 1.969 + UChar32* srclimit = list + i; 1.970 + while (src > srclimit) *(--dst) = *(--src); 1.971 + 1.972 + list[i] = c; 1.973 + list[i+1] = c+1; 1.974 + len += 2; 1.975 + } 1.976 + 1.977 +#ifdef DEBUG_US_ADD 1.978 + dump(list, len); 1.979 + printf("\n"); 1.980 + 1.981 + for (i=1; i<len; ++i) { 1.982 + if (list[i] <= list[i-1]) { 1.983 + // Corrupt array! 1.984 + printf("ERROR: list has been corrupted\n"); 1.985 + exit(1); 1.986 + } 1.987 + } 1.988 +#endif 1.989 + 1.990 + releasePattern(); 1.991 + return *this; 1.992 +} 1.993 + 1.994 +/** 1.995 + * Adds the specified multicharacter to this set if it is not already 1.996 + * present. If this set already contains the multicharacter, 1.997 + * the call leaves this set unchanged. 1.998 + * Thus "ch" => {"ch"} 1.999 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1.1000 + * @param s the source string 1.1001 + * @return the modified set, for chaining 1.1002 + */ 1.1003 +UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 1.1004 + if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1.1005 + int32_t cp = getSingleCP(s); 1.1006 + if (cp < 0) { 1.1007 + if (!strings->contains((void*) &s)) { 1.1008 + _add(s); 1.1009 + releasePattern(); 1.1010 + } 1.1011 + } else { 1.1012 + add((UChar32)cp); 1.1013 + } 1.1014 + return *this; 1.1015 +} 1.1016 + 1.1017 +/** 1.1018 + * Adds the given string, in order, to 'strings'. The given string 1.1019 + * must have been checked by the caller to not be empty and to not 1.1020 + * already be in 'strings'. 1.1021 + */ 1.1022 +void UnicodeSet::_add(const UnicodeString& s) { 1.1023 + if (isFrozen() || isBogus()) { 1.1024 + return; 1.1025 + } 1.1026 + UnicodeString* t = new UnicodeString(s); 1.1027 + if (t == NULL) { // Check for memory allocation error. 1.1028 + setToBogus(); 1.1029 + return; 1.1030 + } 1.1031 + UErrorCode ec = U_ZERO_ERROR; 1.1032 + strings->sortedInsert(t, compareUnicodeString, ec); 1.1033 + if (U_FAILURE(ec)) { 1.1034 + setToBogus(); 1.1035 + delete t; 1.1036 + } 1.1037 +} 1.1038 + 1.1039 +/** 1.1040 + * @return a code point IF the string consists of a single one. 1.1041 + * otherwise returns -1. 1.1042 + * @param string to test 1.1043 + */ 1.1044 +int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { 1.1045 + //if (s.length() < 1) { 1.1046 + // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); 1.1047 + //} 1.1048 + if (s.length() > 2) return -1; 1.1049 + if (s.length() == 1) return s.charAt(0); 1.1050 + 1.1051 + // at this point, len = 2 1.1052 + UChar32 cp = s.char32At(0); 1.1053 + if (cp > 0xFFFF) { // is surrogate pair 1.1054 + return cp; 1.1055 + } 1.1056 + return -1; 1.1057 +} 1.1058 + 1.1059 +/** 1.1060 + * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} 1.1061 + * If this set already any particular character, it has no effect on that character. 1.1062 + * @param the source string 1.1063 + * @return the modified set, for chaining 1.1064 + */ 1.1065 +UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { 1.1066 + UChar32 cp; 1.1067 + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1.1068 + cp = s.char32At(i); 1.1069 + add(cp); 1.1070 + } 1.1071 + return *this; 1.1072 +} 1.1073 + 1.1074 +/** 1.1075 + * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 1.1076 + * If this set already any particular character, it has no effect on that character. 1.1077 + * @param the source string 1.1078 + * @return the modified set, for chaining 1.1079 + */ 1.1080 +UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { 1.1081 + UnicodeSet set; 1.1082 + set.addAll(s); 1.1083 + retainAll(set); 1.1084 + return *this; 1.1085 +} 1.1086 + 1.1087 +/** 1.1088 + * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} 1.1089 + * If this set already any particular character, it has no effect on that character. 1.1090 + * @param the source string 1.1091 + * @return the modified set, for chaining 1.1092 + */ 1.1093 +UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { 1.1094 + UnicodeSet set; 1.1095 + set.addAll(s); 1.1096 + complementAll(set); 1.1097 + return *this; 1.1098 +} 1.1099 + 1.1100 +/** 1.1101 + * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} 1.1102 + * If this set already any particular character, it has no effect on that character. 1.1103 + * @param the source string 1.1104 + * @return the modified set, for chaining 1.1105 + */ 1.1106 +UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 1.1107 + UnicodeSet set; 1.1108 + set.addAll(s); 1.1109 + removeAll(set); 1.1110 + return *this; 1.1111 +} 1.1112 + 1.1113 +UnicodeSet& UnicodeSet::removeAllStrings() { 1.1114 + strings->removeAllElements(); 1.1115 + return *this; 1.1116 +} 1.1117 + 1.1118 + 1.1119 +/** 1.1120 + * Makes a set from a multicharacter string. Thus "ch" => {"ch"} 1.1121 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1.1122 + * @param the source string 1.1123 + * @return a newly created set containing the given string 1.1124 + */ 1.1125 +UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { 1.1126 + UnicodeSet *set = new UnicodeSet(); 1.1127 + if (set != NULL) { // Check for memory allocation error. 1.1128 + set->add(s); 1.1129 + } 1.1130 + return set; 1.1131 +} 1.1132 + 1.1133 + 1.1134 +/** 1.1135 + * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} 1.1136 + * @param the source string 1.1137 + * @return a newly created set containing the given characters 1.1138 + */ 1.1139 +UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { 1.1140 + UnicodeSet *set = new UnicodeSet(); 1.1141 + if (set != NULL) { // Check for memory allocation error. 1.1142 + set->addAll(s); 1.1143 + } 1.1144 + return set; 1.1145 +} 1.1146 + 1.1147 +/** 1.1148 + * Retain only the elements in this set that are contained in the 1.1149 + * specified range. If <code>end > start</code> then an empty range is 1.1150 + * retained, leaving the set empty. 1.1151 + * 1.1152 + * @param start first character, inclusive, of range to be retained 1.1153 + * to this set. 1.1154 + * @param end last character, inclusive, of range to be retained 1.1155 + * to this set. 1.1156 + */ 1.1157 +UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { 1.1158 + if (pinCodePoint(start) <= pinCodePoint(end)) { 1.1159 + UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1.1160 + retain(range, 2, 0); 1.1161 + } else { 1.1162 + clear(); 1.1163 + } 1.1164 + return *this; 1.1165 +} 1.1166 + 1.1167 +UnicodeSet& UnicodeSet::retain(UChar32 c) { 1.1168 + return retain(c, c); 1.1169 +} 1.1170 + 1.1171 +/** 1.1172 + * Removes the specified range from this set if it is present. 1.1173 + * The set will not contain the specified range once the call 1.1174 + * returns. If <code>end > start</code> then an empty range is 1.1175 + * removed, leaving the set unchanged. 1.1176 + * 1.1177 + * @param start first character, inclusive, of range to be removed 1.1178 + * from this set. 1.1179 + * @param end last character, inclusive, of range to be removed 1.1180 + * from this set. 1.1181 + */ 1.1182 +UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { 1.1183 + if (pinCodePoint(start) <= pinCodePoint(end)) { 1.1184 + UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1.1185 + retain(range, 2, 2); 1.1186 + } 1.1187 + return *this; 1.1188 +} 1.1189 + 1.1190 +/** 1.1191 + * Removes the specified character from this set if it is present. 1.1192 + * The set will not contain the specified range once the call 1.1193 + * returns. 1.1194 + */ 1.1195 +UnicodeSet& UnicodeSet::remove(UChar32 c) { 1.1196 + return remove(c, c); 1.1197 +} 1.1198 + 1.1199 +/** 1.1200 + * Removes the specified string from this set if it is present. 1.1201 + * The set will not contain the specified character once the call 1.1202 + * returns. 1.1203 + * @param the source string 1.1204 + * @return the modified set, for chaining 1.1205 + */ 1.1206 +UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 1.1207 + if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1.1208 + int32_t cp = getSingleCP(s); 1.1209 + if (cp < 0) { 1.1210 + strings->removeElement((void*) &s); 1.1211 + releasePattern(); 1.1212 + } else { 1.1213 + remove((UChar32)cp, (UChar32)cp); 1.1214 + } 1.1215 + return *this; 1.1216 +} 1.1217 + 1.1218 +/** 1.1219 + * Complements the specified range in this set. Any character in 1.1220 + * the range will be removed if it is in this set, or will be 1.1221 + * added if it is not in this set. If <code>end > start</code> 1.1222 + * then an empty range is xor'ed, leaving the set unchanged. 1.1223 + * 1.1224 + * @param start first character, inclusive, of range to be removed 1.1225 + * from this set. 1.1226 + * @param end last character, inclusive, of range to be removed 1.1227 + * from this set. 1.1228 + */ 1.1229 +UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { 1.1230 + if (isFrozen() || isBogus()) { 1.1231 + return *this; 1.1232 + } 1.1233 + if (pinCodePoint(start) <= pinCodePoint(end)) { 1.1234 + UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 1.1235 + exclusiveOr(range, 2, 0); 1.1236 + } 1.1237 + releasePattern(); 1.1238 + return *this; 1.1239 +} 1.1240 + 1.1241 +UnicodeSet& UnicodeSet::complement(UChar32 c) { 1.1242 + return complement(c, c); 1.1243 +} 1.1244 + 1.1245 +/** 1.1246 + * This is equivalent to 1.1247 + * <code>complement(MIN_VALUE, MAX_VALUE)</code>. 1.1248 + */ 1.1249 +UnicodeSet& UnicodeSet::complement(void) { 1.1250 + if (isFrozen() || isBogus()) { 1.1251 + return *this; 1.1252 + } 1.1253 + UErrorCode status = U_ZERO_ERROR; 1.1254 + if (list[0] == UNICODESET_LOW) { 1.1255 + ensureBufferCapacity(len-1, status); 1.1256 + if (U_FAILURE(status)) { 1.1257 + return *this; 1.1258 + } 1.1259 + uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); 1.1260 + --len; 1.1261 + } else { 1.1262 + ensureBufferCapacity(len+1, status); 1.1263 + if (U_FAILURE(status)) { 1.1264 + return *this; 1.1265 + } 1.1266 + uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); 1.1267 + buffer[0] = UNICODESET_LOW; 1.1268 + ++len; 1.1269 + } 1.1270 + swapBuffers(); 1.1271 + releasePattern(); 1.1272 + return *this; 1.1273 +} 1.1274 + 1.1275 +/** 1.1276 + * Complement the specified string in this set. 1.1277 + * The set will not contain the specified string once the call 1.1278 + * returns. 1.1279 + * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> 1.1280 + * @param s the string to complement 1.1281 + * @return this object, for chaining 1.1282 + */ 1.1283 +UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 1.1284 + if (s.length() == 0 || isFrozen() || isBogus()) return *this; 1.1285 + int32_t cp = getSingleCP(s); 1.1286 + if (cp < 0) { 1.1287 + if (strings->contains((void*) &s)) { 1.1288 + strings->removeElement((void*) &s); 1.1289 + } else { 1.1290 + _add(s); 1.1291 + } 1.1292 + releasePattern(); 1.1293 + } else { 1.1294 + complement((UChar32)cp, (UChar32)cp); 1.1295 + } 1.1296 + return *this; 1.1297 +} 1.1298 + 1.1299 +/** 1.1300 + * Adds all of the elements in the specified set to this set if 1.1301 + * they're not already present. This operation effectively 1.1302 + * modifies this set so that its value is the <i>union</i> of the two 1.1303 + * sets. The behavior of this operation is unspecified if the specified 1.1304 + * collection is modified while the operation is in progress. 1.1305 + * 1.1306 + * @param c set whose elements are to be added to this set. 1.1307 + * @see #add(char, char) 1.1308 + */ 1.1309 +UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 1.1310 + if ( c.len>0 && c.list!=NULL ) { 1.1311 + add(c.list, c.len, 0); 1.1312 + } 1.1313 + 1.1314 + // Add strings in order 1.1315 + if ( c.strings!=NULL ) { 1.1316 + for (int32_t i=0; i<c.strings->size(); ++i) { 1.1317 + const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); 1.1318 + if (!strings->contains((void*) s)) { 1.1319 + _add(*s); 1.1320 + } 1.1321 + } 1.1322 + } 1.1323 + return *this; 1.1324 +} 1.1325 + 1.1326 +/** 1.1327 + * Retains only the elements in this set that are contained in the 1.1328 + * specified set. In other words, removes from this set all of 1.1329 + * its elements that are not contained in the specified set. This 1.1330 + * operation effectively modifies this set so that its value is 1.1331 + * the <i>intersection</i> of the two sets. 1.1332 + * 1.1333 + * @param c set that defines which elements this set will retain. 1.1334 + */ 1.1335 +UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 1.1336 + if (isFrozen() || isBogus()) { 1.1337 + return *this; 1.1338 + } 1.1339 + retain(c.list, c.len, 0); 1.1340 + strings->retainAll(*c.strings); 1.1341 + return *this; 1.1342 +} 1.1343 + 1.1344 +/** 1.1345 + * Removes from this set all of its elements that are contained in the 1.1346 + * specified set. This operation effectively modifies this 1.1347 + * set so that its value is the <i>asymmetric set difference</i> of 1.1348 + * the two sets. 1.1349 + * 1.1350 + * @param c set that defines which elements will be removed from 1.1351 + * this set. 1.1352 + */ 1.1353 +UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 1.1354 + if (isFrozen() || isBogus()) { 1.1355 + return *this; 1.1356 + } 1.1357 + retain(c.list, c.len, 2); 1.1358 + strings->removeAll(*c.strings); 1.1359 + return *this; 1.1360 +} 1.1361 + 1.1362 +/** 1.1363 + * Complements in this set all elements contained in the specified 1.1364 + * set. Any character in the other set will be removed if it is 1.1365 + * in this set, or will be added if it is not in this set. 1.1366 + * 1.1367 + * @param c set that defines which elements will be xor'ed from 1.1368 + * this set. 1.1369 + */ 1.1370 +UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 1.1371 + if (isFrozen() || isBogus()) { 1.1372 + return *this; 1.1373 + } 1.1374 + exclusiveOr(c.list, c.len, 0); 1.1375 + 1.1376 + for (int32_t i=0; i<c.strings->size(); ++i) { 1.1377 + void* e = c.strings->elementAt(i); 1.1378 + if (!strings->removeElement(e)) { 1.1379 + _add(*(const UnicodeString*)e); 1.1380 + } 1.1381 + } 1.1382 + return *this; 1.1383 +} 1.1384 + 1.1385 +/** 1.1386 + * Removes all of the elements from this set. This set will be 1.1387 + * empty after this call returns. 1.1388 + */ 1.1389 +UnicodeSet& UnicodeSet::clear(void) { 1.1390 + if (isFrozen()) { 1.1391 + return *this; 1.1392 + } 1.1393 + if (list != NULL) { 1.1394 + list[0] = UNICODESET_HIGH; 1.1395 + } 1.1396 + len = 1; 1.1397 + releasePattern(); 1.1398 + if (strings != NULL) { 1.1399 + strings->removeAllElements(); 1.1400 + } 1.1401 + if (list != NULL && strings != NULL) { 1.1402 + // Remove bogus 1.1403 + fFlags = 0; 1.1404 + } 1.1405 + return *this; 1.1406 +} 1.1407 + 1.1408 +/** 1.1409 + * Iteration method that returns the number of ranges contained in 1.1410 + * this set. 1.1411 + * @see #getRangeStart 1.1412 + * @see #getRangeEnd 1.1413 + */ 1.1414 +int32_t UnicodeSet::getRangeCount() const { 1.1415 + return len/2; 1.1416 +} 1.1417 + 1.1418 +/** 1.1419 + * Iteration method that returns the first character in the 1.1420 + * specified range of this set. 1.1421 + * @see #getRangeCount 1.1422 + * @see #getRangeEnd 1.1423 + */ 1.1424 +UChar32 UnicodeSet::getRangeStart(int32_t index) const { 1.1425 + return list[index*2]; 1.1426 +} 1.1427 + 1.1428 +/** 1.1429 + * Iteration method that returns the last character in the 1.1430 + * specified range of this set. 1.1431 + * @see #getRangeStart 1.1432 + * @see #getRangeEnd 1.1433 + */ 1.1434 +UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1.1435 + return list[index*2 + 1] - 1; 1.1436 +} 1.1437 + 1.1438 +int32_t UnicodeSet::getStringCount() const { 1.1439 + return strings->size(); 1.1440 +} 1.1441 + 1.1442 +const UnicodeString* UnicodeSet::getString(int32_t index) const { 1.1443 + return (const UnicodeString*) strings->elementAt(index); 1.1444 +} 1.1445 + 1.1446 +/** 1.1447 + * Reallocate this objects internal structures to take up the least 1.1448 + * possible space, without changing this object's value. 1.1449 + */ 1.1450 +UnicodeSet& UnicodeSet::compact() { 1.1451 + if (isFrozen() || isBogus()) { 1.1452 + return *this; 1.1453 + } 1.1454 + // Delete buffer first to defragment memory less. 1.1455 + if (buffer != NULL) { 1.1456 + uprv_free(buffer); 1.1457 + buffer = NULL; 1.1458 + } 1.1459 + if (len < capacity) { 1.1460 + // Make the capacity equal to len or 1. 1.1461 + // We don't want to realloc of 0 size. 1.1462 + int32_t newCapacity = len + (len == 0); 1.1463 + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); 1.1464 + if (temp) { 1.1465 + list = temp; 1.1466 + capacity = newCapacity; 1.1467 + } 1.1468 + // else what the heck happened?! We allocated less memory! 1.1469 + // Oh well. We'll keep our original array. 1.1470 + } 1.1471 + return *this; 1.1472 +} 1.1473 + 1.1474 +int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { 1.1475 + int32_t bmpLength, length, destLength; 1.1476 + 1.1477 + if (U_FAILURE(ec)) { 1.1478 + return 0; 1.1479 + } 1.1480 + 1.1481 + if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { 1.1482 + ec=U_ILLEGAL_ARGUMENT_ERROR; 1.1483 + return 0; 1.1484 + } 1.1485 + 1.1486 + /* count necessary 16-bit units */ 1.1487 + length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH 1.1488 + // assert(length>=0); 1.1489 + if (length==0) { 1.1490 + /* empty set */ 1.1491 + if (destCapacity>0) { 1.1492 + *dest=0; 1.1493 + } else { 1.1494 + ec=U_BUFFER_OVERFLOW_ERROR; 1.1495 + } 1.1496 + return 1; 1.1497 + } 1.1498 + /* now length>0 */ 1.1499 + 1.1500 + if (this->list[length-1]<=0xffff) { 1.1501 + /* all BMP */ 1.1502 + bmpLength=length; 1.1503 + } else if (this->list[0]>=0x10000) { 1.1504 + /* all supplementary */ 1.1505 + bmpLength=0; 1.1506 + length*=2; 1.1507 + } else { 1.1508 + /* some BMP, some supplementary */ 1.1509 + for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} 1.1510 + length=bmpLength+2*(length-bmpLength); 1.1511 + } 1.1512 + 1.1513 + /* length: number of 16-bit array units */ 1.1514 + if (length>0x7fff) { 1.1515 + /* there are only 15 bits for the length in the first serialized word */ 1.1516 + ec=U_INDEX_OUTOFBOUNDS_ERROR; 1.1517 + return 0; 1.1518 + } 1.1519 + 1.1520 + /* 1.1521 + * total serialized length: 1.1522 + * number of 16-bit array units (length) + 1.1523 + * 1 length unit (always) + 1.1524 + * 1 bmpLength unit (if there are supplementary values) 1.1525 + */ 1.1526 + destLength=length+((length>bmpLength)?2:1); 1.1527 + if (destLength<=destCapacity) { 1.1528 + const UChar32 *p; 1.1529 + int32_t i; 1.1530 + 1.1531 + *dest=(uint16_t)length; 1.1532 + if (length>bmpLength) { 1.1533 + *dest|=0x8000; 1.1534 + *++dest=(uint16_t)bmpLength; 1.1535 + } 1.1536 + ++dest; 1.1537 + 1.1538 + /* write the BMP part of the array */ 1.1539 + p=this->list; 1.1540 + for (i=0; i<bmpLength; ++i) { 1.1541 + *dest++=(uint16_t)*p++; 1.1542 + } 1.1543 + 1.1544 + /* write the supplementary part of the array */ 1.1545 + for (; i<length; i+=2) { 1.1546 + *dest++=(uint16_t)(*p>>16); 1.1547 + *dest++=(uint16_t)*p++; 1.1548 + } 1.1549 + } else { 1.1550 + ec=U_BUFFER_OVERFLOW_ERROR; 1.1551 + } 1.1552 + return destLength; 1.1553 +} 1.1554 + 1.1555 +//---------------------------------------------------------------- 1.1556 +// Implementation: Utility methods 1.1557 +//---------------------------------------------------------------- 1.1558 + 1.1559 +/** 1.1560 + * Allocate our strings vector and return TRUE if successful. 1.1561 + */ 1.1562 +UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1.1563 + if (U_FAILURE(status)) { 1.1564 + return FALSE; 1.1565 + } 1.1566 + strings = new UVector(uprv_deleteUObject, 1.1567 + uhash_compareUnicodeString, 1, status); 1.1568 + if (strings == NULL) { // Check for memory allocation error. 1.1569 + status = U_MEMORY_ALLOCATION_ERROR; 1.1570 + return FALSE; 1.1571 + } 1.1572 + if (U_FAILURE(status)) { 1.1573 + delete strings; 1.1574 + strings = NULL; 1.1575 + return FALSE; 1.1576 + } 1.1577 + return TRUE; 1.1578 +} 1.1579 + 1.1580 +void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { 1.1581 + if (newLen <= capacity) 1.1582 + return; 1.1583 + UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1.1584 + if (temp == NULL) { 1.1585 + ec = U_MEMORY_ALLOCATION_ERROR; 1.1586 + setToBogus(); 1.1587 + return; 1.1588 + } 1.1589 + list = temp; 1.1590 + capacity = newLen + GROW_EXTRA; 1.1591 + // else we keep the original contents on the memory failure. 1.1592 +} 1.1593 + 1.1594 +void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { 1.1595 + if (buffer != NULL && newLen <= bufferCapacity) 1.1596 + return; 1.1597 + UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1.1598 + if (temp == NULL) { 1.1599 + ec = U_MEMORY_ALLOCATION_ERROR; 1.1600 + setToBogus(); 1.1601 + return; 1.1602 + } 1.1603 + buffer = temp; 1.1604 + bufferCapacity = newLen + GROW_EXTRA; 1.1605 + // else we keep the original contents on the memory failure. 1.1606 +} 1.1607 + 1.1608 +/** 1.1609 + * Swap list and buffer. 1.1610 + */ 1.1611 +void UnicodeSet::swapBuffers(void) { 1.1612 + // swap list and buffer 1.1613 + UChar32* temp = list; 1.1614 + list = buffer; 1.1615 + buffer = temp; 1.1616 + 1.1617 + int32_t c = capacity; 1.1618 + capacity = bufferCapacity; 1.1619 + bufferCapacity = c; 1.1620 +} 1.1621 + 1.1622 +void UnicodeSet::setToBogus() { 1.1623 + clear(); // Remove everything in the set. 1.1624 + fFlags = kIsBogus; 1.1625 +} 1.1626 + 1.1627 +//---------------------------------------------------------------- 1.1628 +// Implementation: Fundamental operators 1.1629 +//---------------------------------------------------------------- 1.1630 + 1.1631 +static inline UChar32 max(UChar32 a, UChar32 b) { 1.1632 + return (a > b) ? a : b; 1.1633 +} 1.1634 + 1.1635 +// polarity = 0, 3 is normal: x xor y 1.1636 +// polarity = 1, 2: x xor ~y == x === y 1.1637 + 1.1638 +void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { 1.1639 + if (isFrozen() || isBogus()) { 1.1640 + return; 1.1641 + } 1.1642 + UErrorCode status = U_ZERO_ERROR; 1.1643 + ensureBufferCapacity(len + otherLen, status); 1.1644 + if (U_FAILURE(status)) { 1.1645 + return; 1.1646 + } 1.1647 + 1.1648 + int32_t i = 0, j = 0, k = 0; 1.1649 + UChar32 a = list[i++]; 1.1650 + UChar32 b; 1.1651 + if (polarity == 1 || polarity == 2) { 1.1652 + b = UNICODESET_LOW; 1.1653 + if (other[j] == UNICODESET_LOW) { // skip base if already LOW 1.1654 + ++j; 1.1655 + b = other[j]; 1.1656 + } 1.1657 + } else { 1.1658 + b = other[j++]; 1.1659 + } 1.1660 + // simplest of all the routines 1.1661 + // sort the values, discarding identicals! 1.1662 + for (;;) { 1.1663 + if (a < b) { 1.1664 + buffer[k++] = a; 1.1665 + a = list[i++]; 1.1666 + } else if (b < a) { 1.1667 + buffer[k++] = b; 1.1668 + b = other[j++]; 1.1669 + } else if (a != UNICODESET_HIGH) { // at this point, a == b 1.1670 + // discard both values! 1.1671 + a = list[i++]; 1.1672 + b = other[j++]; 1.1673 + } else { // DONE! 1.1674 + buffer[k++] = UNICODESET_HIGH; 1.1675 + len = k; 1.1676 + break; 1.1677 + } 1.1678 + } 1.1679 + swapBuffers(); 1.1680 + releasePattern(); 1.1681 +} 1.1682 + 1.1683 +// polarity = 0 is normal: x union y 1.1684 +// polarity = 2: x union ~y 1.1685 +// polarity = 1: ~x union y 1.1686 +// polarity = 3: ~x union ~y 1.1687 + 1.1688 +void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1.1689 + if (isFrozen() || isBogus() || other==NULL) { 1.1690 + return; 1.1691 + } 1.1692 + UErrorCode status = U_ZERO_ERROR; 1.1693 + ensureBufferCapacity(len + otherLen, status); 1.1694 + if (U_FAILURE(status)) { 1.1695 + return; 1.1696 + } 1.1697 + 1.1698 + int32_t i = 0, j = 0, k = 0; 1.1699 + UChar32 a = list[i++]; 1.1700 + UChar32 b = other[j++]; 1.1701 + // change from xor is that we have to check overlapping pairs 1.1702 + // polarity bit 1 means a is second, bit 2 means b is. 1.1703 + for (;;) { 1.1704 + switch (polarity) { 1.1705 + case 0: // both first; take lower if unequal 1.1706 + if (a < b) { // take a 1.1707 + // Back up over overlapping ranges in buffer[] 1.1708 + if (k > 0 && a <= buffer[k-1]) { 1.1709 + // Pick latter end value in buffer[] vs. list[] 1.1710 + a = max(list[i], buffer[--k]); 1.1711 + } else { 1.1712 + // No overlap 1.1713 + buffer[k++] = a; 1.1714 + a = list[i]; 1.1715 + } 1.1716 + i++; // Common if/else code factored out 1.1717 + polarity ^= 1; 1.1718 + } else if (b < a) { // take b 1.1719 + if (k > 0 && b <= buffer[k-1]) { 1.1720 + b = max(other[j], buffer[--k]); 1.1721 + } else { 1.1722 + buffer[k++] = b; 1.1723 + b = other[j]; 1.1724 + } 1.1725 + j++; 1.1726 + polarity ^= 2; 1.1727 + } else { // a == b, take a, drop b 1.1728 + if (a == UNICODESET_HIGH) goto loop_end; 1.1729 + // This is symmetrical; it doesn't matter if 1.1730 + // we backtrack with a or b. - liu 1.1731 + if (k > 0 && a <= buffer[k-1]) { 1.1732 + a = max(list[i], buffer[--k]); 1.1733 + } else { 1.1734 + // No overlap 1.1735 + buffer[k++] = a; 1.1736 + a = list[i]; 1.1737 + } 1.1738 + i++; 1.1739 + polarity ^= 1; 1.1740 + b = other[j++]; 1.1741 + polarity ^= 2; 1.1742 + } 1.1743 + break; 1.1744 + case 3: // both second; take higher if unequal, and drop other 1.1745 + if (b <= a) { // take a 1.1746 + if (a == UNICODESET_HIGH) goto loop_end; 1.1747 + buffer[k++] = a; 1.1748 + } else { // take b 1.1749 + if (b == UNICODESET_HIGH) goto loop_end; 1.1750 + buffer[k++] = b; 1.1751 + } 1.1752 + a = list[i++]; 1.1753 + polarity ^= 1; // factored common code 1.1754 + b = other[j++]; 1.1755 + polarity ^= 2; 1.1756 + break; 1.1757 + case 1: // a second, b first; if b < a, overlap 1.1758 + if (a < b) { // no overlap, take a 1.1759 + buffer[k++] = a; a = list[i++]; polarity ^= 1; 1.1760 + } else if (b < a) { // OVERLAP, drop b 1.1761 + b = other[j++]; 1.1762 + polarity ^= 2; 1.1763 + } else { // a == b, drop both! 1.1764 + if (a == UNICODESET_HIGH) goto loop_end; 1.1765 + a = list[i++]; 1.1766 + polarity ^= 1; 1.1767 + b = other[j++]; 1.1768 + polarity ^= 2; 1.1769 + } 1.1770 + break; 1.1771 + case 2: // a first, b second; if a < b, overlap 1.1772 + if (b < a) { // no overlap, take b 1.1773 + buffer[k++] = b; 1.1774 + b = other[j++]; 1.1775 + polarity ^= 2; 1.1776 + } else if (a < b) { // OVERLAP, drop a 1.1777 + a = list[i++]; 1.1778 + polarity ^= 1; 1.1779 + } else { // a == b, drop both! 1.1780 + if (a == UNICODESET_HIGH) goto loop_end; 1.1781 + a = list[i++]; 1.1782 + polarity ^= 1; 1.1783 + b = other[j++]; 1.1784 + polarity ^= 2; 1.1785 + } 1.1786 + break; 1.1787 + } 1.1788 + } 1.1789 + loop_end: 1.1790 + buffer[k++] = UNICODESET_HIGH; // terminate 1.1791 + len = k; 1.1792 + swapBuffers(); 1.1793 + releasePattern(); 1.1794 +} 1.1795 + 1.1796 +// polarity = 0 is normal: x intersect y 1.1797 +// polarity = 2: x intersect ~y == set-minus 1.1798 +// polarity = 1: ~x intersect y 1.1799 +// polarity = 3: ~x intersect ~y 1.1800 + 1.1801 +void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { 1.1802 + if (isFrozen() || isBogus()) { 1.1803 + return; 1.1804 + } 1.1805 + UErrorCode status = U_ZERO_ERROR; 1.1806 + ensureBufferCapacity(len + otherLen, status); 1.1807 + if (U_FAILURE(status)) { 1.1808 + return; 1.1809 + } 1.1810 + 1.1811 + int32_t i = 0, j = 0, k = 0; 1.1812 + UChar32 a = list[i++]; 1.1813 + UChar32 b = other[j++]; 1.1814 + // change from xor is that we have to check overlapping pairs 1.1815 + // polarity bit 1 means a is second, bit 2 means b is. 1.1816 + for (;;) { 1.1817 + switch (polarity) { 1.1818 + case 0: // both first; drop the smaller 1.1819 + if (a < b) { // drop a 1.1820 + a = list[i++]; 1.1821 + polarity ^= 1; 1.1822 + } else if (b < a) { // drop b 1.1823 + b = other[j++]; 1.1824 + polarity ^= 2; 1.1825 + } else { // a == b, take one, drop other 1.1826 + if (a == UNICODESET_HIGH) goto loop_end; 1.1827 + buffer[k++] = a; 1.1828 + a = list[i++]; 1.1829 + polarity ^= 1; 1.1830 + b = other[j++]; 1.1831 + polarity ^= 2; 1.1832 + } 1.1833 + break; 1.1834 + case 3: // both second; take lower if unequal 1.1835 + if (a < b) { // take a 1.1836 + buffer[k++] = a; 1.1837 + a = list[i++]; 1.1838 + polarity ^= 1; 1.1839 + } else if (b < a) { // take b 1.1840 + buffer[k++] = b; 1.1841 + b = other[j++]; 1.1842 + polarity ^= 2; 1.1843 + } else { // a == b, take one, drop other 1.1844 + if (a == UNICODESET_HIGH) goto loop_end; 1.1845 + buffer[k++] = a; 1.1846 + a = list[i++]; 1.1847 + polarity ^= 1; 1.1848 + b = other[j++]; 1.1849 + polarity ^= 2; 1.1850 + } 1.1851 + break; 1.1852 + case 1: // a second, b first; 1.1853 + if (a < b) { // NO OVERLAP, drop a 1.1854 + a = list[i++]; 1.1855 + polarity ^= 1; 1.1856 + } else if (b < a) { // OVERLAP, take b 1.1857 + buffer[k++] = b; 1.1858 + b = other[j++]; 1.1859 + polarity ^= 2; 1.1860 + } else { // a == b, drop both! 1.1861 + if (a == UNICODESET_HIGH) goto loop_end; 1.1862 + a = list[i++]; 1.1863 + polarity ^= 1; 1.1864 + b = other[j++]; 1.1865 + polarity ^= 2; 1.1866 + } 1.1867 + break; 1.1868 + case 2: // a first, b second; if a < b, overlap 1.1869 + if (b < a) { // no overlap, drop b 1.1870 + b = other[j++]; 1.1871 + polarity ^= 2; 1.1872 + } else if (a < b) { // OVERLAP, take a 1.1873 + buffer[k++] = a; 1.1874 + a = list[i++]; 1.1875 + polarity ^= 1; 1.1876 + } else { // a == b, drop both! 1.1877 + if (a == UNICODESET_HIGH) goto loop_end; 1.1878 + a = list[i++]; 1.1879 + polarity ^= 1; 1.1880 + b = other[j++]; 1.1881 + polarity ^= 2; 1.1882 + } 1.1883 + break; 1.1884 + } 1.1885 + } 1.1886 + loop_end: 1.1887 + buffer[k++] = UNICODESET_HIGH; // terminate 1.1888 + len = k; 1.1889 + swapBuffers(); 1.1890 + releasePattern(); 1.1891 +} 1.1892 + 1.1893 +/** 1.1894 + * Append the <code>toPattern()</code> representation of a 1.1895 + * string to the given <code>StringBuffer</code>. 1.1896 + */ 1.1897 +void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool 1.1898 +escapeUnprintable) { 1.1899 + UChar32 cp; 1.1900 + for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { 1.1901 + _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); 1.1902 + } 1.1903 +} 1.1904 + 1.1905 +/** 1.1906 + * Append the <code>toPattern()</code> representation of a 1.1907 + * character to the given <code>StringBuffer</code>. 1.1908 + */ 1.1909 +void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool 1.1910 +escapeUnprintable) { 1.1911 + if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1.1912 + // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything 1.1913 + // unprintable 1.1914 + if (ICU_Utility::escapeUnprintable(buf, c)) { 1.1915 + return; 1.1916 + } 1.1917 + } 1.1918 + // Okay to let ':' pass through 1.1919 + switch (c) { 1.1920 + case SET_OPEN: 1.1921 + case SET_CLOSE: 1.1922 + case HYPHEN: 1.1923 + case COMPLEMENT: 1.1924 + case INTERSECTION: 1.1925 + case BACKSLASH: 1.1926 + case OPEN_BRACE: 1.1927 + case CLOSE_BRACE: 1.1928 + case COLON: 1.1929 + case SymbolTable::SYMBOL_REF: 1.1930 + buf.append(BACKSLASH); 1.1931 + break; 1.1932 + default: 1.1933 + // Escape whitespace 1.1934 + if (PatternProps::isWhiteSpace(c)) { 1.1935 + buf.append(BACKSLASH); 1.1936 + } 1.1937 + break; 1.1938 + } 1.1939 + buf.append(c); 1.1940 +} 1.1941 + 1.1942 +/** 1.1943 + * Append a string representation of this set to result. This will be 1.1944 + * a cleaned version of the string passed to applyPattern(), if there 1.1945 + * is one. Otherwise it will be generated. 1.1946 + */ 1.1947 +UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, 1.1948 + UBool escapeUnprintable) const 1.1949 +{ 1.1950 + if (pat != NULL) { 1.1951 + int32_t i; 1.1952 + int32_t backslashCount = 0; 1.1953 + for (i=0; i<patLen; ) { 1.1954 + UChar32 c; 1.1955 + U16_NEXT(pat, i, patLen, c); 1.1956 + if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { 1.1957 + // If the unprintable character is preceded by an odd 1.1958 + // number of backslashes, then it has been escaped. 1.1959 + // Before unescaping it, we delete the final 1.1960 + // backslash. 1.1961 + if ((backslashCount % 2) == 1) { 1.1962 + result.truncate(result.length() - 1); 1.1963 + } 1.1964 + ICU_Utility::escapeUnprintable(result, c); 1.1965 + backslashCount = 0; 1.1966 + } else { 1.1967 + result.append(c); 1.1968 + if (c == BACKSLASH) { 1.1969 + ++backslashCount; 1.1970 + } else { 1.1971 + backslashCount = 0; 1.1972 + } 1.1973 + } 1.1974 + } 1.1975 + return result; 1.1976 + } 1.1977 + 1.1978 + return _generatePattern(result, escapeUnprintable); 1.1979 +} 1.1980 + 1.1981 +/** 1.1982 + * Returns a string representation of this set. If the result of 1.1983 + * calling this function is passed to a UnicodeSet constructor, it 1.1984 + * will produce another set that is equal to this one. 1.1985 + */ 1.1986 +UnicodeString& UnicodeSet::toPattern(UnicodeString& result, 1.1987 + UBool escapeUnprintable) const 1.1988 +{ 1.1989 + result.truncate(0); 1.1990 + return _toPattern(result, escapeUnprintable); 1.1991 +} 1.1992 + 1.1993 +/** 1.1994 + * Generate and append a string representation of this set to result. 1.1995 + * This does not use this.pat, the cleaned up copy of the string 1.1996 + * passed to applyPattern(). 1.1997 + */ 1.1998 +UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 1.1999 + UBool escapeUnprintable) const 1.2000 +{ 1.2001 + result.append(SET_OPEN); 1.2002 + 1.2003 +// // Check against the predefined categories. We implicitly build 1.2004 +// // up ALL category sets the first time toPattern() is called. 1.2005 +// for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { 1.2006 +// if (*this == getCategorySet(cat)) { 1.2007 +// result.append(COLON); 1.2008 +// result.append(CATEGORY_NAMES, cat*2, 2); 1.2009 +// return result.append(CATEGORY_CLOSE); 1.2010 +// } 1.2011 +// } 1.2012 + 1.2013 + int32_t count = getRangeCount(); 1.2014 + 1.2015 + // If the set contains at least 2 intervals and includes both 1.2016 + // MIN_VALUE and MAX_VALUE, then the inverse representation will 1.2017 + // be more economical. 1.2018 + if (count > 1 && 1.2019 + getRangeStart(0) == MIN_VALUE && 1.2020 + getRangeEnd(count-1) == MAX_VALUE) { 1.2021 + 1.2022 + // Emit the inverse 1.2023 + result.append(COMPLEMENT); 1.2024 + 1.2025 + for (int32_t i = 1; i < count; ++i) { 1.2026 + UChar32 start = getRangeEnd(i-1)+1; 1.2027 + UChar32 end = getRangeStart(i)-1; 1.2028 + _appendToPat(result, start, escapeUnprintable); 1.2029 + if (start != end) { 1.2030 + if ((start+1) != end) { 1.2031 + result.append(HYPHEN); 1.2032 + } 1.2033 + _appendToPat(result, end, escapeUnprintable); 1.2034 + } 1.2035 + } 1.2036 + } 1.2037 + 1.2038 + // Default; emit the ranges as pairs 1.2039 + else { 1.2040 + for (int32_t i = 0; i < count; ++i) { 1.2041 + UChar32 start = getRangeStart(i); 1.2042 + UChar32 end = getRangeEnd(i); 1.2043 + _appendToPat(result, start, escapeUnprintable); 1.2044 + if (start != end) { 1.2045 + if ((start+1) != end) { 1.2046 + result.append(HYPHEN); 1.2047 + } 1.2048 + _appendToPat(result, end, escapeUnprintable); 1.2049 + } 1.2050 + } 1.2051 + } 1.2052 + 1.2053 + for (int32_t i = 0; i<strings->size(); ++i) { 1.2054 + result.append(OPEN_BRACE); 1.2055 + _appendToPat(result, 1.2056 + *(const UnicodeString*) strings->elementAt(i), 1.2057 + escapeUnprintable); 1.2058 + result.append(CLOSE_BRACE); 1.2059 + } 1.2060 + return result.append(SET_CLOSE); 1.2061 +} 1.2062 + 1.2063 +/** 1.2064 +* Release existing cached pattern 1.2065 +*/ 1.2066 +void UnicodeSet::releasePattern() { 1.2067 + if (pat) { 1.2068 + uprv_free(pat); 1.2069 + pat = NULL; 1.2070 + patLen = 0; 1.2071 + } 1.2072 +} 1.2073 + 1.2074 +/** 1.2075 +* Set the new pattern to cache. 1.2076 +*/ 1.2077 +void UnicodeSet::setPattern(const UnicodeString& newPat) { 1.2078 + releasePattern(); 1.2079 + int32_t newPatLen = newPat.length(); 1.2080 + pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); 1.2081 + if (pat) { 1.2082 + patLen = newPatLen; 1.2083 + newPat.extractBetween(0, patLen, pat); 1.2084 + pat[patLen] = 0; 1.2085 + } 1.2086 + // else we don't care if malloc failed. This was just a nice cache. 1.2087 + // We can regenerate an equivalent pattern later when requested. 1.2088 +} 1.2089 + 1.2090 +UnicodeFunctor *UnicodeSet::freeze() { 1.2091 + if(!isFrozen() && !isBogus()) { 1.2092 + // Do most of what compact() does before freezing because 1.2093 + // compact() will not work when the set is frozen. 1.2094 + // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 1.2095 + 1.2096 + // Delete buffer first to defragment memory less. 1.2097 + if (buffer != NULL) { 1.2098 + uprv_free(buffer); 1.2099 + buffer = NULL; 1.2100 + } 1.2101 + if (capacity > (len + GROW_EXTRA)) { 1.2102 + // Make the capacity equal to len or 1. 1.2103 + // We don't want to realloc of 0 size. 1.2104 + capacity = len + (len == 0); 1.2105 + list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); 1.2106 + if (list == NULL) { // Check for memory allocation error. 1.2107 + setToBogus(); 1.2108 + return this; 1.2109 + } 1.2110 + } 1.2111 + 1.2112 + // Optimize contains() and span() and similar functions. 1.2113 + if (!strings->isEmpty()) { 1.2114 + stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); 1.2115 + if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { 1.2116 + // All strings are irrelevant for span() etc. because 1.2117 + // all of each string's code points are contained in this set. 1.2118 + // Do not check needsStringSpanUTF8() because UTF-8 has at most as 1.2119 + // many relevant strings as UTF-16. 1.2120 + // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) 1.2121 + delete stringSpan; 1.2122 + stringSpan = NULL; 1.2123 + } 1.2124 + } 1.2125 + if (stringSpan == NULL) { 1.2126 + // No span-relevant strings: Optimize for code point spans. 1.2127 + bmpSet=new BMPSet(list, len); 1.2128 + if (bmpSet == NULL) { // Check for memory allocation error. 1.2129 + setToBogus(); 1.2130 + } 1.2131 + } 1.2132 + } 1.2133 + return this; 1.2134 +} 1.2135 + 1.2136 +int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 1.2137 + if(length>0 && bmpSet!=NULL) { 1.2138 + return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); 1.2139 + } 1.2140 + if(length<0) { 1.2141 + length=u_strlen(s); 1.2142 + } 1.2143 + if(length==0) { 1.2144 + return 0; 1.2145 + } 1.2146 + if(stringSpan!=NULL) { 1.2147 + return stringSpan->span(s, length, spanCondition); 1.2148 + } else if(!strings->isEmpty()) { 1.2149 + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1.2150 + UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 1.2151 + UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 1.2152 + UnicodeSetStringSpan strSpan(*this, *strings, which); 1.2153 + if(strSpan.needsStringSpanUTF16()) { 1.2154 + return strSpan.span(s, length, spanCondition); 1.2155 + } 1.2156 + } 1.2157 + 1.2158 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.2159 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.2160 + } 1.2161 + 1.2162 + UChar32 c; 1.2163 + int32_t start=0, prev=0; 1.2164 + do { 1.2165 + U16_NEXT(s, start, length, c); 1.2166 + if(spanCondition!=contains(c)) { 1.2167 + break; 1.2168 + } 1.2169 + } while((prev=start)<length); 1.2170 + return prev; 1.2171 +} 1.2172 + 1.2173 +int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 1.2174 + if(length>0 && bmpSet!=NULL) { 1.2175 + return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); 1.2176 + } 1.2177 + if(length<0) { 1.2178 + length=u_strlen(s); 1.2179 + } 1.2180 + if(length==0) { 1.2181 + return 0; 1.2182 + } 1.2183 + if(stringSpan!=NULL) { 1.2184 + return stringSpan->spanBack(s, length, spanCondition); 1.2185 + } else if(!strings->isEmpty()) { 1.2186 + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1.2187 + UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 1.2188 + UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 1.2189 + UnicodeSetStringSpan strSpan(*this, *strings, which); 1.2190 + if(strSpan.needsStringSpanUTF16()) { 1.2191 + return strSpan.spanBack(s, length, spanCondition); 1.2192 + } 1.2193 + } 1.2194 + 1.2195 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.2196 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.2197 + } 1.2198 + 1.2199 + UChar32 c; 1.2200 + int32_t prev=length; 1.2201 + do { 1.2202 + U16_PREV(s, 0, length, c); 1.2203 + if(spanCondition!=contains(c)) { 1.2204 + break; 1.2205 + } 1.2206 + } while((prev=length)>0); 1.2207 + return prev; 1.2208 +} 1.2209 + 1.2210 +int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 1.2211 + if(length>0 && bmpSet!=NULL) { 1.2212 + const uint8_t *s0=(const uint8_t *)s; 1.2213 + return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); 1.2214 + } 1.2215 + if(length<0) { 1.2216 + length=(int32_t)uprv_strlen(s); 1.2217 + } 1.2218 + if(length==0) { 1.2219 + return 0; 1.2220 + } 1.2221 + if(stringSpan!=NULL) { 1.2222 + return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); 1.2223 + } else if(!strings->isEmpty()) { 1.2224 + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1.2225 + UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 1.2226 + UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 1.2227 + UnicodeSetStringSpan strSpan(*this, *strings, which); 1.2228 + if(strSpan.needsStringSpanUTF8()) { 1.2229 + return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); 1.2230 + } 1.2231 + } 1.2232 + 1.2233 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.2234 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.2235 + } 1.2236 + 1.2237 + UChar32 c; 1.2238 + int32_t start=0, prev=0; 1.2239 + do { 1.2240 + U8_NEXT_OR_FFFD(s, start, length, c); 1.2241 + if(spanCondition!=contains(c)) { 1.2242 + break; 1.2243 + } 1.2244 + } while((prev=start)<length); 1.2245 + return prev; 1.2246 +} 1.2247 + 1.2248 +int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { 1.2249 + if(length>0 && bmpSet!=NULL) { 1.2250 + const uint8_t *s0=(const uint8_t *)s; 1.2251 + return bmpSet->spanBackUTF8(s0, length, spanCondition); 1.2252 + } 1.2253 + if(length<0) { 1.2254 + length=(int32_t)uprv_strlen(s); 1.2255 + } 1.2256 + if(length==0) { 1.2257 + return 0; 1.2258 + } 1.2259 + if(stringSpan!=NULL) { 1.2260 + return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); 1.2261 + } else if(!strings->isEmpty()) { 1.2262 + uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1.2263 + UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 1.2264 + UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 1.2265 + UnicodeSetStringSpan strSpan(*this, *strings, which); 1.2266 + if(strSpan.needsStringSpanUTF8()) { 1.2267 + return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); 1.2268 + } 1.2269 + } 1.2270 + 1.2271 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.2272 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.2273 + } 1.2274 + 1.2275 + UChar32 c; 1.2276 + int32_t prev=length; 1.2277 + do { 1.2278 + U8_PREV_OR_FFFD(s, 0, length, c); 1.2279 + if(spanCondition!=contains(c)) { 1.2280 + break; 1.2281 + } 1.2282 + } while((prev=length)>0); 1.2283 + return prev; 1.2284 +} 1.2285 + 1.2286 +U_NAMESPACE_END