intl/icu/source/common/uniset.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 1999-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * Date Name Description
michael@0 7 * 10/20/99 alan Creation.
michael@0 8 **********************************************************************
michael@0 9 */
michael@0 10
michael@0 11 #include "unicode/utypes.h"
michael@0 12 #include "unicode/parsepos.h"
michael@0 13 #include "unicode/symtable.h"
michael@0 14 #include "unicode/uniset.h"
michael@0 15 #include "unicode/utf8.h"
michael@0 16 #include "unicode/utf16.h"
michael@0 17 #include "ruleiter.h"
michael@0 18 #include "cmemory.h"
michael@0 19 #include "cstring.h"
michael@0 20 #include "patternprops.h"
michael@0 21 #include "uelement.h"
michael@0 22 #include "util.h"
michael@0 23 #include "uvector.h"
michael@0 24 #include "charstr.h"
michael@0 25 #include "ustrfmt.h"
michael@0 26 #include "uassert.h"
michael@0 27 #include "bmpset.h"
michael@0 28 #include "unisetspan.h"
michael@0 29
michael@0 30 // Define UChar constants using hex for EBCDIC compatibility
michael@0 31 // Used #define to reduce private static exports and memory access time.
michael@0 32 #define SET_OPEN ((UChar)0x005B) /*[*/
michael@0 33 #define SET_CLOSE ((UChar)0x005D) /*]*/
michael@0 34 #define HYPHEN ((UChar)0x002D) /*-*/
michael@0 35 #define COMPLEMENT ((UChar)0x005E) /*^*/
michael@0 36 #define COLON ((UChar)0x003A) /*:*/
michael@0 37 #define BACKSLASH ((UChar)0x005C) /*\*/
michael@0 38 #define INTERSECTION ((UChar)0x0026) /*&*/
michael@0 39 #define UPPER_U ((UChar)0x0055) /*U*/
michael@0 40 #define LOWER_U ((UChar)0x0075) /*u*/
michael@0 41 #define OPEN_BRACE ((UChar)123) /*{*/
michael@0 42 #define CLOSE_BRACE ((UChar)125) /*}*/
michael@0 43 #define UPPER_P ((UChar)0x0050) /*P*/
michael@0 44 #define LOWER_P ((UChar)0x0070) /*p*/
michael@0 45 #define UPPER_N ((UChar)78) /*N*/
michael@0 46 #define EQUALS ((UChar)0x003D) /*=*/
michael@0 47
michael@0 48 // HIGH_VALUE > all valid values. 110000 for codepoints
michael@0 49 #define UNICODESET_HIGH 0x0110000
michael@0 50
michael@0 51 // LOW <= all valid values. ZERO for codepoints
michael@0 52 #define UNICODESET_LOW 0x000000
michael@0 53
michael@0 54 // initial storage. Must be >= 0
michael@0 55 #define START_EXTRA 16
michael@0 56
michael@0 57 // extra amount for growth. Must be >= 0
michael@0 58 #define GROW_EXTRA START_EXTRA
michael@0 59
michael@0 60 U_NAMESPACE_BEGIN
michael@0 61
michael@0 62 SymbolTable::~SymbolTable() {}
michael@0 63
michael@0 64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
michael@0 65
michael@0 66 /**
michael@0 67 * Modify the given UChar32 variable so that it is in range, by
michael@0 68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
michael@0 69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
michael@0 70 * It modifies its argument in-place and also returns it.
michael@0 71 */
michael@0 72 static inline UChar32 pinCodePoint(UChar32& c) {
michael@0 73 if (c < UNICODESET_LOW) {
michael@0 74 c = UNICODESET_LOW;
michael@0 75 } else if (c > (UNICODESET_HIGH-1)) {
michael@0 76 c = (UNICODESET_HIGH-1);
michael@0 77 }
michael@0 78 return c;
michael@0 79 }
michael@0 80
michael@0 81 //----------------------------------------------------------------
michael@0 82 // Debugging
michael@0 83 //----------------------------------------------------------------
michael@0 84
michael@0 85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
michael@0 86 // To enable the debugging, define the symbol DEBUG_MEM in the line
michael@0 87 // below. This will result in text being sent to stdout that looks
michael@0 88 // like this:
michael@0 89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
michael@0 90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
michael@0 91 // Each line lists a construction (ct) or destruction (dt) event, the
michael@0 92 // object address, the number of outstanding objects after the event,
michael@0 93 // and the pattern of the object in question.
michael@0 94
michael@0 95 // #define DEBUG_MEM
michael@0 96
michael@0 97 #ifdef DEBUG_MEM
michael@0 98 #include <stdio.h>
michael@0 99 static int32_t _dbgCount = 0;
michael@0 100
michael@0 101 static inline void _dbgct(UnicodeSet* set) {
michael@0 102 UnicodeString str;
michael@0 103 set->toPattern(str, TRUE);
michael@0 104 char buf[40];
michael@0 105 str.extract(0, 39, buf, "");
michael@0 106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
michael@0 107 }
michael@0 108
michael@0 109 static inline void _dbgdt(UnicodeSet* set) {
michael@0 110 UnicodeString str;
michael@0 111 set->toPattern(str, TRUE);
michael@0 112 char buf[40];
michael@0 113 str.extract(0, 39, buf, "");
michael@0 114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
michael@0 115 }
michael@0 116
michael@0 117 #else
michael@0 118
michael@0 119 #define _dbgct(set)
michael@0 120 #define _dbgdt(set)
michael@0 121
michael@0 122 #endif
michael@0 123
michael@0 124 //----------------------------------------------------------------
michael@0 125 // UnicodeString in UVector support
michael@0 126 //----------------------------------------------------------------
michael@0 127
michael@0 128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
michael@0 129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
michael@0 130 }
michael@0 131
michael@0 132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
michael@0 133 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
michael@0 134 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
michael@0 135 return a.compare(b);
michael@0 136 }
michael@0 137
michael@0 138 //----------------------------------------------------------------
michael@0 139 // Constructors &c
michael@0 140 //----------------------------------------------------------------
michael@0 141
michael@0 142 /**
michael@0 143 * Constructs an empty set.
michael@0 144 */
michael@0 145 UnicodeSet::UnicodeSet() :
michael@0 146 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
michael@0 147 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0 148 fFlags(0)
michael@0 149 {
michael@0 150 UErrorCode status = U_ZERO_ERROR;
michael@0 151 allocateStrings(status);
michael@0 152 if (U_FAILURE(status)) {
michael@0 153 return;
michael@0 154 }
michael@0 155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0 156 if(list!=NULL){
michael@0 157 list[0] = UNICODESET_HIGH;
michael@0 158 } else { // If memory allocation failed, set to bogus state.
michael@0 159 setToBogus();
michael@0 160 return;
michael@0 161 }
michael@0 162 _dbgct(this);
michael@0 163 }
michael@0 164
michael@0 165 /**
michael@0 166 * Constructs a set containing the given range. If <code>end >
michael@0 167 * start</code> then an empty set is created.
michael@0 168 *
michael@0 169 * @param start first character, inclusive, of range
michael@0 170 * @param end last character, inclusive, of range
michael@0 171 */
michael@0 172 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
michael@0 173 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
michael@0 174 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0 175 fFlags(0)
michael@0 176 {
michael@0 177 UErrorCode status = U_ZERO_ERROR;
michael@0 178 allocateStrings(status);
michael@0 179 if (U_FAILURE(status)) {
michael@0 180 return;
michael@0 181 }
michael@0 182 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0 183 if(list!=NULL){
michael@0 184 list[0] = UNICODESET_HIGH;
michael@0 185 complement(start, end);
michael@0 186 } else { // If memory allocation failed, set to bogus state.
michael@0 187 setToBogus();
michael@0 188 return;
michael@0 189 }
michael@0 190 _dbgct(this);
michael@0 191 }
michael@0 192
michael@0 193 /**
michael@0 194 * Constructs a set that is identical to the given UnicodeSet.
michael@0 195 */
michael@0 196 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
michael@0 197 UnicodeFilter(o),
michael@0 198 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
michael@0 199 bmpSet(0),
michael@0 200 buffer(0), bufferCapacity(0),
michael@0 201 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0 202 fFlags(0)
michael@0 203 {
michael@0 204 UErrorCode status = U_ZERO_ERROR;
michael@0 205 allocateStrings(status);
michael@0 206 if (U_FAILURE(status)) {
michael@0 207 return;
michael@0 208 }
michael@0 209 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0 210 if(list!=NULL){
michael@0 211 *this = o;
michael@0 212 } else { // If memory allocation failed, set to bogus state.
michael@0 213 setToBogus();
michael@0 214 return;
michael@0 215 }
michael@0 216 _dbgct(this);
michael@0 217 }
michael@0 218
michael@0 219 // Copy-construct as thawed.
michael@0 220 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
michael@0 221 UnicodeFilter(o),
michael@0 222 len(0), capacity(o.len + GROW_EXTRA), list(0),
michael@0 223 bmpSet(0),
michael@0 224 buffer(0), bufferCapacity(0),
michael@0 225 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0 226 fFlags(0)
michael@0 227 {
michael@0 228 UErrorCode status = U_ZERO_ERROR;
michael@0 229 allocateStrings(status);
michael@0 230 if (U_FAILURE(status)) {
michael@0 231 return;
michael@0 232 }
michael@0 233 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0 234 if(list!=NULL){
michael@0 235 // *this = o except for bmpSet and stringSpan
michael@0 236 len = o.len;
michael@0 237 uprv_memcpy(list, o.list, len*sizeof(UChar32));
michael@0 238 if (strings != NULL && o.strings != NULL) {
michael@0 239 strings->assign(*o.strings, cloneUnicodeString, status);
michael@0 240 } else { // Invalid strings.
michael@0 241 setToBogus();
michael@0 242 return;
michael@0 243 }
michael@0 244 if (o.pat) {
michael@0 245 setPattern(UnicodeString(o.pat, o.patLen));
michael@0 246 }
michael@0 247 } else { // If memory allocation failed, set to bogus state.
michael@0 248 setToBogus();
michael@0 249 return;
michael@0 250 }
michael@0 251 _dbgct(this);
michael@0 252 }
michael@0 253
michael@0 254 /**
michael@0 255 * Destructs the set.
michael@0 256 */
michael@0 257 UnicodeSet::~UnicodeSet() {
michael@0 258 _dbgdt(this); // first!
michael@0 259 uprv_free(list);
michael@0 260 delete bmpSet;
michael@0 261 if (buffer) {
michael@0 262 uprv_free(buffer);
michael@0 263 }
michael@0 264 delete strings;
michael@0 265 delete stringSpan;
michael@0 266 releasePattern();
michael@0 267 }
michael@0 268
michael@0 269 /**
michael@0 270 * Assigns this object to be a copy of another.
michael@0 271 */
michael@0 272 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
michael@0 273 if (this == &o) {
michael@0 274 return *this;
michael@0 275 }
michael@0 276 if (isFrozen()) {
michael@0 277 return *this;
michael@0 278 }
michael@0 279 if (o.isBogus()) {
michael@0 280 setToBogus();
michael@0 281 return *this;
michael@0 282 }
michael@0 283 UErrorCode ec = U_ZERO_ERROR;
michael@0 284 ensureCapacity(o.len, ec);
michael@0 285 if (U_FAILURE(ec)) {
michael@0 286 return *this; // There is no way to report this error :-(
michael@0 287 }
michael@0 288 len = o.len;
michael@0 289 uprv_memcpy(list, o.list, len*sizeof(UChar32));
michael@0 290 if (o.bmpSet == NULL) {
michael@0 291 bmpSet = NULL;
michael@0 292 } else {
michael@0 293 bmpSet = new BMPSet(*o.bmpSet, list, len);
michael@0 294 if (bmpSet == NULL) { // Check for memory allocation error.
michael@0 295 setToBogus();
michael@0 296 return *this;
michael@0 297 }
michael@0 298 }
michael@0 299 if (strings != NULL && o.strings != NULL) {
michael@0 300 strings->assign(*o.strings, cloneUnicodeString, ec);
michael@0 301 } else { // Invalid strings.
michael@0 302 setToBogus();
michael@0 303 return *this;
michael@0 304 }
michael@0 305 if (o.stringSpan == NULL) {
michael@0 306 stringSpan = NULL;
michael@0 307 } else {
michael@0 308 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
michael@0 309 if (stringSpan == NULL) { // Check for memory allocation error.
michael@0 310 setToBogus();
michael@0 311 return *this;
michael@0 312 }
michael@0 313 }
michael@0 314 releasePattern();
michael@0 315 if (o.pat) {
michael@0 316 setPattern(UnicodeString(o.pat, o.patLen));
michael@0 317 }
michael@0 318 return *this;
michael@0 319 }
michael@0 320
michael@0 321 /**
michael@0 322 * Returns a copy of this object. All UnicodeMatcher objects have
michael@0 323 * to support cloning in order to allow classes using
michael@0 324 * UnicodeMatchers, such as Transliterator, to implement cloning.
michael@0 325 */
michael@0 326 UnicodeFunctor* UnicodeSet::clone() const {
michael@0 327 return new UnicodeSet(*this);
michael@0 328 }
michael@0 329
michael@0 330 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
michael@0 331 return new UnicodeSet(*this, TRUE);
michael@0 332 }
michael@0 333
michael@0 334 /**
michael@0 335 * Compares the specified object with this set for equality. Returns
michael@0 336 * <tt>true</tt> if the two sets
michael@0 337 * have the same size, and every member of the specified set is
michael@0 338 * contained in this set (or equivalently, every member of this set is
michael@0 339 * contained in the specified set).
michael@0 340 *
michael@0 341 * @param o set to be compared for equality with this set.
michael@0 342 * @return <tt>true</tt> if the specified set is equal to this set.
michael@0 343 */
michael@0 344 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
michael@0 345 if (len != o.len) return FALSE;
michael@0 346 for (int32_t i = 0; i < len; ++i) {
michael@0 347 if (list[i] != o.list[i]) return FALSE;
michael@0 348 }
michael@0 349 if (*strings != *o.strings) return FALSE;
michael@0 350 return TRUE;
michael@0 351 }
michael@0 352
michael@0 353 /**
michael@0 354 * Returns the hash code value for this set.
michael@0 355 *
michael@0 356 * @return the hash code value for this set.
michael@0 357 * @see Object#hashCode()
michael@0 358 */
michael@0 359 int32_t UnicodeSet::hashCode(void) const {
michael@0 360 int32_t result = len;
michael@0 361 for (int32_t i = 0; i < len; ++i) {
michael@0 362 result *= 1000003;
michael@0 363 result += list[i];
michael@0 364 }
michael@0 365 return result;
michael@0 366 }
michael@0 367
michael@0 368 //----------------------------------------------------------------
michael@0 369 // Public API
michael@0 370 //----------------------------------------------------------------
michael@0 371
michael@0 372 /**
michael@0 373 * Returns the number of elements in this set (its cardinality),
michael@0 374 * Note than the elements of a set may include both individual
michael@0 375 * codepoints and strings.
michael@0 376 *
michael@0 377 * @return the number of elements in this set (its cardinality).
michael@0 378 */
michael@0 379 int32_t UnicodeSet::size(void) const {
michael@0 380 int32_t n = 0;
michael@0 381 int32_t count = getRangeCount();
michael@0 382 for (int32_t i = 0; i < count; ++i) {
michael@0 383 n += getRangeEnd(i) - getRangeStart(i) + 1;
michael@0 384 }
michael@0 385 return n + strings->size();
michael@0 386 }
michael@0 387
michael@0 388 /**
michael@0 389 * Returns <tt>true</tt> if this set contains no elements.
michael@0 390 *
michael@0 391 * @return <tt>true</tt> if this set contains no elements.
michael@0 392 */
michael@0 393 UBool UnicodeSet::isEmpty(void) const {
michael@0 394 return len == 1 && strings->size() == 0;
michael@0 395 }
michael@0 396
michael@0 397 /**
michael@0 398 * Returns true if this set contains the given character.
michael@0 399 * @param c character to be checked for containment
michael@0 400 * @return true if the test condition is met
michael@0 401 */
michael@0 402 UBool UnicodeSet::contains(UChar32 c) const {
michael@0 403 // Set i to the index of the start item greater than ch
michael@0 404 // We know we will terminate without length test!
michael@0 405 // LATER: for large sets, add binary search
michael@0 406 //int32_t i = -1;
michael@0 407 //for (;;) {
michael@0 408 // if (c < list[++i]) break;
michael@0 409 //}
michael@0 410 if (bmpSet != NULL) {
michael@0 411 return bmpSet->contains(c);
michael@0 412 }
michael@0 413 if (stringSpan != NULL) {
michael@0 414 return stringSpan->contains(c);
michael@0 415 }
michael@0 416 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
michael@0 417 return FALSE;
michael@0 418 }
michael@0 419 int32_t i = findCodePoint(c);
michael@0 420 return (UBool)(i & 1); // return true if odd
michael@0 421 }
michael@0 422
michael@0 423 /**
michael@0 424 * Returns the smallest value i such that c < list[i]. Caller
michael@0 425 * must ensure that c is a legal value or this method will enter
michael@0 426 * an infinite loop. This method performs a binary search.
michael@0 427 * @param c a character in the range MIN_VALUE..MAX_VALUE
michael@0 428 * inclusive
michael@0 429 * @return the smallest integer i in the range 0..len-1,
michael@0 430 * inclusive, such that c < list[i]
michael@0 431 */
michael@0 432 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
michael@0 433 /* Examples:
michael@0 434 findCodePoint(c)
michael@0 435 set list[] c=0 1 3 4 7 8
michael@0 436 === ============== ===========
michael@0 437 [] [110000] 0 0 0 0 0 0
michael@0 438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
michael@0 439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
michael@0 440 [:Any:] [0, 110000] 1 1 1 1 1 1
michael@0 441 */
michael@0 442
michael@0 443 // Return the smallest i such that c < list[i]. Assume
michael@0 444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
michael@0 445 if (c < list[0])
michael@0 446 return 0;
michael@0 447 // High runner test. c is often after the last range, so an
michael@0 448 // initial check for this condition pays off.
michael@0 449 int32_t lo = 0;
michael@0 450 int32_t hi = len - 1;
michael@0 451 if (lo >= hi || c >= list[hi-1])
michael@0 452 return hi;
michael@0 453 // invariant: c >= list[lo]
michael@0 454 // invariant: c < list[hi]
michael@0 455 for (;;) {
michael@0 456 int32_t i = (lo + hi) >> 1;
michael@0 457 if (i == lo) {
michael@0 458 break; // Found!
michael@0 459 } else if (c < list[i]) {
michael@0 460 hi = i;
michael@0 461 } else {
michael@0 462 lo = i;
michael@0 463 }
michael@0 464 }
michael@0 465 return hi;
michael@0 466 }
michael@0 467
michael@0 468 /**
michael@0 469 * Returns true if this set contains every character
michael@0 470 * of the given range.
michael@0 471 * @param start first character, inclusive, of the range
michael@0 472 * @param end last character, inclusive, of the range
michael@0 473 * @return true if the test condition is met
michael@0 474 */
michael@0 475 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
michael@0 476 //int32_t i = -1;
michael@0 477 //for (;;) {
michael@0 478 // if (start < list[++i]) break;
michael@0 479 //}
michael@0 480 int32_t i = findCodePoint(start);
michael@0 481 return ((i & 1) != 0 && end < list[i]);
michael@0 482 }
michael@0 483
michael@0 484 /**
michael@0 485 * Returns <tt>true</tt> if this set contains the given
michael@0 486 * multicharacter string.
michael@0 487 * @param s string to be checked for containment
michael@0 488 * @return <tt>true</tt> if this set contains the specified string
michael@0 489 */
michael@0 490 UBool UnicodeSet::contains(const UnicodeString& s) const {
michael@0 491 if (s.length() == 0) return FALSE;
michael@0 492 int32_t cp = getSingleCP(s);
michael@0 493 if (cp < 0) {
michael@0 494 return strings->contains((void*) &s);
michael@0 495 } else {
michael@0 496 return contains((UChar32) cp);
michael@0 497 }
michael@0 498 }
michael@0 499
michael@0 500 /**
michael@0 501 * Returns true if this set contains all the characters and strings
michael@0 502 * of the given set.
michael@0 503 * @param c set to be checked for containment
michael@0 504 * @return true if the test condition is met
michael@0 505 */
michael@0 506 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
michael@0 507 // The specified set is a subset if all of its pairs are contained in
michael@0 508 // this set. It's possible to code this more efficiently in terms of
michael@0 509 // direct manipulation of the inversion lists if the need arises.
michael@0 510 int32_t n = c.getRangeCount();
michael@0 511 for (int i=0; i<n; ++i) {
michael@0 512 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
michael@0 513 return FALSE;
michael@0 514 }
michael@0 515 }
michael@0 516 if (!strings->containsAll(*c.strings)) return FALSE;
michael@0 517 return TRUE;
michael@0 518 }
michael@0 519
michael@0 520 /**
michael@0 521 * Returns true if this set contains all the characters
michael@0 522 * of the given string.
michael@0 523 * @param s string containing characters to be checked for containment
michael@0 524 * @return true if the test condition is met
michael@0 525 */
michael@0 526 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
michael@0 527 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
michael@0 528 s.length());
michael@0 529 }
michael@0 530
michael@0 531 /**
michael@0 532 * Returns true if this set contains none of the characters
michael@0 533 * of the given range.
michael@0 534 * @param start first character, inclusive, of the range
michael@0 535 * @param end last character, inclusive, of the range
michael@0 536 * @return true if the test condition is met
michael@0 537 */
michael@0 538 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
michael@0 539 //int32_t i = -1;
michael@0 540 //for (;;) {
michael@0 541 // if (start < list[++i]) break;
michael@0 542 //}
michael@0 543 int32_t i = findCodePoint(start);
michael@0 544 return ((i & 1) == 0 && end < list[i]);
michael@0 545 }
michael@0 546
michael@0 547 /**
michael@0 548 * Returns true if this set contains none of the characters and strings
michael@0 549 * of the given set.
michael@0 550 * @param c set to be checked for containment
michael@0 551 * @return true if the test condition is met
michael@0 552 */
michael@0 553 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
michael@0 554 // The specified set is a subset if all of its pairs are contained in
michael@0 555 // this set. It's possible to code this more efficiently in terms of
michael@0 556 // direct manipulation of the inversion lists if the need arises.
michael@0 557 int32_t n = c.getRangeCount();
michael@0 558 for (int32_t i=0; i<n; ++i) {
michael@0 559 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
michael@0 560 return FALSE;
michael@0 561 }
michael@0 562 }
michael@0 563 if (!strings->containsNone(*c.strings)) return FALSE;
michael@0 564 return TRUE;
michael@0 565 }
michael@0 566
michael@0 567 /**
michael@0 568 * Returns true if this set contains none of the characters
michael@0 569 * of the given string.
michael@0 570 * @param s string containing characters to be checked for containment
michael@0 571 * @return true if the test condition is met
michael@0 572 */
michael@0 573 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
michael@0 574 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
michael@0 575 s.length());
michael@0 576 }
michael@0 577
michael@0 578 /**
michael@0 579 * Returns <tt>true</tt> if this set contains any character whose low byte
michael@0 580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
michael@0 581 * indexing.
michael@0 582 */
michael@0 583 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
michael@0 584 /* The index value v, in the range [0,255], is contained in this set if
michael@0 585 * it is contained in any pair of this set. Pairs either have the high
michael@0 586 * bytes equal, or unequal. If the high bytes are equal, then we have
michael@0 587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
michael@0 588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
michael@0 589 * Then v is contained if xx <= v || v <= yy. (This is identical to the
michael@0 590 * time zone month containment logic.)
michael@0 591 */
michael@0 592 int32_t i;
michael@0 593 int32_t rangeCount=getRangeCount();
michael@0 594 for (i=0; i<rangeCount; ++i) {
michael@0 595 UChar32 low = getRangeStart(i);
michael@0 596 UChar32 high = getRangeEnd(i);
michael@0 597 if ((low & ~0xFF) == (high & ~0xFF)) {
michael@0 598 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
michael@0 599 return TRUE;
michael@0 600 }
michael@0 601 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
michael@0 602 return TRUE;
michael@0 603 }
michael@0 604 }
michael@0 605 if (strings->size() != 0) {
michael@0 606 for (i=0; i<strings->size(); ++i) {
michael@0 607 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
michael@0 608 //if (s.length() == 0) {
michael@0 609 // // Empty strings match everything
michael@0 610 // return TRUE;
michael@0 611 //}
michael@0 612 // assert(s.length() != 0); // We enforce this elsewhere
michael@0 613 UChar32 c = s.char32At(0);
michael@0 614 if ((c & 0xFF) == v) {
michael@0 615 return TRUE;
michael@0 616 }
michael@0 617 }
michael@0 618 }
michael@0 619 return FALSE;
michael@0 620 }
michael@0 621
michael@0 622 /**
michael@0 623 * Implementation of UnicodeMatcher::matches(). Always matches the
michael@0 624 * longest possible multichar string.
michael@0 625 */
michael@0 626 UMatchDegree UnicodeSet::matches(const Replaceable& text,
michael@0 627 int32_t& offset,
michael@0 628 int32_t limit,
michael@0 629 UBool incremental) {
michael@0 630 if (offset == limit) {
michael@0 631 // Strings, if any, have length != 0, so we don't worry
michael@0 632 // about them here. If we ever allow zero-length strings
michael@0 633 // we much check for them here.
michael@0 634 if (contains(U_ETHER)) {
michael@0 635 return incremental ? U_PARTIAL_MATCH : U_MATCH;
michael@0 636 } else {
michael@0 637 return U_MISMATCH;
michael@0 638 }
michael@0 639 } else {
michael@0 640 if (strings->size() != 0) { // try strings first
michael@0 641
michael@0 642 // might separate forward and backward loops later
michael@0 643 // for now they are combined
michael@0 644
michael@0 645 // TODO Improve efficiency of this, at least in the forward
michael@0 646 // direction, if not in both. In the forward direction we
michael@0 647 // can assume the strings are sorted.
michael@0 648
michael@0 649 int32_t i;
michael@0 650 UBool forward = offset < limit;
michael@0 651
michael@0 652 // firstChar is the leftmost char to match in the
michael@0 653 // forward direction or the rightmost char to match in
michael@0 654 // the reverse direction.
michael@0 655 UChar firstChar = text.charAt(offset);
michael@0 656
michael@0 657 // If there are multiple strings that can match we
michael@0 658 // return the longest match.
michael@0 659 int32_t highWaterLength = 0;
michael@0 660
michael@0 661 for (i=0; i<strings->size(); ++i) {
michael@0 662 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
michael@0 663
michael@0 664 //if (trial.length() == 0) {
michael@0 665 // return U_MATCH; // null-string always matches
michael@0 666 //}
michael@0 667 // assert(trial.length() != 0); // We ensure this elsewhere
michael@0 668
michael@0 669 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
michael@0 670
michael@0 671 // Strings are sorted, so we can optimize in the
michael@0 672 // forward direction.
michael@0 673 if (forward && c > firstChar) break;
michael@0 674 if (c != firstChar) continue;
michael@0 675
michael@0 676 int32_t matchLen = matchRest(text, offset, limit, trial);
michael@0 677
michael@0 678 if (incremental) {
michael@0 679 int32_t maxLen = forward ? limit-offset : offset-limit;
michael@0 680 if (matchLen == maxLen) {
michael@0 681 // We have successfully matched but only up to limit.
michael@0 682 return U_PARTIAL_MATCH;
michael@0 683 }
michael@0 684 }
michael@0 685
michael@0 686 if (matchLen == trial.length()) {
michael@0 687 // We have successfully matched the whole string.
michael@0 688 if (matchLen > highWaterLength) {
michael@0 689 highWaterLength = matchLen;
michael@0 690 }
michael@0 691 // In the forward direction we know strings
michael@0 692 // are sorted so we can bail early.
michael@0 693 if (forward && matchLen < highWaterLength) {
michael@0 694 break;
michael@0 695 }
michael@0 696 continue;
michael@0 697 }
michael@0 698 }
michael@0 699
michael@0 700 // We've checked all strings without a partial match.
michael@0 701 // If we have full matches, return the longest one.
michael@0 702 if (highWaterLength != 0) {
michael@0 703 offset += forward ? highWaterLength : -highWaterLength;
michael@0 704 return U_MATCH;
michael@0 705 }
michael@0 706 }
michael@0 707 return UnicodeFilter::matches(text, offset, limit, incremental);
michael@0 708 }
michael@0 709 }
michael@0 710
michael@0 711 /**
michael@0 712 * Returns the longest match for s in text at the given position.
michael@0 713 * If limit > start then match forward from start+1 to limit
michael@0 714 * matching all characters except s.charAt(0). If limit < start,
michael@0 715 * go backward starting from start-1 matching all characters
michael@0 716 * except s.charAt(s.length()-1). This method assumes that the
michael@0 717 * first character, text.charAt(start), matches s, so it does not
michael@0 718 * check it.
michael@0 719 * @param text the text to match
michael@0 720 * @param start the first character to match. In the forward
michael@0 721 * direction, text.charAt(start) is matched against s.charAt(0).
michael@0 722 * In the reverse direction, it is matched against
michael@0 723 * s.charAt(s.length()-1).
michael@0 724 * @param limit the limit offset for matching, either last+1 in
michael@0 725 * the forward direction, or last-1 in the reverse direction,
michael@0 726 * where last is the index of the last character to match.
michael@0 727 * @return If part of s matches up to the limit, return |limit -
michael@0 728 * start|. If all of s matches before reaching the limit, return
michael@0 729 * s.length(). If there is a mismatch between s and text, return
michael@0 730 * 0
michael@0 731 */
michael@0 732 int32_t UnicodeSet::matchRest(const Replaceable& text,
michael@0 733 int32_t start, int32_t limit,
michael@0 734 const UnicodeString& s) {
michael@0 735 int32_t i;
michael@0 736 int32_t maxLen;
michael@0 737 int32_t slen = s.length();
michael@0 738 if (start < limit) {
michael@0 739 maxLen = limit - start;
michael@0 740 if (maxLen > slen) maxLen = slen;
michael@0 741 for (i = 1; i < maxLen; ++i) {
michael@0 742 if (text.charAt(start + i) != s.charAt(i)) return 0;
michael@0 743 }
michael@0 744 } else {
michael@0 745 maxLen = start - limit;
michael@0 746 if (maxLen > slen) maxLen = slen;
michael@0 747 --slen; // <=> slen = s.length() - 1;
michael@0 748 for (i = 1; i < maxLen; ++i) {
michael@0 749 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
michael@0 750 }
michael@0 751 }
michael@0 752 return maxLen;
michael@0 753 }
michael@0 754
michael@0 755 /**
michael@0 756 * Implement of UnicodeMatcher
michael@0 757 */
michael@0 758 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
michael@0 759 toUnionTo.addAll(*this);
michael@0 760 }
michael@0 761
michael@0 762 /**
michael@0 763 * Returns the index of the given character within this set, where
michael@0 764 * the set is ordered by ascending code point. If the character
michael@0 765 * is not in this set, return -1. The inverse of this method is
michael@0 766 * <code>charAt()</code>.
michael@0 767 * @return an index from 0..size()-1, or -1
michael@0 768 */
michael@0 769 int32_t UnicodeSet::indexOf(UChar32 c) const {
michael@0 770 if (c < MIN_VALUE || c > MAX_VALUE) {
michael@0 771 return -1;
michael@0 772 }
michael@0 773 int32_t i = 0;
michael@0 774 int32_t n = 0;
michael@0 775 for (;;) {
michael@0 776 UChar32 start = list[i++];
michael@0 777 if (c < start) {
michael@0 778 return -1;
michael@0 779 }
michael@0 780 UChar32 limit = list[i++];
michael@0 781 if (c < limit) {
michael@0 782 return n + c - start;
michael@0 783 }
michael@0 784 n += limit - start;
michael@0 785 }
michael@0 786 }
michael@0 787
michael@0 788 /**
michael@0 789 * Returns the character at the given index within this set, where
michael@0 790 * the set is ordered by ascending code point. If the index is
michael@0 791 * out of range, return (UChar32)-1. The inverse of this method is
michael@0 792 * <code>indexOf()</code>.
michael@0 793 * @param index an index from 0..size()-1
michael@0 794 * @return the character at the given index, or (UChar32)-1.
michael@0 795 */
michael@0 796 UChar32 UnicodeSet::charAt(int32_t index) const {
michael@0 797 if (index >= 0) {
michael@0 798 // len2 is the largest even integer <= len, that is, it is len
michael@0 799 // for even values and len-1 for odd values. With odd values
michael@0 800 // the last entry is UNICODESET_HIGH.
michael@0 801 int32_t len2 = len & ~1;
michael@0 802 for (int32_t i=0; i < len2;) {
michael@0 803 UChar32 start = list[i++];
michael@0 804 int32_t count = list[i++] - start;
michael@0 805 if (index < count) {
michael@0 806 return (UChar32)(start + index);
michael@0 807 }
michael@0 808 index -= count;
michael@0 809 }
michael@0 810 }
michael@0 811 return (UChar32)-1;
michael@0 812 }
michael@0 813
michael@0 814 /**
michael@0 815 * Make this object represent the range <code>start - end</code>.
michael@0 816 * If <code>end > start</code> then this object is set to an
michael@0 817 * an empty range.
michael@0 818 *
michael@0 819 * @param start first character in the set, inclusive
michael@0 820 * @rparam end last character in the set, inclusive
michael@0 821 */
michael@0 822 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
michael@0 823 clear();
michael@0 824 complement(start, end);
michael@0 825 return *this;
michael@0 826 }
michael@0 827
michael@0 828 /**
michael@0 829 * Adds the specified range to this set if it is not already
michael@0 830 * present. If this set already contains the specified range,
michael@0 831 * the call leaves this set unchanged. If <code>end > start</code>
michael@0 832 * then an empty range is added, leaving the set unchanged.
michael@0 833 *
michael@0 834 * @param start first character, inclusive, of range to be added
michael@0 835 * to this set.
michael@0 836 * @param end last character, inclusive, of range to be added
michael@0 837 * to this set.
michael@0 838 */
michael@0 839 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
michael@0 840 if (pinCodePoint(start) < pinCodePoint(end)) {
michael@0 841 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0 842 add(range, 2, 0);
michael@0 843 } else if (start == end) {
michael@0 844 add(start);
michael@0 845 }
michael@0 846 return *this;
michael@0 847 }
michael@0 848
michael@0 849 // #define DEBUG_US_ADD
michael@0 850
michael@0 851 #ifdef DEBUG_US_ADD
michael@0 852 #include <stdio.h>
michael@0 853 void dump(UChar32 c) {
michael@0 854 if (c <= 0xFF) {
michael@0 855 printf("%c", (char)c);
michael@0 856 } else {
michael@0 857 printf("U+%04X", c);
michael@0 858 }
michael@0 859 }
michael@0 860 void dump(const UChar32* list, int32_t len) {
michael@0 861 printf("[");
michael@0 862 for (int32_t i=0; i<len; ++i) {
michael@0 863 if (i != 0) printf(", ");
michael@0 864 dump(list[i]);
michael@0 865 }
michael@0 866 printf("]");
michael@0 867 }
michael@0 868 #endif
michael@0 869
michael@0 870 /**
michael@0 871 * Adds the specified character to this set if it is not already
michael@0 872 * present. If this set already contains the specified character,
michael@0 873 * the call leaves this set unchanged.
michael@0 874 */
michael@0 875 UnicodeSet& UnicodeSet::add(UChar32 c) {
michael@0 876 // find smallest i such that c < list[i]
michael@0 877 // if odd, then it is IN the set
michael@0 878 // if even, then it is OUT of the set
michael@0 879 int32_t i = findCodePoint(pinCodePoint(c));
michael@0 880
michael@0 881 // already in set?
michael@0 882 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
michael@0 883
michael@0 884 // HIGH is 0x110000
michael@0 885 // assert(list[len-1] == HIGH);
michael@0 886
michael@0 887 // empty = [HIGH]
michael@0 888 // [start_0, limit_0, start_1, limit_1, HIGH]
michael@0 889
michael@0 890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
michael@0 891 // ^
michael@0 892 // list[i]
michael@0 893
michael@0 894 // i == 0 means c is before the first range
michael@0 895
michael@0 896 #ifdef DEBUG_US_ADD
michael@0 897 printf("Add of ");
michael@0 898 dump(c);
michael@0 899 printf(" found at %d", i);
michael@0 900 printf(": ");
michael@0 901 dump(list, len);
michael@0 902 printf(" => ");
michael@0 903 #endif
michael@0 904
michael@0 905 if (c == list[i]-1) {
michael@0 906 // c is before start of next range
michael@0 907 list[i] = c;
michael@0 908 // if we touched the HIGH mark, then add a new one
michael@0 909 if (c == (UNICODESET_HIGH - 1)) {
michael@0 910 UErrorCode status = U_ZERO_ERROR;
michael@0 911 ensureCapacity(len+1, status);
michael@0 912 if (U_FAILURE(status)) {
michael@0 913 return *this; // There is no way to report this error :-(
michael@0 914 }
michael@0 915 list[len++] = UNICODESET_HIGH;
michael@0 916 }
michael@0 917 if (i > 0 && c == list[i-1]) {
michael@0 918 // collapse adjacent ranges
michael@0 919
michael@0 920 // [..., start_k-1, c, c, limit_k, ..., HIGH]
michael@0 921 // ^
michael@0 922 // list[i]
michael@0 923
michael@0 924 //for (int32_t k=i-1; k<len-2; ++k) {
michael@0 925 // list[k] = list[k+2];
michael@0 926 //}
michael@0 927 UChar32* dst = list + i - 1;
michael@0 928 UChar32* src = dst + 2;
michael@0 929 UChar32* srclimit = list + len;
michael@0 930 while (src < srclimit) *(dst++) = *(src++);
michael@0 931
michael@0 932 len -= 2;
michael@0 933 }
michael@0 934 }
michael@0 935
michael@0 936 else if (i > 0 && c == list[i-1]) {
michael@0 937 // c is after end of prior range
michael@0 938 list[i-1]++;
michael@0 939 // no need to check for collapse here
michael@0 940 }
michael@0 941
michael@0 942 else {
michael@0 943 // At this point we know the new char is not adjacent to
michael@0 944 // any existing ranges, and it is not 10FFFF.
michael@0 945
michael@0 946
michael@0 947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
michael@0 948 // ^
michael@0 949 // list[i]
michael@0 950
michael@0 951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
michael@0 952 // ^
michael@0 953 // list[i]
michael@0 954
michael@0 955 UErrorCode status = U_ZERO_ERROR;
michael@0 956 ensureCapacity(len+2, status);
michael@0 957 if (U_FAILURE(status)) {
michael@0 958 return *this; // There is no way to report this error :-(
michael@0 959 }
michael@0 960
michael@0 961 //for (int32_t k=len-1; k>=i; --k) {
michael@0 962 // list[k+2] = list[k];
michael@0 963 //}
michael@0 964 UChar32* src = list + len;
michael@0 965 UChar32* dst = src + 2;
michael@0 966 UChar32* srclimit = list + i;
michael@0 967 while (src > srclimit) *(--dst) = *(--src);
michael@0 968
michael@0 969 list[i] = c;
michael@0 970 list[i+1] = c+1;
michael@0 971 len += 2;
michael@0 972 }
michael@0 973
michael@0 974 #ifdef DEBUG_US_ADD
michael@0 975 dump(list, len);
michael@0 976 printf("\n");
michael@0 977
michael@0 978 for (i=1; i<len; ++i) {
michael@0 979 if (list[i] <= list[i-1]) {
michael@0 980 // Corrupt array!
michael@0 981 printf("ERROR: list has been corrupted\n");
michael@0 982 exit(1);
michael@0 983 }
michael@0 984 }
michael@0 985 #endif
michael@0 986
michael@0 987 releasePattern();
michael@0 988 return *this;
michael@0 989 }
michael@0 990
michael@0 991 /**
michael@0 992 * Adds the specified multicharacter to this set if it is not already
michael@0 993 * present. If this set already contains the multicharacter,
michael@0 994 * the call leaves this set unchanged.
michael@0 995 * Thus "ch" => {"ch"}
michael@0 996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0 997 * @param s the source string
michael@0 998 * @return the modified set, for chaining
michael@0 999 */
michael@0 1000 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
michael@0 1001 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
michael@0 1002 int32_t cp = getSingleCP(s);
michael@0 1003 if (cp < 0) {
michael@0 1004 if (!strings->contains((void*) &s)) {
michael@0 1005 _add(s);
michael@0 1006 releasePattern();
michael@0 1007 }
michael@0 1008 } else {
michael@0 1009 add((UChar32)cp);
michael@0 1010 }
michael@0 1011 return *this;
michael@0 1012 }
michael@0 1013
michael@0 1014 /**
michael@0 1015 * Adds the given string, in order, to 'strings'. The given string
michael@0 1016 * must have been checked by the caller to not be empty and to not
michael@0 1017 * already be in 'strings'.
michael@0 1018 */
michael@0 1019 void UnicodeSet::_add(const UnicodeString& s) {
michael@0 1020 if (isFrozen() || isBogus()) {
michael@0 1021 return;
michael@0 1022 }
michael@0 1023 UnicodeString* t = new UnicodeString(s);
michael@0 1024 if (t == NULL) { // Check for memory allocation error.
michael@0 1025 setToBogus();
michael@0 1026 return;
michael@0 1027 }
michael@0 1028 UErrorCode ec = U_ZERO_ERROR;
michael@0 1029 strings->sortedInsert(t, compareUnicodeString, ec);
michael@0 1030 if (U_FAILURE(ec)) {
michael@0 1031 setToBogus();
michael@0 1032 delete t;
michael@0 1033 }
michael@0 1034 }
michael@0 1035
michael@0 1036 /**
michael@0 1037 * @return a code point IF the string consists of a single one.
michael@0 1038 * otherwise returns -1.
michael@0 1039 * @param string to test
michael@0 1040 */
michael@0 1041 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
michael@0 1042 //if (s.length() < 1) {
michael@0 1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
michael@0 1044 //}
michael@0 1045 if (s.length() > 2) return -1;
michael@0 1046 if (s.length() == 1) return s.charAt(0);
michael@0 1047
michael@0 1048 // at this point, len = 2
michael@0 1049 UChar32 cp = s.char32At(0);
michael@0 1050 if (cp > 0xFFFF) { // is surrogate pair
michael@0 1051 return cp;
michael@0 1052 }
michael@0 1053 return -1;
michael@0 1054 }
michael@0 1055
michael@0 1056 /**
michael@0 1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
michael@0 1058 * If this set already any particular character, it has no effect on that character.
michael@0 1059 * @param the source string
michael@0 1060 * @return the modified set, for chaining
michael@0 1061 */
michael@0 1062 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
michael@0 1063 UChar32 cp;
michael@0 1064 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
michael@0 1065 cp = s.char32At(i);
michael@0 1066 add(cp);
michael@0 1067 }
michael@0 1068 return *this;
michael@0 1069 }
michael@0 1070
michael@0 1071 /**
michael@0 1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
michael@0 1073 * If this set already any particular character, it has no effect on that character.
michael@0 1074 * @param the source string
michael@0 1075 * @return the modified set, for chaining
michael@0 1076 */
michael@0 1077 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
michael@0 1078 UnicodeSet set;
michael@0 1079 set.addAll(s);
michael@0 1080 retainAll(set);
michael@0 1081 return *this;
michael@0 1082 }
michael@0 1083
michael@0 1084 /**
michael@0 1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
michael@0 1086 * If this set already any particular character, it has no effect on that character.
michael@0 1087 * @param the source string
michael@0 1088 * @return the modified set, for chaining
michael@0 1089 */
michael@0 1090 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
michael@0 1091 UnicodeSet set;
michael@0 1092 set.addAll(s);
michael@0 1093 complementAll(set);
michael@0 1094 return *this;
michael@0 1095 }
michael@0 1096
michael@0 1097 /**
michael@0 1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
michael@0 1099 * If this set already any particular character, it has no effect on that character.
michael@0 1100 * @param the source string
michael@0 1101 * @return the modified set, for chaining
michael@0 1102 */
michael@0 1103 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
michael@0 1104 UnicodeSet set;
michael@0 1105 set.addAll(s);
michael@0 1106 removeAll(set);
michael@0 1107 return *this;
michael@0 1108 }
michael@0 1109
michael@0 1110 UnicodeSet& UnicodeSet::removeAllStrings() {
michael@0 1111 strings->removeAllElements();
michael@0 1112 return *this;
michael@0 1113 }
michael@0 1114
michael@0 1115
michael@0 1116 /**
michael@0 1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
michael@0 1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0 1119 * @param the source string
michael@0 1120 * @return a newly created set containing the given string
michael@0 1121 */
michael@0 1122 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
michael@0 1123 UnicodeSet *set = new UnicodeSet();
michael@0 1124 if (set != NULL) { // Check for memory allocation error.
michael@0 1125 set->add(s);
michael@0 1126 }
michael@0 1127 return set;
michael@0 1128 }
michael@0 1129
michael@0 1130
michael@0 1131 /**
michael@0 1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
michael@0 1133 * @param the source string
michael@0 1134 * @return a newly created set containing the given characters
michael@0 1135 */
michael@0 1136 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
michael@0 1137 UnicodeSet *set = new UnicodeSet();
michael@0 1138 if (set != NULL) { // Check for memory allocation error.
michael@0 1139 set->addAll(s);
michael@0 1140 }
michael@0 1141 return set;
michael@0 1142 }
michael@0 1143
michael@0 1144 /**
michael@0 1145 * Retain only the elements in this set that are contained in the
michael@0 1146 * specified range. If <code>end > start</code> then an empty range is
michael@0 1147 * retained, leaving the set empty.
michael@0 1148 *
michael@0 1149 * @param start first character, inclusive, of range to be retained
michael@0 1150 * to this set.
michael@0 1151 * @param end last character, inclusive, of range to be retained
michael@0 1152 * to this set.
michael@0 1153 */
michael@0 1154 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
michael@0 1155 if (pinCodePoint(start) <= pinCodePoint(end)) {
michael@0 1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0 1157 retain(range, 2, 0);
michael@0 1158 } else {
michael@0 1159 clear();
michael@0 1160 }
michael@0 1161 return *this;
michael@0 1162 }
michael@0 1163
michael@0 1164 UnicodeSet& UnicodeSet::retain(UChar32 c) {
michael@0 1165 return retain(c, c);
michael@0 1166 }
michael@0 1167
michael@0 1168 /**
michael@0 1169 * Removes the specified range from this set if it is present.
michael@0 1170 * The set will not contain the specified range once the call
michael@0 1171 * returns. If <code>end > start</code> then an empty range is
michael@0 1172 * removed, leaving the set unchanged.
michael@0 1173 *
michael@0 1174 * @param start first character, inclusive, of range to be removed
michael@0 1175 * from this set.
michael@0 1176 * @param end last character, inclusive, of range to be removed
michael@0 1177 * from this set.
michael@0 1178 */
michael@0 1179 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
michael@0 1180 if (pinCodePoint(start) <= pinCodePoint(end)) {
michael@0 1181 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0 1182 retain(range, 2, 2);
michael@0 1183 }
michael@0 1184 return *this;
michael@0 1185 }
michael@0 1186
michael@0 1187 /**
michael@0 1188 * Removes the specified character from this set if it is present.
michael@0 1189 * The set will not contain the specified range once the call
michael@0 1190 * returns.
michael@0 1191 */
michael@0 1192 UnicodeSet& UnicodeSet::remove(UChar32 c) {
michael@0 1193 return remove(c, c);
michael@0 1194 }
michael@0 1195
michael@0 1196 /**
michael@0 1197 * Removes the specified string from this set if it is present.
michael@0 1198 * The set will not contain the specified character once the call
michael@0 1199 * returns.
michael@0 1200 * @param the source string
michael@0 1201 * @return the modified set, for chaining
michael@0 1202 */
michael@0 1203 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
michael@0 1204 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
michael@0 1205 int32_t cp = getSingleCP(s);
michael@0 1206 if (cp < 0) {
michael@0 1207 strings->removeElement((void*) &s);
michael@0 1208 releasePattern();
michael@0 1209 } else {
michael@0 1210 remove((UChar32)cp, (UChar32)cp);
michael@0 1211 }
michael@0 1212 return *this;
michael@0 1213 }
michael@0 1214
michael@0 1215 /**
michael@0 1216 * Complements the specified range in this set. Any character in
michael@0 1217 * the range will be removed if it is in this set, or will be
michael@0 1218 * added if it is not in this set. If <code>end > start</code>
michael@0 1219 * then an empty range is xor'ed, leaving the set unchanged.
michael@0 1220 *
michael@0 1221 * @param start first character, inclusive, of range to be removed
michael@0 1222 * from this set.
michael@0 1223 * @param end last character, inclusive, of range to be removed
michael@0 1224 * from this set.
michael@0 1225 */
michael@0 1226 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
michael@0 1227 if (isFrozen() || isBogus()) {
michael@0 1228 return *this;
michael@0 1229 }
michael@0 1230 if (pinCodePoint(start) <= pinCodePoint(end)) {
michael@0 1231 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0 1232 exclusiveOr(range, 2, 0);
michael@0 1233 }
michael@0 1234 releasePattern();
michael@0 1235 return *this;
michael@0 1236 }
michael@0 1237
michael@0 1238 UnicodeSet& UnicodeSet::complement(UChar32 c) {
michael@0 1239 return complement(c, c);
michael@0 1240 }
michael@0 1241
michael@0 1242 /**
michael@0 1243 * This is equivalent to
michael@0 1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
michael@0 1245 */
michael@0 1246 UnicodeSet& UnicodeSet::complement(void) {
michael@0 1247 if (isFrozen() || isBogus()) {
michael@0 1248 return *this;
michael@0 1249 }
michael@0 1250 UErrorCode status = U_ZERO_ERROR;
michael@0 1251 if (list[0] == UNICODESET_LOW) {
michael@0 1252 ensureBufferCapacity(len-1, status);
michael@0 1253 if (U_FAILURE(status)) {
michael@0 1254 return *this;
michael@0 1255 }
michael@0 1256 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
michael@0 1257 --len;
michael@0 1258 } else {
michael@0 1259 ensureBufferCapacity(len+1, status);
michael@0 1260 if (U_FAILURE(status)) {
michael@0 1261 return *this;
michael@0 1262 }
michael@0 1263 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
michael@0 1264 buffer[0] = UNICODESET_LOW;
michael@0 1265 ++len;
michael@0 1266 }
michael@0 1267 swapBuffers();
michael@0 1268 releasePattern();
michael@0 1269 return *this;
michael@0 1270 }
michael@0 1271
michael@0 1272 /**
michael@0 1273 * Complement the specified string in this set.
michael@0 1274 * The set will not contain the specified string once the call
michael@0 1275 * returns.
michael@0 1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0 1277 * @param s the string to complement
michael@0 1278 * @return this object, for chaining
michael@0 1279 */
michael@0 1280 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
michael@0 1281 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
michael@0 1282 int32_t cp = getSingleCP(s);
michael@0 1283 if (cp < 0) {
michael@0 1284 if (strings->contains((void*) &s)) {
michael@0 1285 strings->removeElement((void*) &s);
michael@0 1286 } else {
michael@0 1287 _add(s);
michael@0 1288 }
michael@0 1289 releasePattern();
michael@0 1290 } else {
michael@0 1291 complement((UChar32)cp, (UChar32)cp);
michael@0 1292 }
michael@0 1293 return *this;
michael@0 1294 }
michael@0 1295
michael@0 1296 /**
michael@0 1297 * Adds all of the elements in the specified set to this set if
michael@0 1298 * they're not already present. This operation effectively
michael@0 1299 * modifies this set so that its value is the <i>union</i> of the two
michael@0 1300 * sets. The behavior of this operation is unspecified if the specified
michael@0 1301 * collection is modified while the operation is in progress.
michael@0 1302 *
michael@0 1303 * @param c set whose elements are to be added to this set.
michael@0 1304 * @see #add(char, char)
michael@0 1305 */
michael@0 1306 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
michael@0 1307 if ( c.len>0 && c.list!=NULL ) {
michael@0 1308 add(c.list, c.len, 0);
michael@0 1309 }
michael@0 1310
michael@0 1311 // Add strings in order
michael@0 1312 if ( c.strings!=NULL ) {
michael@0 1313 for (int32_t i=0; i<c.strings->size(); ++i) {
michael@0 1314 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
michael@0 1315 if (!strings->contains((void*) s)) {
michael@0 1316 _add(*s);
michael@0 1317 }
michael@0 1318 }
michael@0 1319 }
michael@0 1320 return *this;
michael@0 1321 }
michael@0 1322
michael@0 1323 /**
michael@0 1324 * Retains only the elements in this set that are contained in the
michael@0 1325 * specified set. In other words, removes from this set all of
michael@0 1326 * its elements that are not contained in the specified set. This
michael@0 1327 * operation effectively modifies this set so that its value is
michael@0 1328 * the <i>intersection</i> of the two sets.
michael@0 1329 *
michael@0 1330 * @param c set that defines which elements this set will retain.
michael@0 1331 */
michael@0 1332 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
michael@0 1333 if (isFrozen() || isBogus()) {
michael@0 1334 return *this;
michael@0 1335 }
michael@0 1336 retain(c.list, c.len, 0);
michael@0 1337 strings->retainAll(*c.strings);
michael@0 1338 return *this;
michael@0 1339 }
michael@0 1340
michael@0 1341 /**
michael@0 1342 * Removes from this set all of its elements that are contained in the
michael@0 1343 * specified set. This operation effectively modifies this
michael@0 1344 * set so that its value is the <i>asymmetric set difference</i> of
michael@0 1345 * the two sets.
michael@0 1346 *
michael@0 1347 * @param c set that defines which elements will be removed from
michael@0 1348 * this set.
michael@0 1349 */
michael@0 1350 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
michael@0 1351 if (isFrozen() || isBogus()) {
michael@0 1352 return *this;
michael@0 1353 }
michael@0 1354 retain(c.list, c.len, 2);
michael@0 1355 strings->removeAll(*c.strings);
michael@0 1356 return *this;
michael@0 1357 }
michael@0 1358
michael@0 1359 /**
michael@0 1360 * Complements in this set all elements contained in the specified
michael@0 1361 * set. Any character in the other set will be removed if it is
michael@0 1362 * in this set, or will be added if it is not in this set.
michael@0 1363 *
michael@0 1364 * @param c set that defines which elements will be xor'ed from
michael@0 1365 * this set.
michael@0 1366 */
michael@0 1367 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
michael@0 1368 if (isFrozen() || isBogus()) {
michael@0 1369 return *this;
michael@0 1370 }
michael@0 1371 exclusiveOr(c.list, c.len, 0);
michael@0 1372
michael@0 1373 for (int32_t i=0; i<c.strings->size(); ++i) {
michael@0 1374 void* e = c.strings->elementAt(i);
michael@0 1375 if (!strings->removeElement(e)) {
michael@0 1376 _add(*(const UnicodeString*)e);
michael@0 1377 }
michael@0 1378 }
michael@0 1379 return *this;
michael@0 1380 }
michael@0 1381
michael@0 1382 /**
michael@0 1383 * Removes all of the elements from this set. This set will be
michael@0 1384 * empty after this call returns.
michael@0 1385 */
michael@0 1386 UnicodeSet& UnicodeSet::clear(void) {
michael@0 1387 if (isFrozen()) {
michael@0 1388 return *this;
michael@0 1389 }
michael@0 1390 if (list != NULL) {
michael@0 1391 list[0] = UNICODESET_HIGH;
michael@0 1392 }
michael@0 1393 len = 1;
michael@0 1394 releasePattern();
michael@0 1395 if (strings != NULL) {
michael@0 1396 strings->removeAllElements();
michael@0 1397 }
michael@0 1398 if (list != NULL && strings != NULL) {
michael@0 1399 // Remove bogus
michael@0 1400 fFlags = 0;
michael@0 1401 }
michael@0 1402 return *this;
michael@0 1403 }
michael@0 1404
michael@0 1405 /**
michael@0 1406 * Iteration method that returns the number of ranges contained in
michael@0 1407 * this set.
michael@0 1408 * @see #getRangeStart
michael@0 1409 * @see #getRangeEnd
michael@0 1410 */
michael@0 1411 int32_t UnicodeSet::getRangeCount() const {
michael@0 1412 return len/2;
michael@0 1413 }
michael@0 1414
michael@0 1415 /**
michael@0 1416 * Iteration method that returns the first character in the
michael@0 1417 * specified range of this set.
michael@0 1418 * @see #getRangeCount
michael@0 1419 * @see #getRangeEnd
michael@0 1420 */
michael@0 1421 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
michael@0 1422 return list[index*2];
michael@0 1423 }
michael@0 1424
michael@0 1425 /**
michael@0 1426 * Iteration method that returns the last character in the
michael@0 1427 * specified range of this set.
michael@0 1428 * @see #getRangeStart
michael@0 1429 * @see #getRangeEnd
michael@0 1430 */
michael@0 1431 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
michael@0 1432 return list[index*2 + 1] - 1;
michael@0 1433 }
michael@0 1434
michael@0 1435 int32_t UnicodeSet::getStringCount() const {
michael@0 1436 return strings->size();
michael@0 1437 }
michael@0 1438
michael@0 1439 const UnicodeString* UnicodeSet::getString(int32_t index) const {
michael@0 1440 return (const UnicodeString*) strings->elementAt(index);
michael@0 1441 }
michael@0 1442
michael@0 1443 /**
michael@0 1444 * Reallocate this objects internal structures to take up the least
michael@0 1445 * possible space, without changing this object's value.
michael@0 1446 */
michael@0 1447 UnicodeSet& UnicodeSet::compact() {
michael@0 1448 if (isFrozen() || isBogus()) {
michael@0 1449 return *this;
michael@0 1450 }
michael@0 1451 // Delete buffer first to defragment memory less.
michael@0 1452 if (buffer != NULL) {
michael@0 1453 uprv_free(buffer);
michael@0 1454 buffer = NULL;
michael@0 1455 }
michael@0 1456 if (len < capacity) {
michael@0 1457 // Make the capacity equal to len or 1.
michael@0 1458 // We don't want to realloc of 0 size.
michael@0 1459 int32_t newCapacity = len + (len == 0);
michael@0 1460 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
michael@0 1461 if (temp) {
michael@0 1462 list = temp;
michael@0 1463 capacity = newCapacity;
michael@0 1464 }
michael@0 1465 // else what the heck happened?! We allocated less memory!
michael@0 1466 // Oh well. We'll keep our original array.
michael@0 1467 }
michael@0 1468 return *this;
michael@0 1469 }
michael@0 1470
michael@0 1471 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
michael@0 1472 int32_t bmpLength, length, destLength;
michael@0 1473
michael@0 1474 if (U_FAILURE(ec)) {
michael@0 1475 return 0;
michael@0 1476 }
michael@0 1477
michael@0 1478 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
michael@0 1479 ec=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1480 return 0;
michael@0 1481 }
michael@0 1482
michael@0 1483 /* count necessary 16-bit units */
michael@0 1484 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
michael@0 1485 // assert(length>=0);
michael@0 1486 if (length==0) {
michael@0 1487 /* empty set */
michael@0 1488 if (destCapacity>0) {
michael@0 1489 *dest=0;
michael@0 1490 } else {
michael@0 1491 ec=U_BUFFER_OVERFLOW_ERROR;
michael@0 1492 }
michael@0 1493 return 1;
michael@0 1494 }
michael@0 1495 /* now length>0 */
michael@0 1496
michael@0 1497 if (this->list[length-1]<=0xffff) {
michael@0 1498 /* all BMP */
michael@0 1499 bmpLength=length;
michael@0 1500 } else if (this->list[0]>=0x10000) {
michael@0 1501 /* all supplementary */
michael@0 1502 bmpLength=0;
michael@0 1503 length*=2;
michael@0 1504 } else {
michael@0 1505 /* some BMP, some supplementary */
michael@0 1506 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
michael@0 1507 length=bmpLength+2*(length-bmpLength);
michael@0 1508 }
michael@0 1509
michael@0 1510 /* length: number of 16-bit array units */
michael@0 1511 if (length>0x7fff) {
michael@0 1512 /* there are only 15 bits for the length in the first serialized word */
michael@0 1513 ec=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1514 return 0;
michael@0 1515 }
michael@0 1516
michael@0 1517 /*
michael@0 1518 * total serialized length:
michael@0 1519 * number of 16-bit array units (length) +
michael@0 1520 * 1 length unit (always) +
michael@0 1521 * 1 bmpLength unit (if there are supplementary values)
michael@0 1522 */
michael@0 1523 destLength=length+((length>bmpLength)?2:1);
michael@0 1524 if (destLength<=destCapacity) {
michael@0 1525 const UChar32 *p;
michael@0 1526 int32_t i;
michael@0 1527
michael@0 1528 *dest=(uint16_t)length;
michael@0 1529 if (length>bmpLength) {
michael@0 1530 *dest|=0x8000;
michael@0 1531 *++dest=(uint16_t)bmpLength;
michael@0 1532 }
michael@0 1533 ++dest;
michael@0 1534
michael@0 1535 /* write the BMP part of the array */
michael@0 1536 p=this->list;
michael@0 1537 for (i=0; i<bmpLength; ++i) {
michael@0 1538 *dest++=(uint16_t)*p++;
michael@0 1539 }
michael@0 1540
michael@0 1541 /* write the supplementary part of the array */
michael@0 1542 for (; i<length; i+=2) {
michael@0 1543 *dest++=(uint16_t)(*p>>16);
michael@0 1544 *dest++=(uint16_t)*p++;
michael@0 1545 }
michael@0 1546 } else {
michael@0 1547 ec=U_BUFFER_OVERFLOW_ERROR;
michael@0 1548 }
michael@0 1549 return destLength;
michael@0 1550 }
michael@0 1551
michael@0 1552 //----------------------------------------------------------------
michael@0 1553 // Implementation: Utility methods
michael@0 1554 //----------------------------------------------------------------
michael@0 1555
michael@0 1556 /**
michael@0 1557 * Allocate our strings vector and return TRUE if successful.
michael@0 1558 */
michael@0 1559 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
michael@0 1560 if (U_FAILURE(status)) {
michael@0 1561 return FALSE;
michael@0 1562 }
michael@0 1563 strings = new UVector(uprv_deleteUObject,
michael@0 1564 uhash_compareUnicodeString, 1, status);
michael@0 1565 if (strings == NULL) { // Check for memory allocation error.
michael@0 1566 status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1567 return FALSE;
michael@0 1568 }
michael@0 1569 if (U_FAILURE(status)) {
michael@0 1570 delete strings;
michael@0 1571 strings = NULL;
michael@0 1572 return FALSE;
michael@0 1573 }
michael@0 1574 return TRUE;
michael@0 1575 }
michael@0 1576
michael@0 1577 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
michael@0 1578 if (newLen <= capacity)
michael@0 1579 return;
michael@0 1580 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
michael@0 1581 if (temp == NULL) {
michael@0 1582 ec = U_MEMORY_ALLOCATION_ERROR;
michael@0 1583 setToBogus();
michael@0 1584 return;
michael@0 1585 }
michael@0 1586 list = temp;
michael@0 1587 capacity = newLen + GROW_EXTRA;
michael@0 1588 // else we keep the original contents on the memory failure.
michael@0 1589 }
michael@0 1590
michael@0 1591 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
michael@0 1592 if (buffer != NULL && newLen <= bufferCapacity)
michael@0 1593 return;
michael@0 1594 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
michael@0 1595 if (temp == NULL) {
michael@0 1596 ec = U_MEMORY_ALLOCATION_ERROR;
michael@0 1597 setToBogus();
michael@0 1598 return;
michael@0 1599 }
michael@0 1600 buffer = temp;
michael@0 1601 bufferCapacity = newLen + GROW_EXTRA;
michael@0 1602 // else we keep the original contents on the memory failure.
michael@0 1603 }
michael@0 1604
michael@0 1605 /**
michael@0 1606 * Swap list and buffer.
michael@0 1607 */
michael@0 1608 void UnicodeSet::swapBuffers(void) {
michael@0 1609 // swap list and buffer
michael@0 1610 UChar32* temp = list;
michael@0 1611 list = buffer;
michael@0 1612 buffer = temp;
michael@0 1613
michael@0 1614 int32_t c = capacity;
michael@0 1615 capacity = bufferCapacity;
michael@0 1616 bufferCapacity = c;
michael@0 1617 }
michael@0 1618
michael@0 1619 void UnicodeSet::setToBogus() {
michael@0 1620 clear(); // Remove everything in the set.
michael@0 1621 fFlags = kIsBogus;
michael@0 1622 }
michael@0 1623
michael@0 1624 //----------------------------------------------------------------
michael@0 1625 // Implementation: Fundamental operators
michael@0 1626 //----------------------------------------------------------------
michael@0 1627
michael@0 1628 static inline UChar32 max(UChar32 a, UChar32 b) {
michael@0 1629 return (a > b) ? a : b;
michael@0 1630 }
michael@0 1631
michael@0 1632 // polarity = 0, 3 is normal: x xor y
michael@0 1633 // polarity = 1, 2: x xor ~y == x === y
michael@0 1634
michael@0 1635 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0 1636 if (isFrozen() || isBogus()) {
michael@0 1637 return;
michael@0 1638 }
michael@0 1639 UErrorCode status = U_ZERO_ERROR;
michael@0 1640 ensureBufferCapacity(len + otherLen, status);
michael@0 1641 if (U_FAILURE(status)) {
michael@0 1642 return;
michael@0 1643 }
michael@0 1644
michael@0 1645 int32_t i = 0, j = 0, k = 0;
michael@0 1646 UChar32 a = list[i++];
michael@0 1647 UChar32 b;
michael@0 1648 if (polarity == 1 || polarity == 2) {
michael@0 1649 b = UNICODESET_LOW;
michael@0 1650 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
michael@0 1651 ++j;
michael@0 1652 b = other[j];
michael@0 1653 }
michael@0 1654 } else {
michael@0 1655 b = other[j++];
michael@0 1656 }
michael@0 1657 // simplest of all the routines
michael@0 1658 // sort the values, discarding identicals!
michael@0 1659 for (;;) {
michael@0 1660 if (a < b) {
michael@0 1661 buffer[k++] = a;
michael@0 1662 a = list[i++];
michael@0 1663 } else if (b < a) {
michael@0 1664 buffer[k++] = b;
michael@0 1665 b = other[j++];
michael@0 1666 } else if (a != UNICODESET_HIGH) { // at this point, a == b
michael@0 1667 // discard both values!
michael@0 1668 a = list[i++];
michael@0 1669 b = other[j++];
michael@0 1670 } else { // DONE!
michael@0 1671 buffer[k++] = UNICODESET_HIGH;
michael@0 1672 len = k;
michael@0 1673 break;
michael@0 1674 }
michael@0 1675 }
michael@0 1676 swapBuffers();
michael@0 1677 releasePattern();
michael@0 1678 }
michael@0 1679
michael@0 1680 // polarity = 0 is normal: x union y
michael@0 1681 // polarity = 2: x union ~y
michael@0 1682 // polarity = 1: ~x union y
michael@0 1683 // polarity = 3: ~x union ~y
michael@0 1684
michael@0 1685 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0 1686 if (isFrozen() || isBogus() || other==NULL) {
michael@0 1687 return;
michael@0 1688 }
michael@0 1689 UErrorCode status = U_ZERO_ERROR;
michael@0 1690 ensureBufferCapacity(len + otherLen, status);
michael@0 1691 if (U_FAILURE(status)) {
michael@0 1692 return;
michael@0 1693 }
michael@0 1694
michael@0 1695 int32_t i = 0, j = 0, k = 0;
michael@0 1696 UChar32 a = list[i++];
michael@0 1697 UChar32 b = other[j++];
michael@0 1698 // change from xor is that we have to check overlapping pairs
michael@0 1699 // polarity bit 1 means a is second, bit 2 means b is.
michael@0 1700 for (;;) {
michael@0 1701 switch (polarity) {
michael@0 1702 case 0: // both first; take lower if unequal
michael@0 1703 if (a < b) { // take a
michael@0 1704 // Back up over overlapping ranges in buffer[]
michael@0 1705 if (k > 0 && a <= buffer[k-1]) {
michael@0 1706 // Pick latter end value in buffer[] vs. list[]
michael@0 1707 a = max(list[i], buffer[--k]);
michael@0 1708 } else {
michael@0 1709 // No overlap
michael@0 1710 buffer[k++] = a;
michael@0 1711 a = list[i];
michael@0 1712 }
michael@0 1713 i++; // Common if/else code factored out
michael@0 1714 polarity ^= 1;
michael@0 1715 } else if (b < a) { // take b
michael@0 1716 if (k > 0 && b <= buffer[k-1]) {
michael@0 1717 b = max(other[j], buffer[--k]);
michael@0 1718 } else {
michael@0 1719 buffer[k++] = b;
michael@0 1720 b = other[j];
michael@0 1721 }
michael@0 1722 j++;
michael@0 1723 polarity ^= 2;
michael@0 1724 } else { // a == b, take a, drop b
michael@0 1725 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1726 // This is symmetrical; it doesn't matter if
michael@0 1727 // we backtrack with a or b. - liu
michael@0 1728 if (k > 0 && a <= buffer[k-1]) {
michael@0 1729 a = max(list[i], buffer[--k]);
michael@0 1730 } else {
michael@0 1731 // No overlap
michael@0 1732 buffer[k++] = a;
michael@0 1733 a = list[i];
michael@0 1734 }
michael@0 1735 i++;
michael@0 1736 polarity ^= 1;
michael@0 1737 b = other[j++];
michael@0 1738 polarity ^= 2;
michael@0 1739 }
michael@0 1740 break;
michael@0 1741 case 3: // both second; take higher if unequal, and drop other
michael@0 1742 if (b <= a) { // take a
michael@0 1743 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1744 buffer[k++] = a;
michael@0 1745 } else { // take b
michael@0 1746 if (b == UNICODESET_HIGH) goto loop_end;
michael@0 1747 buffer[k++] = b;
michael@0 1748 }
michael@0 1749 a = list[i++];
michael@0 1750 polarity ^= 1; // factored common code
michael@0 1751 b = other[j++];
michael@0 1752 polarity ^= 2;
michael@0 1753 break;
michael@0 1754 case 1: // a second, b first; if b < a, overlap
michael@0 1755 if (a < b) { // no overlap, take a
michael@0 1756 buffer[k++] = a; a = list[i++]; polarity ^= 1;
michael@0 1757 } else if (b < a) { // OVERLAP, drop b
michael@0 1758 b = other[j++];
michael@0 1759 polarity ^= 2;
michael@0 1760 } else { // a == b, drop both!
michael@0 1761 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1762 a = list[i++];
michael@0 1763 polarity ^= 1;
michael@0 1764 b = other[j++];
michael@0 1765 polarity ^= 2;
michael@0 1766 }
michael@0 1767 break;
michael@0 1768 case 2: // a first, b second; if a < b, overlap
michael@0 1769 if (b < a) { // no overlap, take b
michael@0 1770 buffer[k++] = b;
michael@0 1771 b = other[j++];
michael@0 1772 polarity ^= 2;
michael@0 1773 } else if (a < b) { // OVERLAP, drop a
michael@0 1774 a = list[i++];
michael@0 1775 polarity ^= 1;
michael@0 1776 } else { // a == b, drop both!
michael@0 1777 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1778 a = list[i++];
michael@0 1779 polarity ^= 1;
michael@0 1780 b = other[j++];
michael@0 1781 polarity ^= 2;
michael@0 1782 }
michael@0 1783 break;
michael@0 1784 }
michael@0 1785 }
michael@0 1786 loop_end:
michael@0 1787 buffer[k++] = UNICODESET_HIGH; // terminate
michael@0 1788 len = k;
michael@0 1789 swapBuffers();
michael@0 1790 releasePattern();
michael@0 1791 }
michael@0 1792
michael@0 1793 // polarity = 0 is normal: x intersect y
michael@0 1794 // polarity = 2: x intersect ~y == set-minus
michael@0 1795 // polarity = 1: ~x intersect y
michael@0 1796 // polarity = 3: ~x intersect ~y
michael@0 1797
michael@0 1798 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0 1799 if (isFrozen() || isBogus()) {
michael@0 1800 return;
michael@0 1801 }
michael@0 1802 UErrorCode status = U_ZERO_ERROR;
michael@0 1803 ensureBufferCapacity(len + otherLen, status);
michael@0 1804 if (U_FAILURE(status)) {
michael@0 1805 return;
michael@0 1806 }
michael@0 1807
michael@0 1808 int32_t i = 0, j = 0, k = 0;
michael@0 1809 UChar32 a = list[i++];
michael@0 1810 UChar32 b = other[j++];
michael@0 1811 // change from xor is that we have to check overlapping pairs
michael@0 1812 // polarity bit 1 means a is second, bit 2 means b is.
michael@0 1813 for (;;) {
michael@0 1814 switch (polarity) {
michael@0 1815 case 0: // both first; drop the smaller
michael@0 1816 if (a < b) { // drop a
michael@0 1817 a = list[i++];
michael@0 1818 polarity ^= 1;
michael@0 1819 } else if (b < a) { // drop b
michael@0 1820 b = other[j++];
michael@0 1821 polarity ^= 2;
michael@0 1822 } else { // a == b, take one, drop other
michael@0 1823 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1824 buffer[k++] = a;
michael@0 1825 a = list[i++];
michael@0 1826 polarity ^= 1;
michael@0 1827 b = other[j++];
michael@0 1828 polarity ^= 2;
michael@0 1829 }
michael@0 1830 break;
michael@0 1831 case 3: // both second; take lower if unequal
michael@0 1832 if (a < b) { // take a
michael@0 1833 buffer[k++] = a;
michael@0 1834 a = list[i++];
michael@0 1835 polarity ^= 1;
michael@0 1836 } else if (b < a) { // take b
michael@0 1837 buffer[k++] = b;
michael@0 1838 b = other[j++];
michael@0 1839 polarity ^= 2;
michael@0 1840 } else { // a == b, take one, drop other
michael@0 1841 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1842 buffer[k++] = a;
michael@0 1843 a = list[i++];
michael@0 1844 polarity ^= 1;
michael@0 1845 b = other[j++];
michael@0 1846 polarity ^= 2;
michael@0 1847 }
michael@0 1848 break;
michael@0 1849 case 1: // a second, b first;
michael@0 1850 if (a < b) { // NO OVERLAP, drop a
michael@0 1851 a = list[i++];
michael@0 1852 polarity ^= 1;
michael@0 1853 } else if (b < a) { // OVERLAP, take b
michael@0 1854 buffer[k++] = b;
michael@0 1855 b = other[j++];
michael@0 1856 polarity ^= 2;
michael@0 1857 } else { // a == b, drop both!
michael@0 1858 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1859 a = list[i++];
michael@0 1860 polarity ^= 1;
michael@0 1861 b = other[j++];
michael@0 1862 polarity ^= 2;
michael@0 1863 }
michael@0 1864 break;
michael@0 1865 case 2: // a first, b second; if a < b, overlap
michael@0 1866 if (b < a) { // no overlap, drop b
michael@0 1867 b = other[j++];
michael@0 1868 polarity ^= 2;
michael@0 1869 } else if (a < b) { // OVERLAP, take a
michael@0 1870 buffer[k++] = a;
michael@0 1871 a = list[i++];
michael@0 1872 polarity ^= 1;
michael@0 1873 } else { // a == b, drop both!
michael@0 1874 if (a == UNICODESET_HIGH) goto loop_end;
michael@0 1875 a = list[i++];
michael@0 1876 polarity ^= 1;
michael@0 1877 b = other[j++];
michael@0 1878 polarity ^= 2;
michael@0 1879 }
michael@0 1880 break;
michael@0 1881 }
michael@0 1882 }
michael@0 1883 loop_end:
michael@0 1884 buffer[k++] = UNICODESET_HIGH; // terminate
michael@0 1885 len = k;
michael@0 1886 swapBuffers();
michael@0 1887 releasePattern();
michael@0 1888 }
michael@0 1889
michael@0 1890 /**
michael@0 1891 * Append the <code>toPattern()</code> representation of a
michael@0 1892 * string to the given <code>StringBuffer</code>.
michael@0 1893 */
michael@0 1894 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
michael@0 1895 escapeUnprintable) {
michael@0 1896 UChar32 cp;
michael@0 1897 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
michael@0 1898 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
michael@0 1899 }
michael@0 1900 }
michael@0 1901
michael@0 1902 /**
michael@0 1903 * Append the <code>toPattern()</code> representation of a
michael@0 1904 * character to the given <code>StringBuffer</code>.
michael@0 1905 */
michael@0 1906 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
michael@0 1907 escapeUnprintable) {
michael@0 1908 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
michael@0 1909 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
michael@0 1910 // unprintable
michael@0 1911 if (ICU_Utility::escapeUnprintable(buf, c)) {
michael@0 1912 return;
michael@0 1913 }
michael@0 1914 }
michael@0 1915 // Okay to let ':' pass through
michael@0 1916 switch (c) {
michael@0 1917 case SET_OPEN:
michael@0 1918 case SET_CLOSE:
michael@0 1919 case HYPHEN:
michael@0 1920 case COMPLEMENT:
michael@0 1921 case INTERSECTION:
michael@0 1922 case BACKSLASH:
michael@0 1923 case OPEN_BRACE:
michael@0 1924 case CLOSE_BRACE:
michael@0 1925 case COLON:
michael@0 1926 case SymbolTable::SYMBOL_REF:
michael@0 1927 buf.append(BACKSLASH);
michael@0 1928 break;
michael@0 1929 default:
michael@0 1930 // Escape whitespace
michael@0 1931 if (PatternProps::isWhiteSpace(c)) {
michael@0 1932 buf.append(BACKSLASH);
michael@0 1933 }
michael@0 1934 break;
michael@0 1935 }
michael@0 1936 buf.append(c);
michael@0 1937 }
michael@0 1938
michael@0 1939 /**
michael@0 1940 * Append a string representation of this set to result. This will be
michael@0 1941 * a cleaned version of the string passed to applyPattern(), if there
michael@0 1942 * is one. Otherwise it will be generated.
michael@0 1943 */
michael@0 1944 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
michael@0 1945 UBool escapeUnprintable) const
michael@0 1946 {
michael@0 1947 if (pat != NULL) {
michael@0 1948 int32_t i;
michael@0 1949 int32_t backslashCount = 0;
michael@0 1950 for (i=0; i<patLen; ) {
michael@0 1951 UChar32 c;
michael@0 1952 U16_NEXT(pat, i, patLen, c);
michael@0 1953 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
michael@0 1954 // If the unprintable character is preceded by an odd
michael@0 1955 // number of backslashes, then it has been escaped.
michael@0 1956 // Before unescaping it, we delete the final
michael@0 1957 // backslash.
michael@0 1958 if ((backslashCount % 2) == 1) {
michael@0 1959 result.truncate(result.length() - 1);
michael@0 1960 }
michael@0 1961 ICU_Utility::escapeUnprintable(result, c);
michael@0 1962 backslashCount = 0;
michael@0 1963 } else {
michael@0 1964 result.append(c);
michael@0 1965 if (c == BACKSLASH) {
michael@0 1966 ++backslashCount;
michael@0 1967 } else {
michael@0 1968 backslashCount = 0;
michael@0 1969 }
michael@0 1970 }
michael@0 1971 }
michael@0 1972 return result;
michael@0 1973 }
michael@0 1974
michael@0 1975 return _generatePattern(result, escapeUnprintable);
michael@0 1976 }
michael@0 1977
michael@0 1978 /**
michael@0 1979 * Returns a string representation of this set. If the result of
michael@0 1980 * calling this function is passed to a UnicodeSet constructor, it
michael@0 1981 * will produce another set that is equal to this one.
michael@0 1982 */
michael@0 1983 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
michael@0 1984 UBool escapeUnprintable) const
michael@0 1985 {
michael@0 1986 result.truncate(0);
michael@0 1987 return _toPattern(result, escapeUnprintable);
michael@0 1988 }
michael@0 1989
michael@0 1990 /**
michael@0 1991 * Generate and append a string representation of this set to result.
michael@0 1992 * This does not use this.pat, the cleaned up copy of the string
michael@0 1993 * passed to applyPattern().
michael@0 1994 */
michael@0 1995 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
michael@0 1996 UBool escapeUnprintable) const
michael@0 1997 {
michael@0 1998 result.append(SET_OPEN);
michael@0 1999
michael@0 2000 // // Check against the predefined categories. We implicitly build
michael@0 2001 // // up ALL category sets the first time toPattern() is called.
michael@0 2002 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
michael@0 2003 // if (*this == getCategorySet(cat)) {
michael@0 2004 // result.append(COLON);
michael@0 2005 // result.append(CATEGORY_NAMES, cat*2, 2);
michael@0 2006 // return result.append(CATEGORY_CLOSE);
michael@0 2007 // }
michael@0 2008 // }
michael@0 2009
michael@0 2010 int32_t count = getRangeCount();
michael@0 2011
michael@0 2012 // If the set contains at least 2 intervals and includes both
michael@0 2013 // MIN_VALUE and MAX_VALUE, then the inverse representation will
michael@0 2014 // be more economical.
michael@0 2015 if (count > 1 &&
michael@0 2016 getRangeStart(0) == MIN_VALUE &&
michael@0 2017 getRangeEnd(count-1) == MAX_VALUE) {
michael@0 2018
michael@0 2019 // Emit the inverse
michael@0 2020 result.append(COMPLEMENT);
michael@0 2021
michael@0 2022 for (int32_t i = 1; i < count; ++i) {
michael@0 2023 UChar32 start = getRangeEnd(i-1)+1;
michael@0 2024 UChar32 end = getRangeStart(i)-1;
michael@0 2025 _appendToPat(result, start, escapeUnprintable);
michael@0 2026 if (start != end) {
michael@0 2027 if ((start+1) != end) {
michael@0 2028 result.append(HYPHEN);
michael@0 2029 }
michael@0 2030 _appendToPat(result, end, escapeUnprintable);
michael@0 2031 }
michael@0 2032 }
michael@0 2033 }
michael@0 2034
michael@0 2035 // Default; emit the ranges as pairs
michael@0 2036 else {
michael@0 2037 for (int32_t i = 0; i < count; ++i) {
michael@0 2038 UChar32 start = getRangeStart(i);
michael@0 2039 UChar32 end = getRangeEnd(i);
michael@0 2040 _appendToPat(result, start, escapeUnprintable);
michael@0 2041 if (start != end) {
michael@0 2042 if ((start+1) != end) {
michael@0 2043 result.append(HYPHEN);
michael@0 2044 }
michael@0 2045 _appendToPat(result, end, escapeUnprintable);
michael@0 2046 }
michael@0 2047 }
michael@0 2048 }
michael@0 2049
michael@0 2050 for (int32_t i = 0; i<strings->size(); ++i) {
michael@0 2051 result.append(OPEN_BRACE);
michael@0 2052 _appendToPat(result,
michael@0 2053 *(const UnicodeString*) strings->elementAt(i),
michael@0 2054 escapeUnprintable);
michael@0 2055 result.append(CLOSE_BRACE);
michael@0 2056 }
michael@0 2057 return result.append(SET_CLOSE);
michael@0 2058 }
michael@0 2059
michael@0 2060 /**
michael@0 2061 * Release existing cached pattern
michael@0 2062 */
michael@0 2063 void UnicodeSet::releasePattern() {
michael@0 2064 if (pat) {
michael@0 2065 uprv_free(pat);
michael@0 2066 pat = NULL;
michael@0 2067 patLen = 0;
michael@0 2068 }
michael@0 2069 }
michael@0 2070
michael@0 2071 /**
michael@0 2072 * Set the new pattern to cache.
michael@0 2073 */
michael@0 2074 void UnicodeSet::setPattern(const UnicodeString& newPat) {
michael@0 2075 releasePattern();
michael@0 2076 int32_t newPatLen = newPat.length();
michael@0 2077 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
michael@0 2078 if (pat) {
michael@0 2079 patLen = newPatLen;
michael@0 2080 newPat.extractBetween(0, patLen, pat);
michael@0 2081 pat[patLen] = 0;
michael@0 2082 }
michael@0 2083 // else we don't care if malloc failed. This was just a nice cache.
michael@0 2084 // We can regenerate an equivalent pattern later when requested.
michael@0 2085 }
michael@0 2086
michael@0 2087 UnicodeFunctor *UnicodeSet::freeze() {
michael@0 2088 if(!isFrozen() && !isBogus()) {
michael@0 2089 // Do most of what compact() does before freezing because
michael@0 2090 // compact() will not work when the set is frozen.
michael@0 2091 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
michael@0 2092
michael@0 2093 // Delete buffer first to defragment memory less.
michael@0 2094 if (buffer != NULL) {
michael@0 2095 uprv_free(buffer);
michael@0 2096 buffer = NULL;
michael@0 2097 }
michael@0 2098 if (capacity > (len + GROW_EXTRA)) {
michael@0 2099 // Make the capacity equal to len or 1.
michael@0 2100 // We don't want to realloc of 0 size.
michael@0 2101 capacity = len + (len == 0);
michael@0 2102 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
michael@0 2103 if (list == NULL) { // Check for memory allocation error.
michael@0 2104 setToBogus();
michael@0 2105 return this;
michael@0 2106 }
michael@0 2107 }
michael@0 2108
michael@0 2109 // Optimize contains() and span() and similar functions.
michael@0 2110 if (!strings->isEmpty()) {
michael@0 2111 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
michael@0 2112 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
michael@0 2113 // All strings are irrelevant for span() etc. because
michael@0 2114 // all of each string's code points are contained in this set.
michael@0 2115 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
michael@0 2116 // many relevant strings as UTF-16.
michael@0 2117 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
michael@0 2118 delete stringSpan;
michael@0 2119 stringSpan = NULL;
michael@0 2120 }
michael@0 2121 }
michael@0 2122 if (stringSpan == NULL) {
michael@0 2123 // No span-relevant strings: Optimize for code point spans.
michael@0 2124 bmpSet=new BMPSet(list, len);
michael@0 2125 if (bmpSet == NULL) { // Check for memory allocation error.
michael@0 2126 setToBogus();
michael@0 2127 }
michael@0 2128 }
michael@0 2129 }
michael@0 2130 return this;
michael@0 2131 }
michael@0 2132
michael@0 2133 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 2134 if(length>0 && bmpSet!=NULL) {
michael@0 2135 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
michael@0 2136 }
michael@0 2137 if(length<0) {
michael@0 2138 length=u_strlen(s);
michael@0 2139 }
michael@0 2140 if(length==0) {
michael@0 2141 return 0;
michael@0 2142 }
michael@0 2143 if(stringSpan!=NULL) {
michael@0 2144 return stringSpan->span(s, length, spanCondition);
michael@0 2145 } else if(!strings->isEmpty()) {
michael@0 2146 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
michael@0 2147 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
michael@0 2148 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
michael@0 2149 UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0 2150 if(strSpan.needsStringSpanUTF16()) {
michael@0 2151 return strSpan.span(s, length, spanCondition);
michael@0 2152 }
michael@0 2153 }
michael@0 2154
michael@0 2155 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 2156 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 2157 }
michael@0 2158
michael@0 2159 UChar32 c;
michael@0 2160 int32_t start=0, prev=0;
michael@0 2161 do {
michael@0 2162 U16_NEXT(s, start, length, c);
michael@0 2163 if(spanCondition!=contains(c)) {
michael@0 2164 break;
michael@0 2165 }
michael@0 2166 } while((prev=start)<length);
michael@0 2167 return prev;
michael@0 2168 }
michael@0 2169
michael@0 2170 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 2171 if(length>0 && bmpSet!=NULL) {
michael@0 2172 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
michael@0 2173 }
michael@0 2174 if(length<0) {
michael@0 2175 length=u_strlen(s);
michael@0 2176 }
michael@0 2177 if(length==0) {
michael@0 2178 return 0;
michael@0 2179 }
michael@0 2180 if(stringSpan!=NULL) {
michael@0 2181 return stringSpan->spanBack(s, length, spanCondition);
michael@0 2182 } else if(!strings->isEmpty()) {
michael@0 2183 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
michael@0 2184 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
michael@0 2185 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
michael@0 2186 UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0 2187 if(strSpan.needsStringSpanUTF16()) {
michael@0 2188 return strSpan.spanBack(s, length, spanCondition);
michael@0 2189 }
michael@0 2190 }
michael@0 2191
michael@0 2192 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 2193 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 2194 }
michael@0 2195
michael@0 2196 UChar32 c;
michael@0 2197 int32_t prev=length;
michael@0 2198 do {
michael@0 2199 U16_PREV(s, 0, length, c);
michael@0 2200 if(spanCondition!=contains(c)) {
michael@0 2201 break;
michael@0 2202 }
michael@0 2203 } while((prev=length)>0);
michael@0 2204 return prev;
michael@0 2205 }
michael@0 2206
michael@0 2207 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 2208 if(length>0 && bmpSet!=NULL) {
michael@0 2209 const uint8_t *s0=(const uint8_t *)s;
michael@0 2210 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
michael@0 2211 }
michael@0 2212 if(length<0) {
michael@0 2213 length=(int32_t)uprv_strlen(s);
michael@0 2214 }
michael@0 2215 if(length==0) {
michael@0 2216 return 0;
michael@0 2217 }
michael@0 2218 if(stringSpan!=NULL) {
michael@0 2219 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
michael@0 2220 } else if(!strings->isEmpty()) {
michael@0 2221 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
michael@0 2222 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
michael@0 2223 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
michael@0 2224 UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0 2225 if(strSpan.needsStringSpanUTF8()) {
michael@0 2226 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
michael@0 2227 }
michael@0 2228 }
michael@0 2229
michael@0 2230 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 2231 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 2232 }
michael@0 2233
michael@0 2234 UChar32 c;
michael@0 2235 int32_t start=0, prev=0;
michael@0 2236 do {
michael@0 2237 U8_NEXT_OR_FFFD(s, start, length, c);
michael@0 2238 if(spanCondition!=contains(c)) {
michael@0 2239 break;
michael@0 2240 }
michael@0 2241 } while((prev=start)<length);
michael@0 2242 return prev;
michael@0 2243 }
michael@0 2244
michael@0 2245 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 2246 if(length>0 && bmpSet!=NULL) {
michael@0 2247 const uint8_t *s0=(const uint8_t *)s;
michael@0 2248 return bmpSet->spanBackUTF8(s0, length, spanCondition);
michael@0 2249 }
michael@0 2250 if(length<0) {
michael@0 2251 length=(int32_t)uprv_strlen(s);
michael@0 2252 }
michael@0 2253 if(length==0) {
michael@0 2254 return 0;
michael@0 2255 }
michael@0 2256 if(stringSpan!=NULL) {
michael@0 2257 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
michael@0 2258 } else if(!strings->isEmpty()) {
michael@0 2259 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
michael@0 2260 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
michael@0 2261 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
michael@0 2262 UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0 2263 if(strSpan.needsStringSpanUTF8()) {
michael@0 2264 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
michael@0 2265 }
michael@0 2266 }
michael@0 2267
michael@0 2268 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 2269 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 2270 }
michael@0 2271
michael@0 2272 UChar32 c;
michael@0 2273 int32_t prev=length;
michael@0 2274 do {
michael@0 2275 U8_PREV_OR_FFFD(s, 0, length, c);
michael@0 2276 if(spanCondition!=contains(c)) {
michael@0 2277 break;
michael@0 2278 }
michael@0 2279 } while((prev=length)>0);
michael@0 2280 return prev;
michael@0 2281 }
michael@0 2282
michael@0 2283 U_NAMESPACE_END

mercurial