Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 1999-2012, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * Date Name Description |
michael@0 | 7 | * 10/20/99 alan Creation. |
michael@0 | 8 | ********************************************************************** |
michael@0 | 9 | */ |
michael@0 | 10 | |
michael@0 | 11 | #include "unicode/utypes.h" |
michael@0 | 12 | #include "unicode/parsepos.h" |
michael@0 | 13 | #include "unicode/symtable.h" |
michael@0 | 14 | #include "unicode/uniset.h" |
michael@0 | 15 | #include "unicode/utf8.h" |
michael@0 | 16 | #include "unicode/utf16.h" |
michael@0 | 17 | #include "ruleiter.h" |
michael@0 | 18 | #include "cmemory.h" |
michael@0 | 19 | #include "cstring.h" |
michael@0 | 20 | #include "patternprops.h" |
michael@0 | 21 | #include "uelement.h" |
michael@0 | 22 | #include "util.h" |
michael@0 | 23 | #include "uvector.h" |
michael@0 | 24 | #include "charstr.h" |
michael@0 | 25 | #include "ustrfmt.h" |
michael@0 | 26 | #include "uassert.h" |
michael@0 | 27 | #include "bmpset.h" |
michael@0 | 28 | #include "unisetspan.h" |
michael@0 | 29 | |
michael@0 | 30 | // Define UChar constants using hex for EBCDIC compatibility |
michael@0 | 31 | // Used #define to reduce private static exports and memory access time. |
michael@0 | 32 | #define SET_OPEN ((UChar)0x005B) /*[*/ |
michael@0 | 33 | #define SET_CLOSE ((UChar)0x005D) /*]*/ |
michael@0 | 34 | #define HYPHEN ((UChar)0x002D) /*-*/ |
michael@0 | 35 | #define COMPLEMENT ((UChar)0x005E) /*^*/ |
michael@0 | 36 | #define COLON ((UChar)0x003A) /*:*/ |
michael@0 | 37 | #define BACKSLASH ((UChar)0x005C) /*\*/ |
michael@0 | 38 | #define INTERSECTION ((UChar)0x0026) /*&*/ |
michael@0 | 39 | #define UPPER_U ((UChar)0x0055) /*U*/ |
michael@0 | 40 | #define LOWER_U ((UChar)0x0075) /*u*/ |
michael@0 | 41 | #define OPEN_BRACE ((UChar)123) /*{*/ |
michael@0 | 42 | #define CLOSE_BRACE ((UChar)125) /*}*/ |
michael@0 | 43 | #define UPPER_P ((UChar)0x0050) /*P*/ |
michael@0 | 44 | #define LOWER_P ((UChar)0x0070) /*p*/ |
michael@0 | 45 | #define UPPER_N ((UChar)78) /*N*/ |
michael@0 | 46 | #define EQUALS ((UChar)0x003D) /*=*/ |
michael@0 | 47 | |
michael@0 | 48 | // HIGH_VALUE > all valid values. 110000 for codepoints |
michael@0 | 49 | #define UNICODESET_HIGH 0x0110000 |
michael@0 | 50 | |
michael@0 | 51 | // LOW <= all valid values. ZERO for codepoints |
michael@0 | 52 | #define UNICODESET_LOW 0x000000 |
michael@0 | 53 | |
michael@0 | 54 | // initial storage. Must be >= 0 |
michael@0 | 55 | #define START_EXTRA 16 |
michael@0 | 56 | |
michael@0 | 57 | // extra amount for growth. Must be >= 0 |
michael@0 | 58 | #define GROW_EXTRA START_EXTRA |
michael@0 | 59 | |
michael@0 | 60 | U_NAMESPACE_BEGIN |
michael@0 | 61 | |
michael@0 | 62 | SymbolTable::~SymbolTable() {} |
michael@0 | 63 | |
michael@0 | 64 | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet) |
michael@0 | 65 | |
michael@0 | 66 | /** |
michael@0 | 67 | * Modify the given UChar32 variable so that it is in range, by |
michael@0 | 68 | * pinning values < UNICODESET_LOW to UNICODESET_LOW, and |
michael@0 | 69 | * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1. |
michael@0 | 70 | * It modifies its argument in-place and also returns it. |
michael@0 | 71 | */ |
michael@0 | 72 | static inline UChar32 pinCodePoint(UChar32& c) { |
michael@0 | 73 | if (c < UNICODESET_LOW) { |
michael@0 | 74 | c = UNICODESET_LOW; |
michael@0 | 75 | } else if (c > (UNICODESET_HIGH-1)) { |
michael@0 | 76 | c = (UNICODESET_HIGH-1); |
michael@0 | 77 | } |
michael@0 | 78 | return c; |
michael@0 | 79 | } |
michael@0 | 80 | |
michael@0 | 81 | //---------------------------------------------------------------- |
michael@0 | 82 | // Debugging |
michael@0 | 83 | //---------------------------------------------------------------- |
michael@0 | 84 | |
michael@0 | 85 | // DO NOT DELETE THIS CODE. This code is used to debug memory leaks. |
michael@0 | 86 | // To enable the debugging, define the symbol DEBUG_MEM in the line |
michael@0 | 87 | // below. This will result in text being sent to stdout that looks |
michael@0 | 88 | // like this: |
michael@0 | 89 | // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85- |
michael@0 | 90 | // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85- |
michael@0 | 91 | // Each line lists a construction (ct) or destruction (dt) event, the |
michael@0 | 92 | // object address, the number of outstanding objects after the event, |
michael@0 | 93 | // and the pattern of the object in question. |
michael@0 | 94 | |
michael@0 | 95 | // #define DEBUG_MEM |
michael@0 | 96 | |
michael@0 | 97 | #ifdef DEBUG_MEM |
michael@0 | 98 | #include <stdio.h> |
michael@0 | 99 | static int32_t _dbgCount = 0; |
michael@0 | 100 | |
michael@0 | 101 | static inline void _dbgct(UnicodeSet* set) { |
michael@0 | 102 | UnicodeString str; |
michael@0 | 103 | set->toPattern(str, TRUE); |
michael@0 | 104 | char buf[40]; |
michael@0 | 105 | str.extract(0, 39, buf, ""); |
michael@0 | 106 | printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf); |
michael@0 | 107 | } |
michael@0 | 108 | |
michael@0 | 109 | static inline void _dbgdt(UnicodeSet* set) { |
michael@0 | 110 | UnicodeString str; |
michael@0 | 111 | set->toPattern(str, TRUE); |
michael@0 | 112 | char buf[40]; |
michael@0 | 113 | str.extract(0, 39, buf, ""); |
michael@0 | 114 | printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf); |
michael@0 | 115 | } |
michael@0 | 116 | |
michael@0 | 117 | #else |
michael@0 | 118 | |
michael@0 | 119 | #define _dbgct(set) |
michael@0 | 120 | #define _dbgdt(set) |
michael@0 | 121 | |
michael@0 | 122 | #endif |
michael@0 | 123 | |
michael@0 | 124 | //---------------------------------------------------------------- |
michael@0 | 125 | // UnicodeString in UVector support |
michael@0 | 126 | //---------------------------------------------------------------- |
michael@0 | 127 | |
michael@0 | 128 | static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) { |
michael@0 | 129 | dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer); |
michael@0 | 130 | } |
michael@0 | 131 | |
michael@0 | 132 | static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { |
michael@0 | 133 | const UnicodeString &a = *(const UnicodeString*)t1.pointer; |
michael@0 | 134 | const UnicodeString &b = *(const UnicodeString*)t2.pointer; |
michael@0 | 135 | return a.compare(b); |
michael@0 | 136 | } |
michael@0 | 137 | |
michael@0 | 138 | //---------------------------------------------------------------- |
michael@0 | 139 | // Constructors &c |
michael@0 | 140 | //---------------------------------------------------------------- |
michael@0 | 141 | |
michael@0 | 142 | /** |
michael@0 | 143 | * Constructs an empty set. |
michael@0 | 144 | */ |
michael@0 | 145 | UnicodeSet::UnicodeSet() : |
michael@0 | 146 | len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), |
michael@0 | 147 | bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 148 | fFlags(0) |
michael@0 | 149 | { |
michael@0 | 150 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 151 | allocateStrings(status); |
michael@0 | 152 | if (U_FAILURE(status)) { |
michael@0 | 153 | return; |
michael@0 | 154 | } |
michael@0 | 155 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 156 | if(list!=NULL){ |
michael@0 | 157 | list[0] = UNICODESET_HIGH; |
michael@0 | 158 | } else { // If memory allocation failed, set to bogus state. |
michael@0 | 159 | setToBogus(); |
michael@0 | 160 | return; |
michael@0 | 161 | } |
michael@0 | 162 | _dbgct(this); |
michael@0 | 163 | } |
michael@0 | 164 | |
michael@0 | 165 | /** |
michael@0 | 166 | * Constructs a set containing the given range. If <code>end > |
michael@0 | 167 | * start</code> then an empty set is created. |
michael@0 | 168 | * |
michael@0 | 169 | * @param start first character, inclusive, of range |
michael@0 | 170 | * @param end last character, inclusive, of range |
michael@0 | 171 | */ |
michael@0 | 172 | UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : |
michael@0 | 173 | len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), |
michael@0 | 174 | bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 175 | fFlags(0) |
michael@0 | 176 | { |
michael@0 | 177 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 178 | allocateStrings(status); |
michael@0 | 179 | if (U_FAILURE(status)) { |
michael@0 | 180 | return; |
michael@0 | 181 | } |
michael@0 | 182 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 183 | if(list!=NULL){ |
michael@0 | 184 | list[0] = UNICODESET_HIGH; |
michael@0 | 185 | complement(start, end); |
michael@0 | 186 | } else { // If memory allocation failed, set to bogus state. |
michael@0 | 187 | setToBogus(); |
michael@0 | 188 | return; |
michael@0 | 189 | } |
michael@0 | 190 | _dbgct(this); |
michael@0 | 191 | } |
michael@0 | 192 | |
michael@0 | 193 | /** |
michael@0 | 194 | * Constructs a set that is identical to the given UnicodeSet. |
michael@0 | 195 | */ |
michael@0 | 196 | UnicodeSet::UnicodeSet(const UnicodeSet& o) : |
michael@0 | 197 | UnicodeFilter(o), |
michael@0 | 198 | len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), |
michael@0 | 199 | bmpSet(0), |
michael@0 | 200 | buffer(0), bufferCapacity(0), |
michael@0 | 201 | patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 202 | fFlags(0) |
michael@0 | 203 | { |
michael@0 | 204 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 205 | allocateStrings(status); |
michael@0 | 206 | if (U_FAILURE(status)) { |
michael@0 | 207 | return; |
michael@0 | 208 | } |
michael@0 | 209 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 210 | if(list!=NULL){ |
michael@0 | 211 | *this = o; |
michael@0 | 212 | } else { // If memory allocation failed, set to bogus state. |
michael@0 | 213 | setToBogus(); |
michael@0 | 214 | return; |
michael@0 | 215 | } |
michael@0 | 216 | _dbgct(this); |
michael@0 | 217 | } |
michael@0 | 218 | |
michael@0 | 219 | // Copy-construct as thawed. |
michael@0 | 220 | UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : |
michael@0 | 221 | UnicodeFilter(o), |
michael@0 | 222 | len(0), capacity(o.len + GROW_EXTRA), list(0), |
michael@0 | 223 | bmpSet(0), |
michael@0 | 224 | buffer(0), bufferCapacity(0), |
michael@0 | 225 | patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), |
michael@0 | 226 | fFlags(0) |
michael@0 | 227 | { |
michael@0 | 228 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 229 | allocateStrings(status); |
michael@0 | 230 | if (U_FAILURE(status)) { |
michael@0 | 231 | return; |
michael@0 | 232 | } |
michael@0 | 233 | list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); |
michael@0 | 234 | if(list!=NULL){ |
michael@0 | 235 | // *this = o except for bmpSet and stringSpan |
michael@0 | 236 | len = o.len; |
michael@0 | 237 | uprv_memcpy(list, o.list, len*sizeof(UChar32)); |
michael@0 | 238 | if (strings != NULL && o.strings != NULL) { |
michael@0 | 239 | strings->assign(*o.strings, cloneUnicodeString, status); |
michael@0 | 240 | } else { // Invalid strings. |
michael@0 | 241 | setToBogus(); |
michael@0 | 242 | return; |
michael@0 | 243 | } |
michael@0 | 244 | if (o.pat) { |
michael@0 | 245 | setPattern(UnicodeString(o.pat, o.patLen)); |
michael@0 | 246 | } |
michael@0 | 247 | } else { // If memory allocation failed, set to bogus state. |
michael@0 | 248 | setToBogus(); |
michael@0 | 249 | return; |
michael@0 | 250 | } |
michael@0 | 251 | _dbgct(this); |
michael@0 | 252 | } |
michael@0 | 253 | |
michael@0 | 254 | /** |
michael@0 | 255 | * Destructs the set. |
michael@0 | 256 | */ |
michael@0 | 257 | UnicodeSet::~UnicodeSet() { |
michael@0 | 258 | _dbgdt(this); // first! |
michael@0 | 259 | uprv_free(list); |
michael@0 | 260 | delete bmpSet; |
michael@0 | 261 | if (buffer) { |
michael@0 | 262 | uprv_free(buffer); |
michael@0 | 263 | } |
michael@0 | 264 | delete strings; |
michael@0 | 265 | delete stringSpan; |
michael@0 | 266 | releasePattern(); |
michael@0 | 267 | } |
michael@0 | 268 | |
michael@0 | 269 | /** |
michael@0 | 270 | * Assigns this object to be a copy of another. |
michael@0 | 271 | */ |
michael@0 | 272 | UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) { |
michael@0 | 273 | if (this == &o) { |
michael@0 | 274 | return *this; |
michael@0 | 275 | } |
michael@0 | 276 | if (isFrozen()) { |
michael@0 | 277 | return *this; |
michael@0 | 278 | } |
michael@0 | 279 | if (o.isBogus()) { |
michael@0 | 280 | setToBogus(); |
michael@0 | 281 | return *this; |
michael@0 | 282 | } |
michael@0 | 283 | UErrorCode ec = U_ZERO_ERROR; |
michael@0 | 284 | ensureCapacity(o.len, ec); |
michael@0 | 285 | if (U_FAILURE(ec)) { |
michael@0 | 286 | return *this; // There is no way to report this error :-( |
michael@0 | 287 | } |
michael@0 | 288 | len = o.len; |
michael@0 | 289 | uprv_memcpy(list, o.list, len*sizeof(UChar32)); |
michael@0 | 290 | if (o.bmpSet == NULL) { |
michael@0 | 291 | bmpSet = NULL; |
michael@0 | 292 | } else { |
michael@0 | 293 | bmpSet = new BMPSet(*o.bmpSet, list, len); |
michael@0 | 294 | if (bmpSet == NULL) { // Check for memory allocation error. |
michael@0 | 295 | setToBogus(); |
michael@0 | 296 | return *this; |
michael@0 | 297 | } |
michael@0 | 298 | } |
michael@0 | 299 | if (strings != NULL && o.strings != NULL) { |
michael@0 | 300 | strings->assign(*o.strings, cloneUnicodeString, ec); |
michael@0 | 301 | } else { // Invalid strings. |
michael@0 | 302 | setToBogus(); |
michael@0 | 303 | return *this; |
michael@0 | 304 | } |
michael@0 | 305 | if (o.stringSpan == NULL) { |
michael@0 | 306 | stringSpan = NULL; |
michael@0 | 307 | } else { |
michael@0 | 308 | stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); |
michael@0 | 309 | if (stringSpan == NULL) { // Check for memory allocation error. |
michael@0 | 310 | setToBogus(); |
michael@0 | 311 | return *this; |
michael@0 | 312 | } |
michael@0 | 313 | } |
michael@0 | 314 | releasePattern(); |
michael@0 | 315 | if (o.pat) { |
michael@0 | 316 | setPattern(UnicodeString(o.pat, o.patLen)); |
michael@0 | 317 | } |
michael@0 | 318 | return *this; |
michael@0 | 319 | } |
michael@0 | 320 | |
michael@0 | 321 | /** |
michael@0 | 322 | * Returns a copy of this object. All UnicodeMatcher objects have |
michael@0 | 323 | * to support cloning in order to allow classes using |
michael@0 | 324 | * UnicodeMatchers, such as Transliterator, to implement cloning. |
michael@0 | 325 | */ |
michael@0 | 326 | UnicodeFunctor* UnicodeSet::clone() const { |
michael@0 | 327 | return new UnicodeSet(*this); |
michael@0 | 328 | } |
michael@0 | 329 | |
michael@0 | 330 | UnicodeFunctor *UnicodeSet::cloneAsThawed() const { |
michael@0 | 331 | return new UnicodeSet(*this, TRUE); |
michael@0 | 332 | } |
michael@0 | 333 | |
michael@0 | 334 | /** |
michael@0 | 335 | * Compares the specified object with this set for equality. Returns |
michael@0 | 336 | * <tt>true</tt> if the two sets |
michael@0 | 337 | * have the same size, and every member of the specified set is |
michael@0 | 338 | * contained in this set (or equivalently, every member of this set is |
michael@0 | 339 | * contained in the specified set). |
michael@0 | 340 | * |
michael@0 | 341 | * @param o set to be compared for equality with this set. |
michael@0 | 342 | * @return <tt>true</tt> if the specified set is equal to this set. |
michael@0 | 343 | */ |
michael@0 | 344 | UBool UnicodeSet::operator==(const UnicodeSet& o) const { |
michael@0 | 345 | if (len != o.len) return FALSE; |
michael@0 | 346 | for (int32_t i = 0; i < len; ++i) { |
michael@0 | 347 | if (list[i] != o.list[i]) return FALSE; |
michael@0 | 348 | } |
michael@0 | 349 | if (*strings != *o.strings) return FALSE; |
michael@0 | 350 | return TRUE; |
michael@0 | 351 | } |
michael@0 | 352 | |
michael@0 | 353 | /** |
michael@0 | 354 | * Returns the hash code value for this set. |
michael@0 | 355 | * |
michael@0 | 356 | * @return the hash code value for this set. |
michael@0 | 357 | * @see Object#hashCode() |
michael@0 | 358 | */ |
michael@0 | 359 | int32_t UnicodeSet::hashCode(void) const { |
michael@0 | 360 | int32_t result = len; |
michael@0 | 361 | for (int32_t i = 0; i < len; ++i) { |
michael@0 | 362 | result *= 1000003; |
michael@0 | 363 | result += list[i]; |
michael@0 | 364 | } |
michael@0 | 365 | return result; |
michael@0 | 366 | } |
michael@0 | 367 | |
michael@0 | 368 | //---------------------------------------------------------------- |
michael@0 | 369 | // Public API |
michael@0 | 370 | //---------------------------------------------------------------- |
michael@0 | 371 | |
michael@0 | 372 | /** |
michael@0 | 373 | * Returns the number of elements in this set (its cardinality), |
michael@0 | 374 | * Note than the elements of a set may include both individual |
michael@0 | 375 | * codepoints and strings. |
michael@0 | 376 | * |
michael@0 | 377 | * @return the number of elements in this set (its cardinality). |
michael@0 | 378 | */ |
michael@0 | 379 | int32_t UnicodeSet::size(void) const { |
michael@0 | 380 | int32_t n = 0; |
michael@0 | 381 | int32_t count = getRangeCount(); |
michael@0 | 382 | for (int32_t i = 0; i < count; ++i) { |
michael@0 | 383 | n += getRangeEnd(i) - getRangeStart(i) + 1; |
michael@0 | 384 | } |
michael@0 | 385 | return n + strings->size(); |
michael@0 | 386 | } |
michael@0 | 387 | |
michael@0 | 388 | /** |
michael@0 | 389 | * Returns <tt>true</tt> if this set contains no elements. |
michael@0 | 390 | * |
michael@0 | 391 | * @return <tt>true</tt> if this set contains no elements. |
michael@0 | 392 | */ |
michael@0 | 393 | UBool UnicodeSet::isEmpty(void) const { |
michael@0 | 394 | return len == 1 && strings->size() == 0; |
michael@0 | 395 | } |
michael@0 | 396 | |
michael@0 | 397 | /** |
michael@0 | 398 | * Returns true if this set contains the given character. |
michael@0 | 399 | * @param c character to be checked for containment |
michael@0 | 400 | * @return true if the test condition is met |
michael@0 | 401 | */ |
michael@0 | 402 | UBool UnicodeSet::contains(UChar32 c) const { |
michael@0 | 403 | // Set i to the index of the start item greater than ch |
michael@0 | 404 | // We know we will terminate without length test! |
michael@0 | 405 | // LATER: for large sets, add binary search |
michael@0 | 406 | //int32_t i = -1; |
michael@0 | 407 | //for (;;) { |
michael@0 | 408 | // if (c < list[++i]) break; |
michael@0 | 409 | //} |
michael@0 | 410 | if (bmpSet != NULL) { |
michael@0 | 411 | return bmpSet->contains(c); |
michael@0 | 412 | } |
michael@0 | 413 | if (stringSpan != NULL) { |
michael@0 | 414 | return stringSpan->contains(c); |
michael@0 | 415 | } |
michael@0 | 416 | if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound |
michael@0 | 417 | return FALSE; |
michael@0 | 418 | } |
michael@0 | 419 | int32_t i = findCodePoint(c); |
michael@0 | 420 | return (UBool)(i & 1); // return true if odd |
michael@0 | 421 | } |
michael@0 | 422 | |
michael@0 | 423 | /** |
michael@0 | 424 | * Returns the smallest value i such that c < list[i]. Caller |
michael@0 | 425 | * must ensure that c is a legal value or this method will enter |
michael@0 | 426 | * an infinite loop. This method performs a binary search. |
michael@0 | 427 | * @param c a character in the range MIN_VALUE..MAX_VALUE |
michael@0 | 428 | * inclusive |
michael@0 | 429 | * @return the smallest integer i in the range 0..len-1, |
michael@0 | 430 | * inclusive, such that c < list[i] |
michael@0 | 431 | */ |
michael@0 | 432 | int32_t UnicodeSet::findCodePoint(UChar32 c) const { |
michael@0 | 433 | /* Examples: |
michael@0 | 434 | findCodePoint(c) |
michael@0 | 435 | set list[] c=0 1 3 4 7 8 |
michael@0 | 436 | === ============== =========== |
michael@0 | 437 | [] [110000] 0 0 0 0 0 0 |
michael@0 | 438 | [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 |
michael@0 | 439 | [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 |
michael@0 | 440 | [:Any:] [0, 110000] 1 1 1 1 1 1 |
michael@0 | 441 | */ |
michael@0 | 442 | |
michael@0 | 443 | // Return the smallest i such that c < list[i]. Assume |
michael@0 | 444 | // list[len - 1] == HIGH and that c is legal (0..HIGH-1). |
michael@0 | 445 | if (c < list[0]) |
michael@0 | 446 | return 0; |
michael@0 | 447 | // High runner test. c is often after the last range, so an |
michael@0 | 448 | // initial check for this condition pays off. |
michael@0 | 449 | int32_t lo = 0; |
michael@0 | 450 | int32_t hi = len - 1; |
michael@0 | 451 | if (lo >= hi || c >= list[hi-1]) |
michael@0 | 452 | return hi; |
michael@0 | 453 | // invariant: c >= list[lo] |
michael@0 | 454 | // invariant: c < list[hi] |
michael@0 | 455 | for (;;) { |
michael@0 | 456 | int32_t i = (lo + hi) >> 1; |
michael@0 | 457 | if (i == lo) { |
michael@0 | 458 | break; // Found! |
michael@0 | 459 | } else if (c < list[i]) { |
michael@0 | 460 | hi = i; |
michael@0 | 461 | } else { |
michael@0 | 462 | lo = i; |
michael@0 | 463 | } |
michael@0 | 464 | } |
michael@0 | 465 | return hi; |
michael@0 | 466 | } |
michael@0 | 467 | |
michael@0 | 468 | /** |
michael@0 | 469 | * Returns true if this set contains every character |
michael@0 | 470 | * of the given range. |
michael@0 | 471 | * @param start first character, inclusive, of the range |
michael@0 | 472 | * @param end last character, inclusive, of the range |
michael@0 | 473 | * @return true if the test condition is met |
michael@0 | 474 | */ |
michael@0 | 475 | UBool UnicodeSet::contains(UChar32 start, UChar32 end) const { |
michael@0 | 476 | //int32_t i = -1; |
michael@0 | 477 | //for (;;) { |
michael@0 | 478 | // if (start < list[++i]) break; |
michael@0 | 479 | //} |
michael@0 | 480 | int32_t i = findCodePoint(start); |
michael@0 | 481 | return ((i & 1) != 0 && end < list[i]); |
michael@0 | 482 | } |
michael@0 | 483 | |
michael@0 | 484 | /** |
michael@0 | 485 | * Returns <tt>true</tt> if this set contains the given |
michael@0 | 486 | * multicharacter string. |
michael@0 | 487 | * @param s string to be checked for containment |
michael@0 | 488 | * @return <tt>true</tt> if this set contains the specified string |
michael@0 | 489 | */ |
michael@0 | 490 | UBool UnicodeSet::contains(const UnicodeString& s) const { |
michael@0 | 491 | if (s.length() == 0) return FALSE; |
michael@0 | 492 | int32_t cp = getSingleCP(s); |
michael@0 | 493 | if (cp < 0) { |
michael@0 | 494 | return strings->contains((void*) &s); |
michael@0 | 495 | } else { |
michael@0 | 496 | return contains((UChar32) cp); |
michael@0 | 497 | } |
michael@0 | 498 | } |
michael@0 | 499 | |
michael@0 | 500 | /** |
michael@0 | 501 | * Returns true if this set contains all the characters and strings |
michael@0 | 502 | * of the given set. |
michael@0 | 503 | * @param c set to be checked for containment |
michael@0 | 504 | * @return true if the test condition is met |
michael@0 | 505 | */ |
michael@0 | 506 | UBool UnicodeSet::containsAll(const UnicodeSet& c) const { |
michael@0 | 507 | // The specified set is a subset if all of its pairs are contained in |
michael@0 | 508 | // this set. It's possible to code this more efficiently in terms of |
michael@0 | 509 | // direct manipulation of the inversion lists if the need arises. |
michael@0 | 510 | int32_t n = c.getRangeCount(); |
michael@0 | 511 | for (int i=0; i<n; ++i) { |
michael@0 | 512 | if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) { |
michael@0 | 513 | return FALSE; |
michael@0 | 514 | } |
michael@0 | 515 | } |
michael@0 | 516 | if (!strings->containsAll(*c.strings)) return FALSE; |
michael@0 | 517 | return TRUE; |
michael@0 | 518 | } |
michael@0 | 519 | |
michael@0 | 520 | /** |
michael@0 | 521 | * Returns true if this set contains all the characters |
michael@0 | 522 | * of the given string. |
michael@0 | 523 | * @param s string containing characters to be checked for containment |
michael@0 | 524 | * @return true if the test condition is met |
michael@0 | 525 | */ |
michael@0 | 526 | UBool UnicodeSet::containsAll(const UnicodeString& s) const { |
michael@0 | 527 | return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) == |
michael@0 | 528 | s.length()); |
michael@0 | 529 | } |
michael@0 | 530 | |
michael@0 | 531 | /** |
michael@0 | 532 | * Returns true if this set contains none of the characters |
michael@0 | 533 | * of the given range. |
michael@0 | 534 | * @param start first character, inclusive, of the range |
michael@0 | 535 | * @param end last character, inclusive, of the range |
michael@0 | 536 | * @return true if the test condition is met |
michael@0 | 537 | */ |
michael@0 | 538 | UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const { |
michael@0 | 539 | //int32_t i = -1; |
michael@0 | 540 | //for (;;) { |
michael@0 | 541 | // if (start < list[++i]) break; |
michael@0 | 542 | //} |
michael@0 | 543 | int32_t i = findCodePoint(start); |
michael@0 | 544 | return ((i & 1) == 0 && end < list[i]); |
michael@0 | 545 | } |
michael@0 | 546 | |
michael@0 | 547 | /** |
michael@0 | 548 | * Returns true if this set contains none of the characters and strings |
michael@0 | 549 | * of the given set. |
michael@0 | 550 | * @param c set to be checked for containment |
michael@0 | 551 | * @return true if the test condition is met |
michael@0 | 552 | */ |
michael@0 | 553 | UBool UnicodeSet::containsNone(const UnicodeSet& c) const { |
michael@0 | 554 | // The specified set is a subset if all of its pairs are contained in |
michael@0 | 555 | // this set. It's possible to code this more efficiently in terms of |
michael@0 | 556 | // direct manipulation of the inversion lists if the need arises. |
michael@0 | 557 | int32_t n = c.getRangeCount(); |
michael@0 | 558 | for (int32_t i=0; i<n; ++i) { |
michael@0 | 559 | if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) { |
michael@0 | 560 | return FALSE; |
michael@0 | 561 | } |
michael@0 | 562 | } |
michael@0 | 563 | if (!strings->containsNone(*c.strings)) return FALSE; |
michael@0 | 564 | return TRUE; |
michael@0 | 565 | } |
michael@0 | 566 | |
michael@0 | 567 | /** |
michael@0 | 568 | * Returns true if this set contains none of the characters |
michael@0 | 569 | * of the given string. |
michael@0 | 570 | * @param s string containing characters to be checked for containment |
michael@0 | 571 | * @return true if the test condition is met |
michael@0 | 572 | */ |
michael@0 | 573 | UBool UnicodeSet::containsNone(const UnicodeString& s) const { |
michael@0 | 574 | return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) == |
michael@0 | 575 | s.length()); |
michael@0 | 576 | } |
michael@0 | 577 | |
michael@0 | 578 | /** |
michael@0 | 579 | * Returns <tt>true</tt> if this set contains any character whose low byte |
michael@0 | 580 | * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for |
michael@0 | 581 | * indexing. |
michael@0 | 582 | */ |
michael@0 | 583 | UBool UnicodeSet::matchesIndexValue(uint8_t v) const { |
michael@0 | 584 | /* The index value v, in the range [0,255], is contained in this set if |
michael@0 | 585 | * it is contained in any pair of this set. Pairs either have the high |
michael@0 | 586 | * bytes equal, or unequal. If the high bytes are equal, then we have |
michael@0 | 587 | * aaxx..aayy, where aa is the high byte. Then v is contained if xx <= |
michael@0 | 588 | * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa. |
michael@0 | 589 | * Then v is contained if xx <= v || v <= yy. (This is identical to the |
michael@0 | 590 | * time zone month containment logic.) |
michael@0 | 591 | */ |
michael@0 | 592 | int32_t i; |
michael@0 | 593 | int32_t rangeCount=getRangeCount(); |
michael@0 | 594 | for (i=0; i<rangeCount; ++i) { |
michael@0 | 595 | UChar32 low = getRangeStart(i); |
michael@0 | 596 | UChar32 high = getRangeEnd(i); |
michael@0 | 597 | if ((low & ~0xFF) == (high & ~0xFF)) { |
michael@0 | 598 | if ((low & 0xFF) <= v && v <= (high & 0xFF)) { |
michael@0 | 599 | return TRUE; |
michael@0 | 600 | } |
michael@0 | 601 | } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) { |
michael@0 | 602 | return TRUE; |
michael@0 | 603 | } |
michael@0 | 604 | } |
michael@0 | 605 | if (strings->size() != 0) { |
michael@0 | 606 | for (i=0; i<strings->size(); ++i) { |
michael@0 | 607 | const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); |
michael@0 | 608 | //if (s.length() == 0) { |
michael@0 | 609 | // // Empty strings match everything |
michael@0 | 610 | // return TRUE; |
michael@0 | 611 | //} |
michael@0 | 612 | // assert(s.length() != 0); // We enforce this elsewhere |
michael@0 | 613 | UChar32 c = s.char32At(0); |
michael@0 | 614 | if ((c & 0xFF) == v) { |
michael@0 | 615 | return TRUE; |
michael@0 | 616 | } |
michael@0 | 617 | } |
michael@0 | 618 | } |
michael@0 | 619 | return FALSE; |
michael@0 | 620 | } |
michael@0 | 621 | |
michael@0 | 622 | /** |
michael@0 | 623 | * Implementation of UnicodeMatcher::matches(). Always matches the |
michael@0 | 624 | * longest possible multichar string. |
michael@0 | 625 | */ |
michael@0 | 626 | UMatchDegree UnicodeSet::matches(const Replaceable& text, |
michael@0 | 627 | int32_t& offset, |
michael@0 | 628 | int32_t limit, |
michael@0 | 629 | UBool incremental) { |
michael@0 | 630 | if (offset == limit) { |
michael@0 | 631 | // Strings, if any, have length != 0, so we don't worry |
michael@0 | 632 | // about them here. If we ever allow zero-length strings |
michael@0 | 633 | // we much check for them here. |
michael@0 | 634 | if (contains(U_ETHER)) { |
michael@0 | 635 | return incremental ? U_PARTIAL_MATCH : U_MATCH; |
michael@0 | 636 | } else { |
michael@0 | 637 | return U_MISMATCH; |
michael@0 | 638 | } |
michael@0 | 639 | } else { |
michael@0 | 640 | if (strings->size() != 0) { // try strings first |
michael@0 | 641 | |
michael@0 | 642 | // might separate forward and backward loops later |
michael@0 | 643 | // for now they are combined |
michael@0 | 644 | |
michael@0 | 645 | // TODO Improve efficiency of this, at least in the forward |
michael@0 | 646 | // direction, if not in both. In the forward direction we |
michael@0 | 647 | // can assume the strings are sorted. |
michael@0 | 648 | |
michael@0 | 649 | int32_t i; |
michael@0 | 650 | UBool forward = offset < limit; |
michael@0 | 651 | |
michael@0 | 652 | // firstChar is the leftmost char to match in the |
michael@0 | 653 | // forward direction or the rightmost char to match in |
michael@0 | 654 | // the reverse direction. |
michael@0 | 655 | UChar firstChar = text.charAt(offset); |
michael@0 | 656 | |
michael@0 | 657 | // If there are multiple strings that can match we |
michael@0 | 658 | // return the longest match. |
michael@0 | 659 | int32_t highWaterLength = 0; |
michael@0 | 660 | |
michael@0 | 661 | for (i=0; i<strings->size(); ++i) { |
michael@0 | 662 | const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i); |
michael@0 | 663 | |
michael@0 | 664 | //if (trial.length() == 0) { |
michael@0 | 665 | // return U_MATCH; // null-string always matches |
michael@0 | 666 | //} |
michael@0 | 667 | // assert(trial.length() != 0); // We ensure this elsewhere |
michael@0 | 668 | |
michael@0 | 669 | UChar c = trial.charAt(forward ? 0 : trial.length() - 1); |
michael@0 | 670 | |
michael@0 | 671 | // Strings are sorted, so we can optimize in the |
michael@0 | 672 | // forward direction. |
michael@0 | 673 | if (forward && c > firstChar) break; |
michael@0 | 674 | if (c != firstChar) continue; |
michael@0 | 675 | |
michael@0 | 676 | int32_t matchLen = matchRest(text, offset, limit, trial); |
michael@0 | 677 | |
michael@0 | 678 | if (incremental) { |
michael@0 | 679 | int32_t maxLen = forward ? limit-offset : offset-limit; |
michael@0 | 680 | if (matchLen == maxLen) { |
michael@0 | 681 | // We have successfully matched but only up to limit. |
michael@0 | 682 | return U_PARTIAL_MATCH; |
michael@0 | 683 | } |
michael@0 | 684 | } |
michael@0 | 685 | |
michael@0 | 686 | if (matchLen == trial.length()) { |
michael@0 | 687 | // We have successfully matched the whole string. |
michael@0 | 688 | if (matchLen > highWaterLength) { |
michael@0 | 689 | highWaterLength = matchLen; |
michael@0 | 690 | } |
michael@0 | 691 | // In the forward direction we know strings |
michael@0 | 692 | // are sorted so we can bail early. |
michael@0 | 693 | if (forward && matchLen < highWaterLength) { |
michael@0 | 694 | break; |
michael@0 | 695 | } |
michael@0 | 696 | continue; |
michael@0 | 697 | } |
michael@0 | 698 | } |
michael@0 | 699 | |
michael@0 | 700 | // We've checked all strings without a partial match. |
michael@0 | 701 | // If we have full matches, return the longest one. |
michael@0 | 702 | if (highWaterLength != 0) { |
michael@0 | 703 | offset += forward ? highWaterLength : -highWaterLength; |
michael@0 | 704 | return U_MATCH; |
michael@0 | 705 | } |
michael@0 | 706 | } |
michael@0 | 707 | return UnicodeFilter::matches(text, offset, limit, incremental); |
michael@0 | 708 | } |
michael@0 | 709 | } |
michael@0 | 710 | |
michael@0 | 711 | /** |
michael@0 | 712 | * Returns the longest match for s in text at the given position. |
michael@0 | 713 | * If limit > start then match forward from start+1 to limit |
michael@0 | 714 | * matching all characters except s.charAt(0). If limit < start, |
michael@0 | 715 | * go backward starting from start-1 matching all characters |
michael@0 | 716 | * except s.charAt(s.length()-1). This method assumes that the |
michael@0 | 717 | * first character, text.charAt(start), matches s, so it does not |
michael@0 | 718 | * check it. |
michael@0 | 719 | * @param text the text to match |
michael@0 | 720 | * @param start the first character to match. In the forward |
michael@0 | 721 | * direction, text.charAt(start) is matched against s.charAt(0). |
michael@0 | 722 | * In the reverse direction, it is matched against |
michael@0 | 723 | * s.charAt(s.length()-1). |
michael@0 | 724 | * @param limit the limit offset for matching, either last+1 in |
michael@0 | 725 | * the forward direction, or last-1 in the reverse direction, |
michael@0 | 726 | * where last is the index of the last character to match. |
michael@0 | 727 | * @return If part of s matches up to the limit, return |limit - |
michael@0 | 728 | * start|. If all of s matches before reaching the limit, return |
michael@0 | 729 | * s.length(). If there is a mismatch between s and text, return |
michael@0 | 730 | * 0 |
michael@0 | 731 | */ |
michael@0 | 732 | int32_t UnicodeSet::matchRest(const Replaceable& text, |
michael@0 | 733 | int32_t start, int32_t limit, |
michael@0 | 734 | const UnicodeString& s) { |
michael@0 | 735 | int32_t i; |
michael@0 | 736 | int32_t maxLen; |
michael@0 | 737 | int32_t slen = s.length(); |
michael@0 | 738 | if (start < limit) { |
michael@0 | 739 | maxLen = limit - start; |
michael@0 | 740 | if (maxLen > slen) maxLen = slen; |
michael@0 | 741 | for (i = 1; i < maxLen; ++i) { |
michael@0 | 742 | if (text.charAt(start + i) != s.charAt(i)) return 0; |
michael@0 | 743 | } |
michael@0 | 744 | } else { |
michael@0 | 745 | maxLen = start - limit; |
michael@0 | 746 | if (maxLen > slen) maxLen = slen; |
michael@0 | 747 | --slen; // <=> slen = s.length() - 1; |
michael@0 | 748 | for (i = 1; i < maxLen; ++i) { |
michael@0 | 749 | if (text.charAt(start - i) != s.charAt(slen - i)) return 0; |
michael@0 | 750 | } |
michael@0 | 751 | } |
michael@0 | 752 | return maxLen; |
michael@0 | 753 | } |
michael@0 | 754 | |
michael@0 | 755 | /** |
michael@0 | 756 | * Implement of UnicodeMatcher |
michael@0 | 757 | */ |
michael@0 | 758 | void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const { |
michael@0 | 759 | toUnionTo.addAll(*this); |
michael@0 | 760 | } |
michael@0 | 761 | |
michael@0 | 762 | /** |
michael@0 | 763 | * Returns the index of the given character within this set, where |
michael@0 | 764 | * the set is ordered by ascending code point. If the character |
michael@0 | 765 | * is not in this set, return -1. The inverse of this method is |
michael@0 | 766 | * <code>charAt()</code>. |
michael@0 | 767 | * @return an index from 0..size()-1, or -1 |
michael@0 | 768 | */ |
michael@0 | 769 | int32_t UnicodeSet::indexOf(UChar32 c) const { |
michael@0 | 770 | if (c < MIN_VALUE || c > MAX_VALUE) { |
michael@0 | 771 | return -1; |
michael@0 | 772 | } |
michael@0 | 773 | int32_t i = 0; |
michael@0 | 774 | int32_t n = 0; |
michael@0 | 775 | for (;;) { |
michael@0 | 776 | UChar32 start = list[i++]; |
michael@0 | 777 | if (c < start) { |
michael@0 | 778 | return -1; |
michael@0 | 779 | } |
michael@0 | 780 | UChar32 limit = list[i++]; |
michael@0 | 781 | if (c < limit) { |
michael@0 | 782 | return n + c - start; |
michael@0 | 783 | } |
michael@0 | 784 | n += limit - start; |
michael@0 | 785 | } |
michael@0 | 786 | } |
michael@0 | 787 | |
michael@0 | 788 | /** |
michael@0 | 789 | * Returns the character at the given index within this set, where |
michael@0 | 790 | * the set is ordered by ascending code point. If the index is |
michael@0 | 791 | * out of range, return (UChar32)-1. The inverse of this method is |
michael@0 | 792 | * <code>indexOf()</code>. |
michael@0 | 793 | * @param index an index from 0..size()-1 |
michael@0 | 794 | * @return the character at the given index, or (UChar32)-1. |
michael@0 | 795 | */ |
michael@0 | 796 | UChar32 UnicodeSet::charAt(int32_t index) const { |
michael@0 | 797 | if (index >= 0) { |
michael@0 | 798 | // len2 is the largest even integer <= len, that is, it is len |
michael@0 | 799 | // for even values and len-1 for odd values. With odd values |
michael@0 | 800 | // the last entry is UNICODESET_HIGH. |
michael@0 | 801 | int32_t len2 = len & ~1; |
michael@0 | 802 | for (int32_t i=0; i < len2;) { |
michael@0 | 803 | UChar32 start = list[i++]; |
michael@0 | 804 | int32_t count = list[i++] - start; |
michael@0 | 805 | if (index < count) { |
michael@0 | 806 | return (UChar32)(start + index); |
michael@0 | 807 | } |
michael@0 | 808 | index -= count; |
michael@0 | 809 | } |
michael@0 | 810 | } |
michael@0 | 811 | return (UChar32)-1; |
michael@0 | 812 | } |
michael@0 | 813 | |
michael@0 | 814 | /** |
michael@0 | 815 | * Make this object represent the range <code>start - end</code>. |
michael@0 | 816 | * If <code>end > start</code> then this object is set to an |
michael@0 | 817 | * an empty range. |
michael@0 | 818 | * |
michael@0 | 819 | * @param start first character in the set, inclusive |
michael@0 | 820 | * @rparam end last character in the set, inclusive |
michael@0 | 821 | */ |
michael@0 | 822 | UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { |
michael@0 | 823 | clear(); |
michael@0 | 824 | complement(start, end); |
michael@0 | 825 | return *this; |
michael@0 | 826 | } |
michael@0 | 827 | |
michael@0 | 828 | /** |
michael@0 | 829 | * Adds the specified range to this set if it is not already |
michael@0 | 830 | * present. If this set already contains the specified range, |
michael@0 | 831 | * the call leaves this set unchanged. If <code>end > start</code> |
michael@0 | 832 | * then an empty range is added, leaving the set unchanged. |
michael@0 | 833 | * |
michael@0 | 834 | * @param start first character, inclusive, of range to be added |
michael@0 | 835 | * to this set. |
michael@0 | 836 | * @param end last character, inclusive, of range to be added |
michael@0 | 837 | * to this set. |
michael@0 | 838 | */ |
michael@0 | 839 | UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { |
michael@0 | 840 | if (pinCodePoint(start) < pinCodePoint(end)) { |
michael@0 | 841 | UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; |
michael@0 | 842 | add(range, 2, 0); |
michael@0 | 843 | } else if (start == end) { |
michael@0 | 844 | add(start); |
michael@0 | 845 | } |
michael@0 | 846 | return *this; |
michael@0 | 847 | } |
michael@0 | 848 | |
michael@0 | 849 | // #define DEBUG_US_ADD |
michael@0 | 850 | |
michael@0 | 851 | #ifdef DEBUG_US_ADD |
michael@0 | 852 | #include <stdio.h> |
michael@0 | 853 | void dump(UChar32 c) { |
michael@0 | 854 | if (c <= 0xFF) { |
michael@0 | 855 | printf("%c", (char)c); |
michael@0 | 856 | } else { |
michael@0 | 857 | printf("U+%04X", c); |
michael@0 | 858 | } |
michael@0 | 859 | } |
michael@0 | 860 | void dump(const UChar32* list, int32_t len) { |
michael@0 | 861 | printf("["); |
michael@0 | 862 | for (int32_t i=0; i<len; ++i) { |
michael@0 | 863 | if (i != 0) printf(", "); |
michael@0 | 864 | dump(list[i]); |
michael@0 | 865 | } |
michael@0 | 866 | printf("]"); |
michael@0 | 867 | } |
michael@0 | 868 | #endif |
michael@0 | 869 | |
michael@0 | 870 | /** |
michael@0 | 871 | * Adds the specified character to this set if it is not already |
michael@0 | 872 | * present. If this set already contains the specified character, |
michael@0 | 873 | * the call leaves this set unchanged. |
michael@0 | 874 | */ |
michael@0 | 875 | UnicodeSet& UnicodeSet::add(UChar32 c) { |
michael@0 | 876 | // find smallest i such that c < list[i] |
michael@0 | 877 | // if odd, then it is IN the set |
michael@0 | 878 | // if even, then it is OUT of the set |
michael@0 | 879 | int32_t i = findCodePoint(pinCodePoint(c)); |
michael@0 | 880 | |
michael@0 | 881 | // already in set? |
michael@0 | 882 | if ((i & 1) != 0 || isFrozen() || isBogus()) return *this; |
michael@0 | 883 | |
michael@0 | 884 | // HIGH is 0x110000 |
michael@0 | 885 | // assert(list[len-1] == HIGH); |
michael@0 | 886 | |
michael@0 | 887 | // empty = [HIGH] |
michael@0 | 888 | // [start_0, limit_0, start_1, limit_1, HIGH] |
michael@0 | 889 | |
michael@0 | 890 | // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] |
michael@0 | 891 | // ^ |
michael@0 | 892 | // list[i] |
michael@0 | 893 | |
michael@0 | 894 | // i == 0 means c is before the first range |
michael@0 | 895 | |
michael@0 | 896 | #ifdef DEBUG_US_ADD |
michael@0 | 897 | printf("Add of "); |
michael@0 | 898 | dump(c); |
michael@0 | 899 | printf(" found at %d", i); |
michael@0 | 900 | printf(": "); |
michael@0 | 901 | dump(list, len); |
michael@0 | 902 | printf(" => "); |
michael@0 | 903 | #endif |
michael@0 | 904 | |
michael@0 | 905 | if (c == list[i]-1) { |
michael@0 | 906 | // c is before start of next range |
michael@0 | 907 | list[i] = c; |
michael@0 | 908 | // if we touched the HIGH mark, then add a new one |
michael@0 | 909 | if (c == (UNICODESET_HIGH - 1)) { |
michael@0 | 910 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 911 | ensureCapacity(len+1, status); |
michael@0 | 912 | if (U_FAILURE(status)) { |
michael@0 | 913 | return *this; // There is no way to report this error :-( |
michael@0 | 914 | } |
michael@0 | 915 | list[len++] = UNICODESET_HIGH; |
michael@0 | 916 | } |
michael@0 | 917 | if (i > 0 && c == list[i-1]) { |
michael@0 | 918 | // collapse adjacent ranges |
michael@0 | 919 | |
michael@0 | 920 | // [..., start_k-1, c, c, limit_k, ..., HIGH] |
michael@0 | 921 | // ^ |
michael@0 | 922 | // list[i] |
michael@0 | 923 | |
michael@0 | 924 | //for (int32_t k=i-1; k<len-2; ++k) { |
michael@0 | 925 | // list[k] = list[k+2]; |
michael@0 | 926 | //} |
michael@0 | 927 | UChar32* dst = list + i - 1; |
michael@0 | 928 | UChar32* src = dst + 2; |
michael@0 | 929 | UChar32* srclimit = list + len; |
michael@0 | 930 | while (src < srclimit) *(dst++) = *(src++); |
michael@0 | 931 | |
michael@0 | 932 | len -= 2; |
michael@0 | 933 | } |
michael@0 | 934 | } |
michael@0 | 935 | |
michael@0 | 936 | else if (i > 0 && c == list[i-1]) { |
michael@0 | 937 | // c is after end of prior range |
michael@0 | 938 | list[i-1]++; |
michael@0 | 939 | // no need to check for collapse here |
michael@0 | 940 | } |
michael@0 | 941 | |
michael@0 | 942 | else { |
michael@0 | 943 | // At this point we know the new char is not adjacent to |
michael@0 | 944 | // any existing ranges, and it is not 10FFFF. |
michael@0 | 945 | |
michael@0 | 946 | |
michael@0 | 947 | // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH] |
michael@0 | 948 | // ^ |
michael@0 | 949 | // list[i] |
michael@0 | 950 | |
michael@0 | 951 | // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH] |
michael@0 | 952 | // ^ |
michael@0 | 953 | // list[i] |
michael@0 | 954 | |
michael@0 | 955 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 956 | ensureCapacity(len+2, status); |
michael@0 | 957 | if (U_FAILURE(status)) { |
michael@0 | 958 | return *this; // There is no way to report this error :-( |
michael@0 | 959 | } |
michael@0 | 960 | |
michael@0 | 961 | //for (int32_t k=len-1; k>=i; --k) { |
michael@0 | 962 | // list[k+2] = list[k]; |
michael@0 | 963 | //} |
michael@0 | 964 | UChar32* src = list + len; |
michael@0 | 965 | UChar32* dst = src + 2; |
michael@0 | 966 | UChar32* srclimit = list + i; |
michael@0 | 967 | while (src > srclimit) *(--dst) = *(--src); |
michael@0 | 968 | |
michael@0 | 969 | list[i] = c; |
michael@0 | 970 | list[i+1] = c+1; |
michael@0 | 971 | len += 2; |
michael@0 | 972 | } |
michael@0 | 973 | |
michael@0 | 974 | #ifdef DEBUG_US_ADD |
michael@0 | 975 | dump(list, len); |
michael@0 | 976 | printf("\n"); |
michael@0 | 977 | |
michael@0 | 978 | for (i=1; i<len; ++i) { |
michael@0 | 979 | if (list[i] <= list[i-1]) { |
michael@0 | 980 | // Corrupt array! |
michael@0 | 981 | printf("ERROR: list has been corrupted\n"); |
michael@0 | 982 | exit(1); |
michael@0 | 983 | } |
michael@0 | 984 | } |
michael@0 | 985 | #endif |
michael@0 | 986 | |
michael@0 | 987 | releasePattern(); |
michael@0 | 988 | return *this; |
michael@0 | 989 | } |
michael@0 | 990 | |
michael@0 | 991 | /** |
michael@0 | 992 | * Adds the specified multicharacter to this set if it is not already |
michael@0 | 993 | * present. If this set already contains the multicharacter, |
michael@0 | 994 | * the call leaves this set unchanged. |
michael@0 | 995 | * Thus "ch" => {"ch"} |
michael@0 | 996 | * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> |
michael@0 | 997 | * @param s the source string |
michael@0 | 998 | * @return the modified set, for chaining |
michael@0 | 999 | */ |
michael@0 | 1000 | UnicodeSet& UnicodeSet::add(const UnicodeString& s) { |
michael@0 | 1001 | if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
michael@0 | 1002 | int32_t cp = getSingleCP(s); |
michael@0 | 1003 | if (cp < 0) { |
michael@0 | 1004 | if (!strings->contains((void*) &s)) { |
michael@0 | 1005 | _add(s); |
michael@0 | 1006 | releasePattern(); |
michael@0 | 1007 | } |
michael@0 | 1008 | } else { |
michael@0 | 1009 | add((UChar32)cp); |
michael@0 | 1010 | } |
michael@0 | 1011 | return *this; |
michael@0 | 1012 | } |
michael@0 | 1013 | |
michael@0 | 1014 | /** |
michael@0 | 1015 | * Adds the given string, in order, to 'strings'. The given string |
michael@0 | 1016 | * must have been checked by the caller to not be empty and to not |
michael@0 | 1017 | * already be in 'strings'. |
michael@0 | 1018 | */ |
michael@0 | 1019 | void UnicodeSet::_add(const UnicodeString& s) { |
michael@0 | 1020 | if (isFrozen() || isBogus()) { |
michael@0 | 1021 | return; |
michael@0 | 1022 | } |
michael@0 | 1023 | UnicodeString* t = new UnicodeString(s); |
michael@0 | 1024 | if (t == NULL) { // Check for memory allocation error. |
michael@0 | 1025 | setToBogus(); |
michael@0 | 1026 | return; |
michael@0 | 1027 | } |
michael@0 | 1028 | UErrorCode ec = U_ZERO_ERROR; |
michael@0 | 1029 | strings->sortedInsert(t, compareUnicodeString, ec); |
michael@0 | 1030 | if (U_FAILURE(ec)) { |
michael@0 | 1031 | setToBogus(); |
michael@0 | 1032 | delete t; |
michael@0 | 1033 | } |
michael@0 | 1034 | } |
michael@0 | 1035 | |
michael@0 | 1036 | /** |
michael@0 | 1037 | * @return a code point IF the string consists of a single one. |
michael@0 | 1038 | * otherwise returns -1. |
michael@0 | 1039 | * @param string to test |
michael@0 | 1040 | */ |
michael@0 | 1041 | int32_t UnicodeSet::getSingleCP(const UnicodeString& s) { |
michael@0 | 1042 | //if (s.length() < 1) { |
michael@0 | 1043 | // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet"); |
michael@0 | 1044 | //} |
michael@0 | 1045 | if (s.length() > 2) return -1; |
michael@0 | 1046 | if (s.length() == 1) return s.charAt(0); |
michael@0 | 1047 | |
michael@0 | 1048 | // at this point, len = 2 |
michael@0 | 1049 | UChar32 cp = s.char32At(0); |
michael@0 | 1050 | if (cp > 0xFFFF) { // is surrogate pair |
michael@0 | 1051 | return cp; |
michael@0 | 1052 | } |
michael@0 | 1053 | return -1; |
michael@0 | 1054 | } |
michael@0 | 1055 | |
michael@0 | 1056 | /** |
michael@0 | 1057 | * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"} |
michael@0 | 1058 | * If this set already any particular character, it has no effect on that character. |
michael@0 | 1059 | * @param the source string |
michael@0 | 1060 | * @return the modified set, for chaining |
michael@0 | 1061 | */ |
michael@0 | 1062 | UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) { |
michael@0 | 1063 | UChar32 cp; |
michael@0 | 1064 | for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { |
michael@0 | 1065 | cp = s.char32At(i); |
michael@0 | 1066 | add(cp); |
michael@0 | 1067 | } |
michael@0 | 1068 | return *this; |
michael@0 | 1069 | } |
michael@0 | 1070 | |
michael@0 | 1071 | /** |
michael@0 | 1072 | * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} |
michael@0 | 1073 | * If this set already any particular character, it has no effect on that character. |
michael@0 | 1074 | * @param the source string |
michael@0 | 1075 | * @return the modified set, for chaining |
michael@0 | 1076 | */ |
michael@0 | 1077 | UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) { |
michael@0 | 1078 | UnicodeSet set; |
michael@0 | 1079 | set.addAll(s); |
michael@0 | 1080 | retainAll(set); |
michael@0 | 1081 | return *this; |
michael@0 | 1082 | } |
michael@0 | 1083 | |
michael@0 | 1084 | /** |
michael@0 | 1085 | * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"} |
michael@0 | 1086 | * If this set already any particular character, it has no effect on that character. |
michael@0 | 1087 | * @param the source string |
michael@0 | 1088 | * @return the modified set, for chaining |
michael@0 | 1089 | */ |
michael@0 | 1090 | UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) { |
michael@0 | 1091 | UnicodeSet set; |
michael@0 | 1092 | set.addAll(s); |
michael@0 | 1093 | complementAll(set); |
michael@0 | 1094 | return *this; |
michael@0 | 1095 | } |
michael@0 | 1096 | |
michael@0 | 1097 | /** |
michael@0 | 1098 | * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"} |
michael@0 | 1099 | * If this set already any particular character, it has no effect on that character. |
michael@0 | 1100 | * @param the source string |
michael@0 | 1101 | * @return the modified set, for chaining |
michael@0 | 1102 | */ |
michael@0 | 1103 | UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { |
michael@0 | 1104 | UnicodeSet set; |
michael@0 | 1105 | set.addAll(s); |
michael@0 | 1106 | removeAll(set); |
michael@0 | 1107 | return *this; |
michael@0 | 1108 | } |
michael@0 | 1109 | |
michael@0 | 1110 | UnicodeSet& UnicodeSet::removeAllStrings() { |
michael@0 | 1111 | strings->removeAllElements(); |
michael@0 | 1112 | return *this; |
michael@0 | 1113 | } |
michael@0 | 1114 | |
michael@0 | 1115 | |
michael@0 | 1116 | /** |
michael@0 | 1117 | * Makes a set from a multicharacter string. Thus "ch" => {"ch"} |
michael@0 | 1118 | * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> |
michael@0 | 1119 | * @param the source string |
michael@0 | 1120 | * @return a newly created set containing the given string |
michael@0 | 1121 | */ |
michael@0 | 1122 | UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) { |
michael@0 | 1123 | UnicodeSet *set = new UnicodeSet(); |
michael@0 | 1124 | if (set != NULL) { // Check for memory allocation error. |
michael@0 | 1125 | set->add(s); |
michael@0 | 1126 | } |
michael@0 | 1127 | return set; |
michael@0 | 1128 | } |
michael@0 | 1129 | |
michael@0 | 1130 | |
michael@0 | 1131 | /** |
michael@0 | 1132 | * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"} |
michael@0 | 1133 | * @param the source string |
michael@0 | 1134 | * @return a newly created set containing the given characters |
michael@0 | 1135 | */ |
michael@0 | 1136 | UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) { |
michael@0 | 1137 | UnicodeSet *set = new UnicodeSet(); |
michael@0 | 1138 | if (set != NULL) { // Check for memory allocation error. |
michael@0 | 1139 | set->addAll(s); |
michael@0 | 1140 | } |
michael@0 | 1141 | return set; |
michael@0 | 1142 | } |
michael@0 | 1143 | |
michael@0 | 1144 | /** |
michael@0 | 1145 | * Retain only the elements in this set that are contained in the |
michael@0 | 1146 | * specified range. If <code>end > start</code> then an empty range is |
michael@0 | 1147 | * retained, leaving the set empty. |
michael@0 | 1148 | * |
michael@0 | 1149 | * @param start first character, inclusive, of range to be retained |
michael@0 | 1150 | * to this set. |
michael@0 | 1151 | * @param end last character, inclusive, of range to be retained |
michael@0 | 1152 | * to this set. |
michael@0 | 1153 | */ |
michael@0 | 1154 | UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) { |
michael@0 | 1155 | if (pinCodePoint(start) <= pinCodePoint(end)) { |
michael@0 | 1156 | UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; |
michael@0 | 1157 | retain(range, 2, 0); |
michael@0 | 1158 | } else { |
michael@0 | 1159 | clear(); |
michael@0 | 1160 | } |
michael@0 | 1161 | return *this; |
michael@0 | 1162 | } |
michael@0 | 1163 | |
michael@0 | 1164 | UnicodeSet& UnicodeSet::retain(UChar32 c) { |
michael@0 | 1165 | return retain(c, c); |
michael@0 | 1166 | } |
michael@0 | 1167 | |
michael@0 | 1168 | /** |
michael@0 | 1169 | * Removes the specified range from this set if it is present. |
michael@0 | 1170 | * The set will not contain the specified range once the call |
michael@0 | 1171 | * returns. If <code>end > start</code> then an empty range is |
michael@0 | 1172 | * removed, leaving the set unchanged. |
michael@0 | 1173 | * |
michael@0 | 1174 | * @param start first character, inclusive, of range to be removed |
michael@0 | 1175 | * from this set. |
michael@0 | 1176 | * @param end last character, inclusive, of range to be removed |
michael@0 | 1177 | * from this set. |
michael@0 | 1178 | */ |
michael@0 | 1179 | UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) { |
michael@0 | 1180 | if (pinCodePoint(start) <= pinCodePoint(end)) { |
michael@0 | 1181 | UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; |
michael@0 | 1182 | retain(range, 2, 2); |
michael@0 | 1183 | } |
michael@0 | 1184 | return *this; |
michael@0 | 1185 | } |
michael@0 | 1186 | |
michael@0 | 1187 | /** |
michael@0 | 1188 | * Removes the specified character from this set if it is present. |
michael@0 | 1189 | * The set will not contain the specified range once the call |
michael@0 | 1190 | * returns. |
michael@0 | 1191 | */ |
michael@0 | 1192 | UnicodeSet& UnicodeSet::remove(UChar32 c) { |
michael@0 | 1193 | return remove(c, c); |
michael@0 | 1194 | } |
michael@0 | 1195 | |
michael@0 | 1196 | /** |
michael@0 | 1197 | * Removes the specified string from this set if it is present. |
michael@0 | 1198 | * The set will not contain the specified character once the call |
michael@0 | 1199 | * returns. |
michael@0 | 1200 | * @param the source string |
michael@0 | 1201 | * @return the modified set, for chaining |
michael@0 | 1202 | */ |
michael@0 | 1203 | UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { |
michael@0 | 1204 | if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
michael@0 | 1205 | int32_t cp = getSingleCP(s); |
michael@0 | 1206 | if (cp < 0) { |
michael@0 | 1207 | strings->removeElement((void*) &s); |
michael@0 | 1208 | releasePattern(); |
michael@0 | 1209 | } else { |
michael@0 | 1210 | remove((UChar32)cp, (UChar32)cp); |
michael@0 | 1211 | } |
michael@0 | 1212 | return *this; |
michael@0 | 1213 | } |
michael@0 | 1214 | |
michael@0 | 1215 | /** |
michael@0 | 1216 | * Complements the specified range in this set. Any character in |
michael@0 | 1217 | * the range will be removed if it is in this set, or will be |
michael@0 | 1218 | * added if it is not in this set. If <code>end > start</code> |
michael@0 | 1219 | * then an empty range is xor'ed, leaving the set unchanged. |
michael@0 | 1220 | * |
michael@0 | 1221 | * @param start first character, inclusive, of range to be removed |
michael@0 | 1222 | * from this set. |
michael@0 | 1223 | * @param end last character, inclusive, of range to be removed |
michael@0 | 1224 | * from this set. |
michael@0 | 1225 | */ |
michael@0 | 1226 | UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) { |
michael@0 | 1227 | if (isFrozen() || isBogus()) { |
michael@0 | 1228 | return *this; |
michael@0 | 1229 | } |
michael@0 | 1230 | if (pinCodePoint(start) <= pinCodePoint(end)) { |
michael@0 | 1231 | UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; |
michael@0 | 1232 | exclusiveOr(range, 2, 0); |
michael@0 | 1233 | } |
michael@0 | 1234 | releasePattern(); |
michael@0 | 1235 | return *this; |
michael@0 | 1236 | } |
michael@0 | 1237 | |
michael@0 | 1238 | UnicodeSet& UnicodeSet::complement(UChar32 c) { |
michael@0 | 1239 | return complement(c, c); |
michael@0 | 1240 | } |
michael@0 | 1241 | |
michael@0 | 1242 | /** |
michael@0 | 1243 | * This is equivalent to |
michael@0 | 1244 | * <code>complement(MIN_VALUE, MAX_VALUE)</code>. |
michael@0 | 1245 | */ |
michael@0 | 1246 | UnicodeSet& UnicodeSet::complement(void) { |
michael@0 | 1247 | if (isFrozen() || isBogus()) { |
michael@0 | 1248 | return *this; |
michael@0 | 1249 | } |
michael@0 | 1250 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1251 | if (list[0] == UNICODESET_LOW) { |
michael@0 | 1252 | ensureBufferCapacity(len-1, status); |
michael@0 | 1253 | if (U_FAILURE(status)) { |
michael@0 | 1254 | return *this; |
michael@0 | 1255 | } |
michael@0 | 1256 | uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32)); |
michael@0 | 1257 | --len; |
michael@0 | 1258 | } else { |
michael@0 | 1259 | ensureBufferCapacity(len+1, status); |
michael@0 | 1260 | if (U_FAILURE(status)) { |
michael@0 | 1261 | return *this; |
michael@0 | 1262 | } |
michael@0 | 1263 | uprv_memcpy(buffer + 1, list, len*sizeof(UChar32)); |
michael@0 | 1264 | buffer[0] = UNICODESET_LOW; |
michael@0 | 1265 | ++len; |
michael@0 | 1266 | } |
michael@0 | 1267 | swapBuffers(); |
michael@0 | 1268 | releasePattern(); |
michael@0 | 1269 | return *this; |
michael@0 | 1270 | } |
michael@0 | 1271 | |
michael@0 | 1272 | /** |
michael@0 | 1273 | * Complement the specified string in this set. |
michael@0 | 1274 | * The set will not contain the specified string once the call |
michael@0 | 1275 | * returns. |
michael@0 | 1276 | * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b> |
michael@0 | 1277 | * @param s the string to complement |
michael@0 | 1278 | * @return this object, for chaining |
michael@0 | 1279 | */ |
michael@0 | 1280 | UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { |
michael@0 | 1281 | if (s.length() == 0 || isFrozen() || isBogus()) return *this; |
michael@0 | 1282 | int32_t cp = getSingleCP(s); |
michael@0 | 1283 | if (cp < 0) { |
michael@0 | 1284 | if (strings->contains((void*) &s)) { |
michael@0 | 1285 | strings->removeElement((void*) &s); |
michael@0 | 1286 | } else { |
michael@0 | 1287 | _add(s); |
michael@0 | 1288 | } |
michael@0 | 1289 | releasePattern(); |
michael@0 | 1290 | } else { |
michael@0 | 1291 | complement((UChar32)cp, (UChar32)cp); |
michael@0 | 1292 | } |
michael@0 | 1293 | return *this; |
michael@0 | 1294 | } |
michael@0 | 1295 | |
michael@0 | 1296 | /** |
michael@0 | 1297 | * Adds all of the elements in the specified set to this set if |
michael@0 | 1298 | * they're not already present. This operation effectively |
michael@0 | 1299 | * modifies this set so that its value is the <i>union</i> of the two |
michael@0 | 1300 | * sets. The behavior of this operation is unspecified if the specified |
michael@0 | 1301 | * collection is modified while the operation is in progress. |
michael@0 | 1302 | * |
michael@0 | 1303 | * @param c set whose elements are to be added to this set. |
michael@0 | 1304 | * @see #add(char, char) |
michael@0 | 1305 | */ |
michael@0 | 1306 | UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { |
michael@0 | 1307 | if ( c.len>0 && c.list!=NULL ) { |
michael@0 | 1308 | add(c.list, c.len, 0); |
michael@0 | 1309 | } |
michael@0 | 1310 | |
michael@0 | 1311 | // Add strings in order |
michael@0 | 1312 | if ( c.strings!=NULL ) { |
michael@0 | 1313 | for (int32_t i=0; i<c.strings->size(); ++i) { |
michael@0 | 1314 | const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); |
michael@0 | 1315 | if (!strings->contains((void*) s)) { |
michael@0 | 1316 | _add(*s); |
michael@0 | 1317 | } |
michael@0 | 1318 | } |
michael@0 | 1319 | } |
michael@0 | 1320 | return *this; |
michael@0 | 1321 | } |
michael@0 | 1322 | |
michael@0 | 1323 | /** |
michael@0 | 1324 | * Retains only the elements in this set that are contained in the |
michael@0 | 1325 | * specified set. In other words, removes from this set all of |
michael@0 | 1326 | * its elements that are not contained in the specified set. This |
michael@0 | 1327 | * operation effectively modifies this set so that its value is |
michael@0 | 1328 | * the <i>intersection</i> of the two sets. |
michael@0 | 1329 | * |
michael@0 | 1330 | * @param c set that defines which elements this set will retain. |
michael@0 | 1331 | */ |
michael@0 | 1332 | UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { |
michael@0 | 1333 | if (isFrozen() || isBogus()) { |
michael@0 | 1334 | return *this; |
michael@0 | 1335 | } |
michael@0 | 1336 | retain(c.list, c.len, 0); |
michael@0 | 1337 | strings->retainAll(*c.strings); |
michael@0 | 1338 | return *this; |
michael@0 | 1339 | } |
michael@0 | 1340 | |
michael@0 | 1341 | /** |
michael@0 | 1342 | * Removes from this set all of its elements that are contained in the |
michael@0 | 1343 | * specified set. This operation effectively modifies this |
michael@0 | 1344 | * set so that its value is the <i>asymmetric set difference</i> of |
michael@0 | 1345 | * the two sets. |
michael@0 | 1346 | * |
michael@0 | 1347 | * @param c set that defines which elements will be removed from |
michael@0 | 1348 | * this set. |
michael@0 | 1349 | */ |
michael@0 | 1350 | UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { |
michael@0 | 1351 | if (isFrozen() || isBogus()) { |
michael@0 | 1352 | return *this; |
michael@0 | 1353 | } |
michael@0 | 1354 | retain(c.list, c.len, 2); |
michael@0 | 1355 | strings->removeAll(*c.strings); |
michael@0 | 1356 | return *this; |
michael@0 | 1357 | } |
michael@0 | 1358 | |
michael@0 | 1359 | /** |
michael@0 | 1360 | * Complements in this set all elements contained in the specified |
michael@0 | 1361 | * set. Any character in the other set will be removed if it is |
michael@0 | 1362 | * in this set, or will be added if it is not in this set. |
michael@0 | 1363 | * |
michael@0 | 1364 | * @param c set that defines which elements will be xor'ed from |
michael@0 | 1365 | * this set. |
michael@0 | 1366 | */ |
michael@0 | 1367 | UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { |
michael@0 | 1368 | if (isFrozen() || isBogus()) { |
michael@0 | 1369 | return *this; |
michael@0 | 1370 | } |
michael@0 | 1371 | exclusiveOr(c.list, c.len, 0); |
michael@0 | 1372 | |
michael@0 | 1373 | for (int32_t i=0; i<c.strings->size(); ++i) { |
michael@0 | 1374 | void* e = c.strings->elementAt(i); |
michael@0 | 1375 | if (!strings->removeElement(e)) { |
michael@0 | 1376 | _add(*(const UnicodeString*)e); |
michael@0 | 1377 | } |
michael@0 | 1378 | } |
michael@0 | 1379 | return *this; |
michael@0 | 1380 | } |
michael@0 | 1381 | |
michael@0 | 1382 | /** |
michael@0 | 1383 | * Removes all of the elements from this set. This set will be |
michael@0 | 1384 | * empty after this call returns. |
michael@0 | 1385 | */ |
michael@0 | 1386 | UnicodeSet& UnicodeSet::clear(void) { |
michael@0 | 1387 | if (isFrozen()) { |
michael@0 | 1388 | return *this; |
michael@0 | 1389 | } |
michael@0 | 1390 | if (list != NULL) { |
michael@0 | 1391 | list[0] = UNICODESET_HIGH; |
michael@0 | 1392 | } |
michael@0 | 1393 | len = 1; |
michael@0 | 1394 | releasePattern(); |
michael@0 | 1395 | if (strings != NULL) { |
michael@0 | 1396 | strings->removeAllElements(); |
michael@0 | 1397 | } |
michael@0 | 1398 | if (list != NULL && strings != NULL) { |
michael@0 | 1399 | // Remove bogus |
michael@0 | 1400 | fFlags = 0; |
michael@0 | 1401 | } |
michael@0 | 1402 | return *this; |
michael@0 | 1403 | } |
michael@0 | 1404 | |
michael@0 | 1405 | /** |
michael@0 | 1406 | * Iteration method that returns the number of ranges contained in |
michael@0 | 1407 | * this set. |
michael@0 | 1408 | * @see #getRangeStart |
michael@0 | 1409 | * @see #getRangeEnd |
michael@0 | 1410 | */ |
michael@0 | 1411 | int32_t UnicodeSet::getRangeCount() const { |
michael@0 | 1412 | return len/2; |
michael@0 | 1413 | } |
michael@0 | 1414 | |
michael@0 | 1415 | /** |
michael@0 | 1416 | * Iteration method that returns the first character in the |
michael@0 | 1417 | * specified range of this set. |
michael@0 | 1418 | * @see #getRangeCount |
michael@0 | 1419 | * @see #getRangeEnd |
michael@0 | 1420 | */ |
michael@0 | 1421 | UChar32 UnicodeSet::getRangeStart(int32_t index) const { |
michael@0 | 1422 | return list[index*2]; |
michael@0 | 1423 | } |
michael@0 | 1424 | |
michael@0 | 1425 | /** |
michael@0 | 1426 | * Iteration method that returns the last character in the |
michael@0 | 1427 | * specified range of this set. |
michael@0 | 1428 | * @see #getRangeStart |
michael@0 | 1429 | * @see #getRangeEnd |
michael@0 | 1430 | */ |
michael@0 | 1431 | UChar32 UnicodeSet::getRangeEnd(int32_t index) const { |
michael@0 | 1432 | return list[index*2 + 1] - 1; |
michael@0 | 1433 | } |
michael@0 | 1434 | |
michael@0 | 1435 | int32_t UnicodeSet::getStringCount() const { |
michael@0 | 1436 | return strings->size(); |
michael@0 | 1437 | } |
michael@0 | 1438 | |
michael@0 | 1439 | const UnicodeString* UnicodeSet::getString(int32_t index) const { |
michael@0 | 1440 | return (const UnicodeString*) strings->elementAt(index); |
michael@0 | 1441 | } |
michael@0 | 1442 | |
michael@0 | 1443 | /** |
michael@0 | 1444 | * Reallocate this objects internal structures to take up the least |
michael@0 | 1445 | * possible space, without changing this object's value. |
michael@0 | 1446 | */ |
michael@0 | 1447 | UnicodeSet& UnicodeSet::compact() { |
michael@0 | 1448 | if (isFrozen() || isBogus()) { |
michael@0 | 1449 | return *this; |
michael@0 | 1450 | } |
michael@0 | 1451 | // Delete buffer first to defragment memory less. |
michael@0 | 1452 | if (buffer != NULL) { |
michael@0 | 1453 | uprv_free(buffer); |
michael@0 | 1454 | buffer = NULL; |
michael@0 | 1455 | } |
michael@0 | 1456 | if (len < capacity) { |
michael@0 | 1457 | // Make the capacity equal to len or 1. |
michael@0 | 1458 | // We don't want to realloc of 0 size. |
michael@0 | 1459 | int32_t newCapacity = len + (len == 0); |
michael@0 | 1460 | UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); |
michael@0 | 1461 | if (temp) { |
michael@0 | 1462 | list = temp; |
michael@0 | 1463 | capacity = newCapacity; |
michael@0 | 1464 | } |
michael@0 | 1465 | // else what the heck happened?! We allocated less memory! |
michael@0 | 1466 | // Oh well. We'll keep our original array. |
michael@0 | 1467 | } |
michael@0 | 1468 | return *this; |
michael@0 | 1469 | } |
michael@0 | 1470 | |
michael@0 | 1471 | int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const { |
michael@0 | 1472 | int32_t bmpLength, length, destLength; |
michael@0 | 1473 | |
michael@0 | 1474 | if (U_FAILURE(ec)) { |
michael@0 | 1475 | return 0; |
michael@0 | 1476 | } |
michael@0 | 1477 | |
michael@0 | 1478 | if (destCapacity<0 || (destCapacity>0 && dest==NULL)) { |
michael@0 | 1479 | ec=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1480 | return 0; |
michael@0 | 1481 | } |
michael@0 | 1482 | |
michael@0 | 1483 | /* count necessary 16-bit units */ |
michael@0 | 1484 | length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH |
michael@0 | 1485 | // assert(length>=0); |
michael@0 | 1486 | if (length==0) { |
michael@0 | 1487 | /* empty set */ |
michael@0 | 1488 | if (destCapacity>0) { |
michael@0 | 1489 | *dest=0; |
michael@0 | 1490 | } else { |
michael@0 | 1491 | ec=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1492 | } |
michael@0 | 1493 | return 1; |
michael@0 | 1494 | } |
michael@0 | 1495 | /* now length>0 */ |
michael@0 | 1496 | |
michael@0 | 1497 | if (this->list[length-1]<=0xffff) { |
michael@0 | 1498 | /* all BMP */ |
michael@0 | 1499 | bmpLength=length; |
michael@0 | 1500 | } else if (this->list[0]>=0x10000) { |
michael@0 | 1501 | /* all supplementary */ |
michael@0 | 1502 | bmpLength=0; |
michael@0 | 1503 | length*=2; |
michael@0 | 1504 | } else { |
michael@0 | 1505 | /* some BMP, some supplementary */ |
michael@0 | 1506 | for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {} |
michael@0 | 1507 | length=bmpLength+2*(length-bmpLength); |
michael@0 | 1508 | } |
michael@0 | 1509 | |
michael@0 | 1510 | /* length: number of 16-bit array units */ |
michael@0 | 1511 | if (length>0x7fff) { |
michael@0 | 1512 | /* there are only 15 bits for the length in the first serialized word */ |
michael@0 | 1513 | ec=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1514 | return 0; |
michael@0 | 1515 | } |
michael@0 | 1516 | |
michael@0 | 1517 | /* |
michael@0 | 1518 | * total serialized length: |
michael@0 | 1519 | * number of 16-bit array units (length) + |
michael@0 | 1520 | * 1 length unit (always) + |
michael@0 | 1521 | * 1 bmpLength unit (if there are supplementary values) |
michael@0 | 1522 | */ |
michael@0 | 1523 | destLength=length+((length>bmpLength)?2:1); |
michael@0 | 1524 | if (destLength<=destCapacity) { |
michael@0 | 1525 | const UChar32 *p; |
michael@0 | 1526 | int32_t i; |
michael@0 | 1527 | |
michael@0 | 1528 | *dest=(uint16_t)length; |
michael@0 | 1529 | if (length>bmpLength) { |
michael@0 | 1530 | *dest|=0x8000; |
michael@0 | 1531 | *++dest=(uint16_t)bmpLength; |
michael@0 | 1532 | } |
michael@0 | 1533 | ++dest; |
michael@0 | 1534 | |
michael@0 | 1535 | /* write the BMP part of the array */ |
michael@0 | 1536 | p=this->list; |
michael@0 | 1537 | for (i=0; i<bmpLength; ++i) { |
michael@0 | 1538 | *dest++=(uint16_t)*p++; |
michael@0 | 1539 | } |
michael@0 | 1540 | |
michael@0 | 1541 | /* write the supplementary part of the array */ |
michael@0 | 1542 | for (; i<length; i+=2) { |
michael@0 | 1543 | *dest++=(uint16_t)(*p>>16); |
michael@0 | 1544 | *dest++=(uint16_t)*p++; |
michael@0 | 1545 | } |
michael@0 | 1546 | } else { |
michael@0 | 1547 | ec=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1548 | } |
michael@0 | 1549 | return destLength; |
michael@0 | 1550 | } |
michael@0 | 1551 | |
michael@0 | 1552 | //---------------------------------------------------------------- |
michael@0 | 1553 | // Implementation: Utility methods |
michael@0 | 1554 | //---------------------------------------------------------------- |
michael@0 | 1555 | |
michael@0 | 1556 | /** |
michael@0 | 1557 | * Allocate our strings vector and return TRUE if successful. |
michael@0 | 1558 | */ |
michael@0 | 1559 | UBool UnicodeSet::allocateStrings(UErrorCode &status) { |
michael@0 | 1560 | if (U_FAILURE(status)) { |
michael@0 | 1561 | return FALSE; |
michael@0 | 1562 | } |
michael@0 | 1563 | strings = new UVector(uprv_deleteUObject, |
michael@0 | 1564 | uhash_compareUnicodeString, 1, status); |
michael@0 | 1565 | if (strings == NULL) { // Check for memory allocation error. |
michael@0 | 1566 | status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1567 | return FALSE; |
michael@0 | 1568 | } |
michael@0 | 1569 | if (U_FAILURE(status)) { |
michael@0 | 1570 | delete strings; |
michael@0 | 1571 | strings = NULL; |
michael@0 | 1572 | return FALSE; |
michael@0 | 1573 | } |
michael@0 | 1574 | return TRUE; |
michael@0 | 1575 | } |
michael@0 | 1576 | |
michael@0 | 1577 | void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { |
michael@0 | 1578 | if (newLen <= capacity) |
michael@0 | 1579 | return; |
michael@0 | 1580 | UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); |
michael@0 | 1581 | if (temp == NULL) { |
michael@0 | 1582 | ec = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1583 | setToBogus(); |
michael@0 | 1584 | return; |
michael@0 | 1585 | } |
michael@0 | 1586 | list = temp; |
michael@0 | 1587 | capacity = newLen + GROW_EXTRA; |
michael@0 | 1588 | // else we keep the original contents on the memory failure. |
michael@0 | 1589 | } |
michael@0 | 1590 | |
michael@0 | 1591 | void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { |
michael@0 | 1592 | if (buffer != NULL && newLen <= bufferCapacity) |
michael@0 | 1593 | return; |
michael@0 | 1594 | UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); |
michael@0 | 1595 | if (temp == NULL) { |
michael@0 | 1596 | ec = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1597 | setToBogus(); |
michael@0 | 1598 | return; |
michael@0 | 1599 | } |
michael@0 | 1600 | buffer = temp; |
michael@0 | 1601 | bufferCapacity = newLen + GROW_EXTRA; |
michael@0 | 1602 | // else we keep the original contents on the memory failure. |
michael@0 | 1603 | } |
michael@0 | 1604 | |
michael@0 | 1605 | /** |
michael@0 | 1606 | * Swap list and buffer. |
michael@0 | 1607 | */ |
michael@0 | 1608 | void UnicodeSet::swapBuffers(void) { |
michael@0 | 1609 | // swap list and buffer |
michael@0 | 1610 | UChar32* temp = list; |
michael@0 | 1611 | list = buffer; |
michael@0 | 1612 | buffer = temp; |
michael@0 | 1613 | |
michael@0 | 1614 | int32_t c = capacity; |
michael@0 | 1615 | capacity = bufferCapacity; |
michael@0 | 1616 | bufferCapacity = c; |
michael@0 | 1617 | } |
michael@0 | 1618 | |
michael@0 | 1619 | void UnicodeSet::setToBogus() { |
michael@0 | 1620 | clear(); // Remove everything in the set. |
michael@0 | 1621 | fFlags = kIsBogus; |
michael@0 | 1622 | } |
michael@0 | 1623 | |
michael@0 | 1624 | //---------------------------------------------------------------- |
michael@0 | 1625 | // Implementation: Fundamental operators |
michael@0 | 1626 | //---------------------------------------------------------------- |
michael@0 | 1627 | |
michael@0 | 1628 | static inline UChar32 max(UChar32 a, UChar32 b) { |
michael@0 | 1629 | return (a > b) ? a : b; |
michael@0 | 1630 | } |
michael@0 | 1631 | |
michael@0 | 1632 | // polarity = 0, 3 is normal: x xor y |
michael@0 | 1633 | // polarity = 1, 2: x xor ~y == x === y |
michael@0 | 1634 | |
michael@0 | 1635 | void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) { |
michael@0 | 1636 | if (isFrozen() || isBogus()) { |
michael@0 | 1637 | return; |
michael@0 | 1638 | } |
michael@0 | 1639 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1640 | ensureBufferCapacity(len + otherLen, status); |
michael@0 | 1641 | if (U_FAILURE(status)) { |
michael@0 | 1642 | return; |
michael@0 | 1643 | } |
michael@0 | 1644 | |
michael@0 | 1645 | int32_t i = 0, j = 0, k = 0; |
michael@0 | 1646 | UChar32 a = list[i++]; |
michael@0 | 1647 | UChar32 b; |
michael@0 | 1648 | if (polarity == 1 || polarity == 2) { |
michael@0 | 1649 | b = UNICODESET_LOW; |
michael@0 | 1650 | if (other[j] == UNICODESET_LOW) { // skip base if already LOW |
michael@0 | 1651 | ++j; |
michael@0 | 1652 | b = other[j]; |
michael@0 | 1653 | } |
michael@0 | 1654 | } else { |
michael@0 | 1655 | b = other[j++]; |
michael@0 | 1656 | } |
michael@0 | 1657 | // simplest of all the routines |
michael@0 | 1658 | // sort the values, discarding identicals! |
michael@0 | 1659 | for (;;) { |
michael@0 | 1660 | if (a < b) { |
michael@0 | 1661 | buffer[k++] = a; |
michael@0 | 1662 | a = list[i++]; |
michael@0 | 1663 | } else if (b < a) { |
michael@0 | 1664 | buffer[k++] = b; |
michael@0 | 1665 | b = other[j++]; |
michael@0 | 1666 | } else if (a != UNICODESET_HIGH) { // at this point, a == b |
michael@0 | 1667 | // discard both values! |
michael@0 | 1668 | a = list[i++]; |
michael@0 | 1669 | b = other[j++]; |
michael@0 | 1670 | } else { // DONE! |
michael@0 | 1671 | buffer[k++] = UNICODESET_HIGH; |
michael@0 | 1672 | len = k; |
michael@0 | 1673 | break; |
michael@0 | 1674 | } |
michael@0 | 1675 | } |
michael@0 | 1676 | swapBuffers(); |
michael@0 | 1677 | releasePattern(); |
michael@0 | 1678 | } |
michael@0 | 1679 | |
michael@0 | 1680 | // polarity = 0 is normal: x union y |
michael@0 | 1681 | // polarity = 2: x union ~y |
michael@0 | 1682 | // polarity = 1: ~x union y |
michael@0 | 1683 | // polarity = 3: ~x union ~y |
michael@0 | 1684 | |
michael@0 | 1685 | void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { |
michael@0 | 1686 | if (isFrozen() || isBogus() || other==NULL) { |
michael@0 | 1687 | return; |
michael@0 | 1688 | } |
michael@0 | 1689 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1690 | ensureBufferCapacity(len + otherLen, status); |
michael@0 | 1691 | if (U_FAILURE(status)) { |
michael@0 | 1692 | return; |
michael@0 | 1693 | } |
michael@0 | 1694 | |
michael@0 | 1695 | int32_t i = 0, j = 0, k = 0; |
michael@0 | 1696 | UChar32 a = list[i++]; |
michael@0 | 1697 | UChar32 b = other[j++]; |
michael@0 | 1698 | // change from xor is that we have to check overlapping pairs |
michael@0 | 1699 | // polarity bit 1 means a is second, bit 2 means b is. |
michael@0 | 1700 | for (;;) { |
michael@0 | 1701 | switch (polarity) { |
michael@0 | 1702 | case 0: // both first; take lower if unequal |
michael@0 | 1703 | if (a < b) { // take a |
michael@0 | 1704 | // Back up over overlapping ranges in buffer[] |
michael@0 | 1705 | if (k > 0 && a <= buffer[k-1]) { |
michael@0 | 1706 | // Pick latter end value in buffer[] vs. list[] |
michael@0 | 1707 | a = max(list[i], buffer[--k]); |
michael@0 | 1708 | } else { |
michael@0 | 1709 | // No overlap |
michael@0 | 1710 | buffer[k++] = a; |
michael@0 | 1711 | a = list[i]; |
michael@0 | 1712 | } |
michael@0 | 1713 | i++; // Common if/else code factored out |
michael@0 | 1714 | polarity ^= 1; |
michael@0 | 1715 | } else if (b < a) { // take b |
michael@0 | 1716 | if (k > 0 && b <= buffer[k-1]) { |
michael@0 | 1717 | b = max(other[j], buffer[--k]); |
michael@0 | 1718 | } else { |
michael@0 | 1719 | buffer[k++] = b; |
michael@0 | 1720 | b = other[j]; |
michael@0 | 1721 | } |
michael@0 | 1722 | j++; |
michael@0 | 1723 | polarity ^= 2; |
michael@0 | 1724 | } else { // a == b, take a, drop b |
michael@0 | 1725 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1726 | // This is symmetrical; it doesn't matter if |
michael@0 | 1727 | // we backtrack with a or b. - liu |
michael@0 | 1728 | if (k > 0 && a <= buffer[k-1]) { |
michael@0 | 1729 | a = max(list[i], buffer[--k]); |
michael@0 | 1730 | } else { |
michael@0 | 1731 | // No overlap |
michael@0 | 1732 | buffer[k++] = a; |
michael@0 | 1733 | a = list[i]; |
michael@0 | 1734 | } |
michael@0 | 1735 | i++; |
michael@0 | 1736 | polarity ^= 1; |
michael@0 | 1737 | b = other[j++]; |
michael@0 | 1738 | polarity ^= 2; |
michael@0 | 1739 | } |
michael@0 | 1740 | break; |
michael@0 | 1741 | case 3: // both second; take higher if unequal, and drop other |
michael@0 | 1742 | if (b <= a) { // take a |
michael@0 | 1743 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1744 | buffer[k++] = a; |
michael@0 | 1745 | } else { // take b |
michael@0 | 1746 | if (b == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1747 | buffer[k++] = b; |
michael@0 | 1748 | } |
michael@0 | 1749 | a = list[i++]; |
michael@0 | 1750 | polarity ^= 1; // factored common code |
michael@0 | 1751 | b = other[j++]; |
michael@0 | 1752 | polarity ^= 2; |
michael@0 | 1753 | break; |
michael@0 | 1754 | case 1: // a second, b first; if b < a, overlap |
michael@0 | 1755 | if (a < b) { // no overlap, take a |
michael@0 | 1756 | buffer[k++] = a; a = list[i++]; polarity ^= 1; |
michael@0 | 1757 | } else if (b < a) { // OVERLAP, drop b |
michael@0 | 1758 | b = other[j++]; |
michael@0 | 1759 | polarity ^= 2; |
michael@0 | 1760 | } else { // a == b, drop both! |
michael@0 | 1761 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1762 | a = list[i++]; |
michael@0 | 1763 | polarity ^= 1; |
michael@0 | 1764 | b = other[j++]; |
michael@0 | 1765 | polarity ^= 2; |
michael@0 | 1766 | } |
michael@0 | 1767 | break; |
michael@0 | 1768 | case 2: // a first, b second; if a < b, overlap |
michael@0 | 1769 | if (b < a) { // no overlap, take b |
michael@0 | 1770 | buffer[k++] = b; |
michael@0 | 1771 | b = other[j++]; |
michael@0 | 1772 | polarity ^= 2; |
michael@0 | 1773 | } else if (a < b) { // OVERLAP, drop a |
michael@0 | 1774 | a = list[i++]; |
michael@0 | 1775 | polarity ^= 1; |
michael@0 | 1776 | } else { // a == b, drop both! |
michael@0 | 1777 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1778 | a = list[i++]; |
michael@0 | 1779 | polarity ^= 1; |
michael@0 | 1780 | b = other[j++]; |
michael@0 | 1781 | polarity ^= 2; |
michael@0 | 1782 | } |
michael@0 | 1783 | break; |
michael@0 | 1784 | } |
michael@0 | 1785 | } |
michael@0 | 1786 | loop_end: |
michael@0 | 1787 | buffer[k++] = UNICODESET_HIGH; // terminate |
michael@0 | 1788 | len = k; |
michael@0 | 1789 | swapBuffers(); |
michael@0 | 1790 | releasePattern(); |
michael@0 | 1791 | } |
michael@0 | 1792 | |
michael@0 | 1793 | // polarity = 0 is normal: x intersect y |
michael@0 | 1794 | // polarity = 2: x intersect ~y == set-minus |
michael@0 | 1795 | // polarity = 1: ~x intersect y |
michael@0 | 1796 | // polarity = 3: ~x intersect ~y |
michael@0 | 1797 | |
michael@0 | 1798 | void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) { |
michael@0 | 1799 | if (isFrozen() || isBogus()) { |
michael@0 | 1800 | return; |
michael@0 | 1801 | } |
michael@0 | 1802 | UErrorCode status = U_ZERO_ERROR; |
michael@0 | 1803 | ensureBufferCapacity(len + otherLen, status); |
michael@0 | 1804 | if (U_FAILURE(status)) { |
michael@0 | 1805 | return; |
michael@0 | 1806 | } |
michael@0 | 1807 | |
michael@0 | 1808 | int32_t i = 0, j = 0, k = 0; |
michael@0 | 1809 | UChar32 a = list[i++]; |
michael@0 | 1810 | UChar32 b = other[j++]; |
michael@0 | 1811 | // change from xor is that we have to check overlapping pairs |
michael@0 | 1812 | // polarity bit 1 means a is second, bit 2 means b is. |
michael@0 | 1813 | for (;;) { |
michael@0 | 1814 | switch (polarity) { |
michael@0 | 1815 | case 0: // both first; drop the smaller |
michael@0 | 1816 | if (a < b) { // drop a |
michael@0 | 1817 | a = list[i++]; |
michael@0 | 1818 | polarity ^= 1; |
michael@0 | 1819 | } else if (b < a) { // drop b |
michael@0 | 1820 | b = other[j++]; |
michael@0 | 1821 | polarity ^= 2; |
michael@0 | 1822 | } else { // a == b, take one, drop other |
michael@0 | 1823 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1824 | buffer[k++] = a; |
michael@0 | 1825 | a = list[i++]; |
michael@0 | 1826 | polarity ^= 1; |
michael@0 | 1827 | b = other[j++]; |
michael@0 | 1828 | polarity ^= 2; |
michael@0 | 1829 | } |
michael@0 | 1830 | break; |
michael@0 | 1831 | case 3: // both second; take lower if unequal |
michael@0 | 1832 | if (a < b) { // take a |
michael@0 | 1833 | buffer[k++] = a; |
michael@0 | 1834 | a = list[i++]; |
michael@0 | 1835 | polarity ^= 1; |
michael@0 | 1836 | } else if (b < a) { // take b |
michael@0 | 1837 | buffer[k++] = b; |
michael@0 | 1838 | b = other[j++]; |
michael@0 | 1839 | polarity ^= 2; |
michael@0 | 1840 | } else { // a == b, take one, drop other |
michael@0 | 1841 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1842 | buffer[k++] = a; |
michael@0 | 1843 | a = list[i++]; |
michael@0 | 1844 | polarity ^= 1; |
michael@0 | 1845 | b = other[j++]; |
michael@0 | 1846 | polarity ^= 2; |
michael@0 | 1847 | } |
michael@0 | 1848 | break; |
michael@0 | 1849 | case 1: // a second, b first; |
michael@0 | 1850 | if (a < b) { // NO OVERLAP, drop a |
michael@0 | 1851 | a = list[i++]; |
michael@0 | 1852 | polarity ^= 1; |
michael@0 | 1853 | } else if (b < a) { // OVERLAP, take b |
michael@0 | 1854 | buffer[k++] = b; |
michael@0 | 1855 | b = other[j++]; |
michael@0 | 1856 | polarity ^= 2; |
michael@0 | 1857 | } else { // a == b, drop both! |
michael@0 | 1858 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1859 | a = list[i++]; |
michael@0 | 1860 | polarity ^= 1; |
michael@0 | 1861 | b = other[j++]; |
michael@0 | 1862 | polarity ^= 2; |
michael@0 | 1863 | } |
michael@0 | 1864 | break; |
michael@0 | 1865 | case 2: // a first, b second; if a < b, overlap |
michael@0 | 1866 | if (b < a) { // no overlap, drop b |
michael@0 | 1867 | b = other[j++]; |
michael@0 | 1868 | polarity ^= 2; |
michael@0 | 1869 | } else if (a < b) { // OVERLAP, take a |
michael@0 | 1870 | buffer[k++] = a; |
michael@0 | 1871 | a = list[i++]; |
michael@0 | 1872 | polarity ^= 1; |
michael@0 | 1873 | } else { // a == b, drop both! |
michael@0 | 1874 | if (a == UNICODESET_HIGH) goto loop_end; |
michael@0 | 1875 | a = list[i++]; |
michael@0 | 1876 | polarity ^= 1; |
michael@0 | 1877 | b = other[j++]; |
michael@0 | 1878 | polarity ^= 2; |
michael@0 | 1879 | } |
michael@0 | 1880 | break; |
michael@0 | 1881 | } |
michael@0 | 1882 | } |
michael@0 | 1883 | loop_end: |
michael@0 | 1884 | buffer[k++] = UNICODESET_HIGH; // terminate |
michael@0 | 1885 | len = k; |
michael@0 | 1886 | swapBuffers(); |
michael@0 | 1887 | releasePattern(); |
michael@0 | 1888 | } |
michael@0 | 1889 | |
michael@0 | 1890 | /** |
michael@0 | 1891 | * Append the <code>toPattern()</code> representation of a |
michael@0 | 1892 | * string to the given <code>StringBuffer</code>. |
michael@0 | 1893 | */ |
michael@0 | 1894 | void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool |
michael@0 | 1895 | escapeUnprintable) { |
michael@0 | 1896 | UChar32 cp; |
michael@0 | 1897 | for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) { |
michael@0 | 1898 | _appendToPat(buf, cp = s.char32At(i), escapeUnprintable); |
michael@0 | 1899 | } |
michael@0 | 1900 | } |
michael@0 | 1901 | |
michael@0 | 1902 | /** |
michael@0 | 1903 | * Append the <code>toPattern()</code> representation of a |
michael@0 | 1904 | * character to the given <code>StringBuffer</code>. |
michael@0 | 1905 | */ |
michael@0 | 1906 | void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool |
michael@0 | 1907 | escapeUnprintable) { |
michael@0 | 1908 | if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { |
michael@0 | 1909 | // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything |
michael@0 | 1910 | // unprintable |
michael@0 | 1911 | if (ICU_Utility::escapeUnprintable(buf, c)) { |
michael@0 | 1912 | return; |
michael@0 | 1913 | } |
michael@0 | 1914 | } |
michael@0 | 1915 | // Okay to let ':' pass through |
michael@0 | 1916 | switch (c) { |
michael@0 | 1917 | case SET_OPEN: |
michael@0 | 1918 | case SET_CLOSE: |
michael@0 | 1919 | case HYPHEN: |
michael@0 | 1920 | case COMPLEMENT: |
michael@0 | 1921 | case INTERSECTION: |
michael@0 | 1922 | case BACKSLASH: |
michael@0 | 1923 | case OPEN_BRACE: |
michael@0 | 1924 | case CLOSE_BRACE: |
michael@0 | 1925 | case COLON: |
michael@0 | 1926 | case SymbolTable::SYMBOL_REF: |
michael@0 | 1927 | buf.append(BACKSLASH); |
michael@0 | 1928 | break; |
michael@0 | 1929 | default: |
michael@0 | 1930 | // Escape whitespace |
michael@0 | 1931 | if (PatternProps::isWhiteSpace(c)) { |
michael@0 | 1932 | buf.append(BACKSLASH); |
michael@0 | 1933 | } |
michael@0 | 1934 | break; |
michael@0 | 1935 | } |
michael@0 | 1936 | buf.append(c); |
michael@0 | 1937 | } |
michael@0 | 1938 | |
michael@0 | 1939 | /** |
michael@0 | 1940 | * Append a string representation of this set to result. This will be |
michael@0 | 1941 | * a cleaned version of the string passed to applyPattern(), if there |
michael@0 | 1942 | * is one. Otherwise it will be generated. |
michael@0 | 1943 | */ |
michael@0 | 1944 | UnicodeString& UnicodeSet::_toPattern(UnicodeString& result, |
michael@0 | 1945 | UBool escapeUnprintable) const |
michael@0 | 1946 | { |
michael@0 | 1947 | if (pat != NULL) { |
michael@0 | 1948 | int32_t i; |
michael@0 | 1949 | int32_t backslashCount = 0; |
michael@0 | 1950 | for (i=0; i<patLen; ) { |
michael@0 | 1951 | UChar32 c; |
michael@0 | 1952 | U16_NEXT(pat, i, patLen, c); |
michael@0 | 1953 | if (escapeUnprintable && ICU_Utility::isUnprintable(c)) { |
michael@0 | 1954 | // If the unprintable character is preceded by an odd |
michael@0 | 1955 | // number of backslashes, then it has been escaped. |
michael@0 | 1956 | // Before unescaping it, we delete the final |
michael@0 | 1957 | // backslash. |
michael@0 | 1958 | if ((backslashCount % 2) == 1) { |
michael@0 | 1959 | result.truncate(result.length() - 1); |
michael@0 | 1960 | } |
michael@0 | 1961 | ICU_Utility::escapeUnprintable(result, c); |
michael@0 | 1962 | backslashCount = 0; |
michael@0 | 1963 | } else { |
michael@0 | 1964 | result.append(c); |
michael@0 | 1965 | if (c == BACKSLASH) { |
michael@0 | 1966 | ++backslashCount; |
michael@0 | 1967 | } else { |
michael@0 | 1968 | backslashCount = 0; |
michael@0 | 1969 | } |
michael@0 | 1970 | } |
michael@0 | 1971 | } |
michael@0 | 1972 | return result; |
michael@0 | 1973 | } |
michael@0 | 1974 | |
michael@0 | 1975 | return _generatePattern(result, escapeUnprintable); |
michael@0 | 1976 | } |
michael@0 | 1977 | |
michael@0 | 1978 | /** |
michael@0 | 1979 | * Returns a string representation of this set. If the result of |
michael@0 | 1980 | * calling this function is passed to a UnicodeSet constructor, it |
michael@0 | 1981 | * will produce another set that is equal to this one. |
michael@0 | 1982 | */ |
michael@0 | 1983 | UnicodeString& UnicodeSet::toPattern(UnicodeString& result, |
michael@0 | 1984 | UBool escapeUnprintable) const |
michael@0 | 1985 | { |
michael@0 | 1986 | result.truncate(0); |
michael@0 | 1987 | return _toPattern(result, escapeUnprintable); |
michael@0 | 1988 | } |
michael@0 | 1989 | |
michael@0 | 1990 | /** |
michael@0 | 1991 | * Generate and append a string representation of this set to result. |
michael@0 | 1992 | * This does not use this.pat, the cleaned up copy of the string |
michael@0 | 1993 | * passed to applyPattern(). |
michael@0 | 1994 | */ |
michael@0 | 1995 | UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, |
michael@0 | 1996 | UBool escapeUnprintable) const |
michael@0 | 1997 | { |
michael@0 | 1998 | result.append(SET_OPEN); |
michael@0 | 1999 | |
michael@0 | 2000 | // // Check against the predefined categories. We implicitly build |
michael@0 | 2001 | // // up ALL category sets the first time toPattern() is called. |
michael@0 | 2002 | // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) { |
michael@0 | 2003 | // if (*this == getCategorySet(cat)) { |
michael@0 | 2004 | // result.append(COLON); |
michael@0 | 2005 | // result.append(CATEGORY_NAMES, cat*2, 2); |
michael@0 | 2006 | // return result.append(CATEGORY_CLOSE); |
michael@0 | 2007 | // } |
michael@0 | 2008 | // } |
michael@0 | 2009 | |
michael@0 | 2010 | int32_t count = getRangeCount(); |
michael@0 | 2011 | |
michael@0 | 2012 | // If the set contains at least 2 intervals and includes both |
michael@0 | 2013 | // MIN_VALUE and MAX_VALUE, then the inverse representation will |
michael@0 | 2014 | // be more economical. |
michael@0 | 2015 | if (count > 1 && |
michael@0 | 2016 | getRangeStart(0) == MIN_VALUE && |
michael@0 | 2017 | getRangeEnd(count-1) == MAX_VALUE) { |
michael@0 | 2018 | |
michael@0 | 2019 | // Emit the inverse |
michael@0 | 2020 | result.append(COMPLEMENT); |
michael@0 | 2021 | |
michael@0 | 2022 | for (int32_t i = 1; i < count; ++i) { |
michael@0 | 2023 | UChar32 start = getRangeEnd(i-1)+1; |
michael@0 | 2024 | UChar32 end = getRangeStart(i)-1; |
michael@0 | 2025 | _appendToPat(result, start, escapeUnprintable); |
michael@0 | 2026 | if (start != end) { |
michael@0 | 2027 | if ((start+1) != end) { |
michael@0 | 2028 | result.append(HYPHEN); |
michael@0 | 2029 | } |
michael@0 | 2030 | _appendToPat(result, end, escapeUnprintable); |
michael@0 | 2031 | } |
michael@0 | 2032 | } |
michael@0 | 2033 | } |
michael@0 | 2034 | |
michael@0 | 2035 | // Default; emit the ranges as pairs |
michael@0 | 2036 | else { |
michael@0 | 2037 | for (int32_t i = 0; i < count; ++i) { |
michael@0 | 2038 | UChar32 start = getRangeStart(i); |
michael@0 | 2039 | UChar32 end = getRangeEnd(i); |
michael@0 | 2040 | _appendToPat(result, start, escapeUnprintable); |
michael@0 | 2041 | if (start != end) { |
michael@0 | 2042 | if ((start+1) != end) { |
michael@0 | 2043 | result.append(HYPHEN); |
michael@0 | 2044 | } |
michael@0 | 2045 | _appendToPat(result, end, escapeUnprintable); |
michael@0 | 2046 | } |
michael@0 | 2047 | } |
michael@0 | 2048 | } |
michael@0 | 2049 | |
michael@0 | 2050 | for (int32_t i = 0; i<strings->size(); ++i) { |
michael@0 | 2051 | result.append(OPEN_BRACE); |
michael@0 | 2052 | _appendToPat(result, |
michael@0 | 2053 | *(const UnicodeString*) strings->elementAt(i), |
michael@0 | 2054 | escapeUnprintable); |
michael@0 | 2055 | result.append(CLOSE_BRACE); |
michael@0 | 2056 | } |
michael@0 | 2057 | return result.append(SET_CLOSE); |
michael@0 | 2058 | } |
michael@0 | 2059 | |
michael@0 | 2060 | /** |
michael@0 | 2061 | * Release existing cached pattern |
michael@0 | 2062 | */ |
michael@0 | 2063 | void UnicodeSet::releasePattern() { |
michael@0 | 2064 | if (pat) { |
michael@0 | 2065 | uprv_free(pat); |
michael@0 | 2066 | pat = NULL; |
michael@0 | 2067 | patLen = 0; |
michael@0 | 2068 | } |
michael@0 | 2069 | } |
michael@0 | 2070 | |
michael@0 | 2071 | /** |
michael@0 | 2072 | * Set the new pattern to cache. |
michael@0 | 2073 | */ |
michael@0 | 2074 | void UnicodeSet::setPattern(const UnicodeString& newPat) { |
michael@0 | 2075 | releasePattern(); |
michael@0 | 2076 | int32_t newPatLen = newPat.length(); |
michael@0 | 2077 | pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); |
michael@0 | 2078 | if (pat) { |
michael@0 | 2079 | patLen = newPatLen; |
michael@0 | 2080 | newPat.extractBetween(0, patLen, pat); |
michael@0 | 2081 | pat[patLen] = 0; |
michael@0 | 2082 | } |
michael@0 | 2083 | // else we don't care if malloc failed. This was just a nice cache. |
michael@0 | 2084 | // We can regenerate an equivalent pattern later when requested. |
michael@0 | 2085 | } |
michael@0 | 2086 | |
michael@0 | 2087 | UnicodeFunctor *UnicodeSet::freeze() { |
michael@0 | 2088 | if(!isFrozen() && !isBogus()) { |
michael@0 | 2089 | // Do most of what compact() does before freezing because |
michael@0 | 2090 | // compact() will not work when the set is frozen. |
michael@0 | 2091 | // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). |
michael@0 | 2092 | |
michael@0 | 2093 | // Delete buffer first to defragment memory less. |
michael@0 | 2094 | if (buffer != NULL) { |
michael@0 | 2095 | uprv_free(buffer); |
michael@0 | 2096 | buffer = NULL; |
michael@0 | 2097 | } |
michael@0 | 2098 | if (capacity > (len + GROW_EXTRA)) { |
michael@0 | 2099 | // Make the capacity equal to len or 1. |
michael@0 | 2100 | // We don't want to realloc of 0 size. |
michael@0 | 2101 | capacity = len + (len == 0); |
michael@0 | 2102 | list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); |
michael@0 | 2103 | if (list == NULL) { // Check for memory allocation error. |
michael@0 | 2104 | setToBogus(); |
michael@0 | 2105 | return this; |
michael@0 | 2106 | } |
michael@0 | 2107 | } |
michael@0 | 2108 | |
michael@0 | 2109 | // Optimize contains() and span() and similar functions. |
michael@0 | 2110 | if (!strings->isEmpty()) { |
michael@0 | 2111 | stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); |
michael@0 | 2112 | if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { |
michael@0 | 2113 | // All strings are irrelevant for span() etc. because |
michael@0 | 2114 | // all of each string's code points are contained in this set. |
michael@0 | 2115 | // Do not check needsStringSpanUTF8() because UTF-8 has at most as |
michael@0 | 2116 | // many relevant strings as UTF-16. |
michael@0 | 2117 | // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().) |
michael@0 | 2118 | delete stringSpan; |
michael@0 | 2119 | stringSpan = NULL; |
michael@0 | 2120 | } |
michael@0 | 2121 | } |
michael@0 | 2122 | if (stringSpan == NULL) { |
michael@0 | 2123 | // No span-relevant strings: Optimize for code point spans. |
michael@0 | 2124 | bmpSet=new BMPSet(list, len); |
michael@0 | 2125 | if (bmpSet == NULL) { // Check for memory allocation error. |
michael@0 | 2126 | setToBogus(); |
michael@0 | 2127 | } |
michael@0 | 2128 | } |
michael@0 | 2129 | } |
michael@0 | 2130 | return this; |
michael@0 | 2131 | } |
michael@0 | 2132 | |
michael@0 | 2133 | int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 2134 | if(length>0 && bmpSet!=NULL) { |
michael@0 | 2135 | return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s); |
michael@0 | 2136 | } |
michael@0 | 2137 | if(length<0) { |
michael@0 | 2138 | length=u_strlen(s); |
michael@0 | 2139 | } |
michael@0 | 2140 | if(length==0) { |
michael@0 | 2141 | return 0; |
michael@0 | 2142 | } |
michael@0 | 2143 | if(stringSpan!=NULL) { |
michael@0 | 2144 | return stringSpan->span(s, length, spanCondition); |
michael@0 | 2145 | } else if(!strings->isEmpty()) { |
michael@0 | 2146 | uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
michael@0 | 2147 | UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : |
michael@0 | 2148 | UnicodeSetStringSpan::FWD_UTF16_CONTAINED; |
michael@0 | 2149 | UnicodeSetStringSpan strSpan(*this, *strings, which); |
michael@0 | 2150 | if(strSpan.needsStringSpanUTF16()) { |
michael@0 | 2151 | return strSpan.span(s, length, spanCondition); |
michael@0 | 2152 | } |
michael@0 | 2153 | } |
michael@0 | 2154 | |
michael@0 | 2155 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 2156 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 2157 | } |
michael@0 | 2158 | |
michael@0 | 2159 | UChar32 c; |
michael@0 | 2160 | int32_t start=0, prev=0; |
michael@0 | 2161 | do { |
michael@0 | 2162 | U16_NEXT(s, start, length, c); |
michael@0 | 2163 | if(spanCondition!=contains(c)) { |
michael@0 | 2164 | break; |
michael@0 | 2165 | } |
michael@0 | 2166 | } while((prev=start)<length); |
michael@0 | 2167 | return prev; |
michael@0 | 2168 | } |
michael@0 | 2169 | |
michael@0 | 2170 | int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 2171 | if(length>0 && bmpSet!=NULL) { |
michael@0 | 2172 | return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s); |
michael@0 | 2173 | } |
michael@0 | 2174 | if(length<0) { |
michael@0 | 2175 | length=u_strlen(s); |
michael@0 | 2176 | } |
michael@0 | 2177 | if(length==0) { |
michael@0 | 2178 | return 0; |
michael@0 | 2179 | } |
michael@0 | 2180 | if(stringSpan!=NULL) { |
michael@0 | 2181 | return stringSpan->spanBack(s, length, spanCondition); |
michael@0 | 2182 | } else if(!strings->isEmpty()) { |
michael@0 | 2183 | uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
michael@0 | 2184 | UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : |
michael@0 | 2185 | UnicodeSetStringSpan::BACK_UTF16_CONTAINED; |
michael@0 | 2186 | UnicodeSetStringSpan strSpan(*this, *strings, which); |
michael@0 | 2187 | if(strSpan.needsStringSpanUTF16()) { |
michael@0 | 2188 | return strSpan.spanBack(s, length, spanCondition); |
michael@0 | 2189 | } |
michael@0 | 2190 | } |
michael@0 | 2191 | |
michael@0 | 2192 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 2193 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 2194 | } |
michael@0 | 2195 | |
michael@0 | 2196 | UChar32 c; |
michael@0 | 2197 | int32_t prev=length; |
michael@0 | 2198 | do { |
michael@0 | 2199 | U16_PREV(s, 0, length, c); |
michael@0 | 2200 | if(spanCondition!=contains(c)) { |
michael@0 | 2201 | break; |
michael@0 | 2202 | } |
michael@0 | 2203 | } while((prev=length)>0); |
michael@0 | 2204 | return prev; |
michael@0 | 2205 | } |
michael@0 | 2206 | |
michael@0 | 2207 | int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 2208 | if(length>0 && bmpSet!=NULL) { |
michael@0 | 2209 | const uint8_t *s0=(const uint8_t *)s; |
michael@0 | 2210 | return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0); |
michael@0 | 2211 | } |
michael@0 | 2212 | if(length<0) { |
michael@0 | 2213 | length=(int32_t)uprv_strlen(s); |
michael@0 | 2214 | } |
michael@0 | 2215 | if(length==0) { |
michael@0 | 2216 | return 0; |
michael@0 | 2217 | } |
michael@0 | 2218 | if(stringSpan!=NULL) { |
michael@0 | 2219 | return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); |
michael@0 | 2220 | } else if(!strings->isEmpty()) { |
michael@0 | 2221 | uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
michael@0 | 2222 | UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : |
michael@0 | 2223 | UnicodeSetStringSpan::FWD_UTF8_CONTAINED; |
michael@0 | 2224 | UnicodeSetStringSpan strSpan(*this, *strings, which); |
michael@0 | 2225 | if(strSpan.needsStringSpanUTF8()) { |
michael@0 | 2226 | return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition); |
michael@0 | 2227 | } |
michael@0 | 2228 | } |
michael@0 | 2229 | |
michael@0 | 2230 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 2231 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 2232 | } |
michael@0 | 2233 | |
michael@0 | 2234 | UChar32 c; |
michael@0 | 2235 | int32_t start=0, prev=0; |
michael@0 | 2236 | do { |
michael@0 | 2237 | U8_NEXT_OR_FFFD(s, start, length, c); |
michael@0 | 2238 | if(spanCondition!=contains(c)) { |
michael@0 | 2239 | break; |
michael@0 | 2240 | } |
michael@0 | 2241 | } while((prev=start)<length); |
michael@0 | 2242 | return prev; |
michael@0 | 2243 | } |
michael@0 | 2244 | |
michael@0 | 2245 | int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 2246 | if(length>0 && bmpSet!=NULL) { |
michael@0 | 2247 | const uint8_t *s0=(const uint8_t *)s; |
michael@0 | 2248 | return bmpSet->spanBackUTF8(s0, length, spanCondition); |
michael@0 | 2249 | } |
michael@0 | 2250 | if(length<0) { |
michael@0 | 2251 | length=(int32_t)uprv_strlen(s); |
michael@0 | 2252 | } |
michael@0 | 2253 | if(length==0) { |
michael@0 | 2254 | return 0; |
michael@0 | 2255 | } |
michael@0 | 2256 | if(stringSpan!=NULL) { |
michael@0 | 2257 | return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); |
michael@0 | 2258 | } else if(!strings->isEmpty()) { |
michael@0 | 2259 | uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? |
michael@0 | 2260 | UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : |
michael@0 | 2261 | UnicodeSetStringSpan::BACK_UTF8_CONTAINED; |
michael@0 | 2262 | UnicodeSetStringSpan strSpan(*this, *strings, which); |
michael@0 | 2263 | if(strSpan.needsStringSpanUTF8()) { |
michael@0 | 2264 | return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition); |
michael@0 | 2265 | } |
michael@0 | 2266 | } |
michael@0 | 2267 | |
michael@0 | 2268 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 2269 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 2270 | } |
michael@0 | 2271 | |
michael@0 | 2272 | UChar32 c; |
michael@0 | 2273 | int32_t prev=length; |
michael@0 | 2274 | do { |
michael@0 | 2275 | U8_PREV_OR_FFFD(s, 0, length, c); |
michael@0 | 2276 | if(spanCondition!=contains(c)) { |
michael@0 | 2277 | break; |
michael@0 | 2278 | } |
michael@0 | 2279 | } while((prev=length)>0); |
michael@0 | 2280 | return prev; |
michael@0 | 2281 | } |
michael@0 | 2282 | |
michael@0 | 2283 | U_NAMESPACE_END |