michael@0: /*
michael@0: **********************************************************************
michael@0: *   Copyright (C) 1999-2012, International Business Machines
michael@0: *   Corporation and others.  All Rights Reserved.
michael@0: **********************************************************************
michael@0: *   Date        Name        Description
michael@0: *   10/20/99    alan        Creation.
michael@0: **********************************************************************
michael@0: */
michael@0: 
michael@0: #include "unicode/utypes.h"
michael@0: #include "unicode/parsepos.h"
michael@0: #include "unicode/symtable.h"
michael@0: #include "unicode/uniset.h"
michael@0: #include "unicode/utf8.h"
michael@0: #include "unicode/utf16.h"
michael@0: #include "ruleiter.h"
michael@0: #include "cmemory.h"
michael@0: #include "cstring.h"
michael@0: #include "patternprops.h"
michael@0: #include "uelement.h"
michael@0: #include "util.h"
michael@0: #include "uvector.h"
michael@0: #include "charstr.h"
michael@0: #include "ustrfmt.h"
michael@0: #include "uassert.h"
michael@0: #include "bmpset.h"
michael@0: #include "unisetspan.h"
michael@0: 
// UChar constants are written as hexadecimal code unit values rather than
// character literals so that they are EBCDIC-compatible.
// They are #defines in order to reduce private static exports and memory
// access time.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/

// Sentinel that is greater than every valid value: 0x110000 for code points.
#define UNICODESET_HIGH 0x110000

// Smallest valid value: zero for code points.
#define UNICODESET_LOW 0

// Number of spare slots added to the initial list capacity. Must be >= 0.
#define START_EXTRA 16

// Number of spare slots added when growing. Must be >= 0.
#define GROW_EXTRA START_EXTRA
michael@0: 
michael@0: U_NAMESPACE_BEGIN
michael@0: 
michael@0: SymbolTable::~SymbolTable() {}
michael@0: 
michael@0: UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
michael@0: 
michael@0: /**
michael@0:  * Modify the given UChar32 variable so that it is in range, by
michael@0:  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
michael@0:  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
michael@0:  * It modifies its argument in-place and also returns it.
michael@0:  */
michael@0: static inline UChar32 pinCodePoint(UChar32& c) {
michael@0:     if (c < UNICODESET_LOW) {
michael@0:         c = UNICODESET_LOW;
michael@0:     } else if (c > (UNICODESET_HIGH-1)) {
michael@0:         c = (UNICODESET_HIGH-1);
michael@0:     }
michael@0:     return c;
michael@0: }
michael@0: 
//----------------------------------------------------------------
// Debugging
//----------------------------------------------------------------

// DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
// To enable the debugging, define the symbol DEBUG_MEM in the line
// below.  This will result in text being sent to stdout that looks
// like this (the exact spelling of the address is whatever the
// platform's printf produces for %p):
//   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
//   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
// Each line lists a construction (ct) or destruction (dt) event, the
// object address, the number of outstanding objects after the event,
// and the pattern of the object in question.

// #define DEBUG_MEM

#ifdef DEBUG_MEM
#include <stdio.h>
static int32_t _dbgCount = 0;

// Prints one trace line for the given event tag ("ct" or "dt").
// outstanding is the object count after the event has been applied.
static inline void _dbgReport(const char* event, UnicodeSet* set, int32_t outstanding) {
    UnicodeString str;
    set->toPattern(str, TRUE);
    char buf[40];
    str.extract(0, 39, buf, "");
    // A pointer must be printed with %p and passed as void*; handing it to
    // %X (which expects unsigned int) is undefined behaviour and drops the
    // upper half of the address on 64-bit platforms.
    printf("DEBUG UnicodeSet: %s %p; %d %s\n", event, (void*)set, (int)outstanding, buf);
}

// Records the construction of a set.
static inline void _dbgct(UnicodeSet* set) {
    _dbgReport("ct", set, ++_dbgCount);
}

// Records the destruction of a set.
static inline void _dbgdt(UnicodeSet* set) {
    _dbgReport("dt", set, --_dbgCount);
}

#else

// Tracing disabled: both hooks expand to nothing.
#define _dbgct(set)
#define _dbgdt(set)

#endif
michael@0: 
michael@0: //----------------------------------------------------------------
michael@0: // UnicodeString in UVector support
michael@0: //----------------------------------------------------------------
michael@0: 
michael@0: static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
michael@0:     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
michael@0: }
michael@0: 
michael@0: static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
michael@0:     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
michael@0:     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
michael@0:     return a.compare(b);
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------
michael@0: // Constructors &c
michael@0: //----------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Constructs an empty set.
michael@0:  */
michael@0: UnicodeSet::UnicodeSet() :
michael@0:     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
michael@0:     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0:     fFlags(0)
michael@0: {
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     allocateStrings(status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0:     if(list!=NULL){
michael@0:         list[0] = UNICODESET_HIGH;
michael@0:     } else { // If memory allocation failed, set to bogus state.
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0:     _dbgct(this);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Constructs a set containing the given range. If <code>end >
michael@0:  * start</code> then an empty set is created.
michael@0:  *
michael@0:  * @param start first character, inclusive, of range
michael@0:  * @param end last character, inclusive, of range
michael@0:  */
michael@0: UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
michael@0:     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
michael@0:     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0:     fFlags(0)
michael@0: {
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     allocateStrings(status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0:     if(list!=NULL){
michael@0:         list[0] = UNICODESET_HIGH;
michael@0:         complement(start, end);
michael@0:     } else { // If memory allocation failed, set to bogus state.
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0:     _dbgct(this);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Constructs a set that is identical to the given UnicodeSet.
michael@0:  */
michael@0: UnicodeSet::UnicodeSet(const UnicodeSet& o) :
michael@0:     UnicodeFilter(o),
michael@0:     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
michael@0:     bmpSet(0),
michael@0:     buffer(0), bufferCapacity(0),
michael@0:     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0:     fFlags(0)
michael@0: {
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     allocateStrings(status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0:     if(list!=NULL){
michael@0:         *this = o;
michael@0:     } else { // If memory allocation failed, set to bogus state.
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0:     _dbgct(this);
michael@0: }
michael@0: 
michael@0: // Copy-construct as thawed.
michael@0: UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
michael@0:     UnicodeFilter(o),
michael@0:     len(0), capacity(o.len + GROW_EXTRA), list(0),
michael@0:     bmpSet(0),
michael@0:     buffer(0), bufferCapacity(0),
michael@0:     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
michael@0:     fFlags(0)
michael@0: {
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     allocateStrings(status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0:     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
michael@0:     if(list!=NULL){
michael@0:         // *this = o except for bmpSet and stringSpan
michael@0:         len = o.len;
michael@0:         uprv_memcpy(list, o.list, len*sizeof(UChar32));
michael@0:         if (strings != NULL && o.strings != NULL) {
michael@0:             strings->assign(*o.strings, cloneUnicodeString, status);
michael@0:         } else { // Invalid strings.
michael@0:             setToBogus();
michael@0:             return;
michael@0:         }
michael@0:         if (o.pat) {
michael@0:             setPattern(UnicodeString(o.pat, o.patLen));
michael@0:         }
michael@0:     } else { // If memory allocation failed, set to bogus state.
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0:     _dbgct(this);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Destructs the set.
michael@0:  */
michael@0: UnicodeSet::~UnicodeSet() {
michael@0:     _dbgdt(this); // first!
michael@0:     uprv_free(list);
michael@0:     delete bmpSet;
michael@0:     if (buffer) {
michael@0:         uprv_free(buffer);
michael@0:     }
michael@0:     delete strings;
michael@0:     delete stringSpan;
michael@0:     releasePattern();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Assigns this object to be a copy of another.
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
michael@0:     if (this == &o) {
michael@0:         return *this;
michael@0:     }
michael@0:     if (isFrozen()) {
michael@0:         return *this;
michael@0:     }
michael@0:     if (o.isBogus()) {
michael@0:         setToBogus();
michael@0:         return *this;
michael@0:     }
michael@0:     UErrorCode ec = U_ZERO_ERROR;
michael@0:     ensureCapacity(o.len, ec);
michael@0:     if (U_FAILURE(ec)) {
michael@0:         return *this; // There is no way to report this error :-(
michael@0:     }
michael@0:     len = o.len;
michael@0:     uprv_memcpy(list, o.list, len*sizeof(UChar32));
michael@0:     if (o.bmpSet == NULL) {
michael@0:         bmpSet = NULL;
michael@0:     } else {
michael@0:         bmpSet = new BMPSet(*o.bmpSet, list, len);
michael@0:         if (bmpSet == NULL) { // Check for memory allocation error.
michael@0:             setToBogus();
michael@0:             return *this;
michael@0:         }
michael@0:     }
michael@0:     if (strings != NULL && o.strings != NULL) {
michael@0:         strings->assign(*o.strings, cloneUnicodeString, ec);
michael@0:     } else { // Invalid strings.
michael@0:         setToBogus();
michael@0:         return *this;
michael@0:     }
michael@0:     if (o.stringSpan == NULL) {
michael@0:         stringSpan = NULL;
michael@0:     } else {
michael@0:         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
michael@0:         if (stringSpan == NULL) { // Check for memory allocation error.
michael@0:             setToBogus();
michael@0:             return *this;
michael@0:         }
michael@0:     }
michael@0:     releasePattern();
michael@0:     if (o.pat) {
michael@0:         setPattern(UnicodeString(o.pat, o.patLen));
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns a copy of this object.  All UnicodeMatcher objects have
michael@0:  * to support cloning in order to allow classes using
michael@0:  * UnicodeMatchers, such as Transliterator, to implement cloning.
michael@0:  */
michael@0: UnicodeFunctor* UnicodeSet::clone() const {
michael@0:     return new UnicodeSet(*this);
michael@0: }
michael@0: 
michael@0: UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
michael@0:     return new UnicodeSet(*this, TRUE);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Compares the specified object with this set for equality.  Returns
michael@0:  * <tt>true</tt> if the two sets
michael@0:  * have the same size, and every member of the specified set is
michael@0:  * contained in this set (or equivalently, every member of this set is
michael@0:  * contained in the specified set).
michael@0:  *
michael@0:  * @param o set to be compared for equality with this set.
michael@0:  * @return <tt>true</tt> if the specified set is equal to this set.
michael@0:  */
michael@0: UBool UnicodeSet::operator==(const UnicodeSet& o) const {
michael@0:     if (len != o.len) return FALSE;
michael@0:     for (int32_t i = 0; i < len; ++i) {
michael@0:         if (list[i] != o.list[i]) return FALSE;
michael@0:     }
michael@0:     if (*strings != *o.strings) return FALSE;
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the hash code value for this set.
michael@0:  *
michael@0:  * @return the hash code value for this set.
michael@0:  * @see Object#hashCode()
michael@0:  */
michael@0: int32_t UnicodeSet::hashCode(void) const {
michael@0:     int32_t result = len;
michael@0:     for (int32_t i = 0; i < len; ++i) {
michael@0:         result *= 1000003;
michael@0:         result += list[i];
michael@0:     }
michael@0:     return result;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------
michael@0: // Public API
michael@0: //----------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Returns the number of elements in this set (its cardinality),
michael@0:  * Note than the elements of a set may include both individual
michael@0:  * codepoints and strings.
michael@0:  *
michael@0:  * @return the number of elements in this set (its cardinality).
michael@0:  */
michael@0: int32_t UnicodeSet::size(void) const {
michael@0:     int32_t n = 0;
michael@0:     int32_t count = getRangeCount();
michael@0:     for (int32_t i = 0; i < count; ++i) {
michael@0:         n += getRangeEnd(i) - getRangeStart(i) + 1;
michael@0:     }
michael@0:     return n + strings->size();
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns <tt>true</tt> if this set contains no elements.
michael@0:  *
michael@0:  * @return <tt>true</tt> if this set contains no elements.
michael@0:  */
michael@0: UBool UnicodeSet::isEmpty(void) const {
michael@0:     return len == 1 && strings->size() == 0;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains the given character.
michael@0:  * @param c character to be checked for containment
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::contains(UChar32 c) const {
michael@0:     // Set i to the index of the start item greater than ch
michael@0:     // We know we will terminate without length test!
michael@0:     // LATER: for large sets, add binary search
michael@0:     //int32_t i = -1;
michael@0:     //for (;;) {
michael@0:     //    if (c < list[++i]) break;
michael@0:     //}
michael@0:     if (bmpSet != NULL) {
michael@0:         return bmpSet->contains(c);
michael@0:     }
michael@0:     if (stringSpan != NULL) {
michael@0:         return stringSpan->contains(c);
michael@0:     }
michael@0:     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
michael@0:         return FALSE;
michael@0:     }
michael@0:     int32_t i = findCodePoint(c);
michael@0:     return (UBool)(i & 1); // return true if odd
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the smallest value i such that c < list[i].  Caller
michael@0:  * must ensure that c is a legal value or this method will enter
michael@0:  * an infinite loop.  This method performs a binary search.
michael@0:  * @param c a character in the range MIN_VALUE..MAX_VALUE
michael@0:  * inclusive
michael@0:  * @return the smallest integer i in the range 0..len-1,
michael@0:  * inclusive, such that c < list[i]
michael@0:  */
michael@0: int32_t UnicodeSet::findCodePoint(UChar32 c) const {
michael@0:     /* Examples:
michael@0:                                        findCodePoint(c)
michael@0:        set              list[]         c=0 1 3 4 7 8
michael@0:        ===              ==============   ===========
michael@0:        []               [110000]         0 0 0 0 0 0
michael@0:        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
michael@0:        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
michael@0:        [:Any:]          [0, 110000]      1 1 1 1 1 1
michael@0:      */
michael@0: 
michael@0:     // Return the smallest i such that c < list[i].  Assume
michael@0:     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
michael@0:     if (c < list[0])
michael@0:         return 0;
michael@0:     // High runner test.  c is often after the last range, so an
michael@0:     // initial check for this condition pays off.
michael@0:     int32_t lo = 0;
michael@0:     int32_t hi = len - 1;
michael@0:     if (lo >= hi || c >= list[hi-1])
michael@0:         return hi;
michael@0:     // invariant: c >= list[lo]
michael@0:     // invariant: c < list[hi]
michael@0:     for (;;) {
michael@0:         int32_t i = (lo + hi) >> 1;
michael@0:         if (i == lo) {
michael@0:             break; // Found!
michael@0:         } else if (c < list[i]) {
michael@0:             hi = i;
michael@0:         } else {
michael@0:             lo = i;
michael@0:         }
michael@0:     }
michael@0:     return hi;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains every character
michael@0:  * of the given range.
michael@0:  * @param start first character, inclusive, of the range
michael@0:  * @param end last character, inclusive, of the range
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
michael@0:     //int32_t i = -1;
michael@0:     //for (;;) {
michael@0:     //    if (start < list[++i]) break;
michael@0:     //}
michael@0:     int32_t i = findCodePoint(start);
michael@0:     return ((i & 1) != 0 && end < list[i]);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns <tt>true</tt> if this set contains the given
michael@0:  * multicharacter string.
michael@0:  * @param s string to be checked for containment
michael@0:  * @return <tt>true</tt> if this set contains the specified string
michael@0:  */
michael@0: UBool UnicodeSet::contains(const UnicodeString& s) const {
michael@0:     if (s.length() == 0) return FALSE;
michael@0:     int32_t cp = getSingleCP(s);
michael@0:     if (cp < 0) {
michael@0:         return strings->contains((void*) &s);
michael@0:     } else {
michael@0:         return contains((UChar32) cp);
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains all the characters and strings
michael@0:  * of the given set.
michael@0:  * @param c set to be checked for containment
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
michael@0:     // The specified set is a subset if all of its pairs are contained in
michael@0:     // this set.  It's possible to code this more efficiently in terms of
michael@0:     // direct manipulation of the inversion lists if the need arises.
michael@0:     int32_t n = c.getRangeCount();
michael@0:     for (int i=0; i<n; ++i) {
michael@0:         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
michael@0:             return FALSE;
michael@0:         }
michael@0:     }
michael@0:     if (!strings->containsAll(*c.strings)) return FALSE;
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains all the characters
michael@0:  * of the given string.
michael@0:  * @param s string containing characters to be checked for containment
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::containsAll(const UnicodeString& s) const {
michael@0:     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
michael@0:                    s.length());
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains none of the characters
michael@0:  * of the given range.
michael@0:  * @param start first character, inclusive, of the range
michael@0:  * @param end last character, inclusive, of the range
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
michael@0:     //int32_t i = -1;
michael@0:     //for (;;) {
michael@0:     //    if (start < list[++i]) break;
michael@0:     //}
michael@0:     int32_t i = findCodePoint(start);
michael@0:     return ((i & 1) == 0 && end < list[i]);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains none of the characters and strings
michael@0:  * of the given set.
michael@0:  * @param c set to be checked for containment
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
michael@0:     // The specified set is a subset if all of its pairs are contained in
michael@0:     // this set.  It's possible to code this more efficiently in terms of
michael@0:     // direct manipulation of the inversion lists if the need arises.
michael@0:     int32_t n = c.getRangeCount();
michael@0:     for (int32_t i=0; i<n; ++i) {
michael@0:         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
michael@0:             return FALSE;
michael@0:         }
michael@0:     }
michael@0:     if (!strings->containsNone(*c.strings)) return FALSE;
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns true if this set contains none of the characters
michael@0:  * of the given string.
michael@0:  * @param s string containing characters to be checked for containment
michael@0:  * @return true if the test condition is met
michael@0:  */
michael@0: UBool UnicodeSet::containsNone(const UnicodeString& s) const {
michael@0:     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
michael@0:                    s.length());
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns <tt>true</tt> if this set contains any character whose low byte
michael@0:  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
michael@0:  * indexing.
michael@0:  */
michael@0: UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
michael@0:     /* The index value v, in the range [0,255], is contained in this set if
michael@0:      * it is contained in any pair of this set.  Pairs either have the high
michael@0:      * bytes equal, or unequal.  If the high bytes are equal, then we have
michael@0:      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
michael@0:      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
michael@0:      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
michael@0:      * time zone month containment logic.)
michael@0:      */
michael@0:     int32_t i;
michael@0:     int32_t rangeCount=getRangeCount();
michael@0:     for (i=0; i<rangeCount; ++i) {
michael@0:         UChar32 low = getRangeStart(i);
michael@0:         UChar32 high = getRangeEnd(i);
michael@0:         if ((low & ~0xFF) == (high & ~0xFF)) {
michael@0:             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
michael@0:                 return TRUE;
michael@0:             }
michael@0:         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
michael@0:             return TRUE;
michael@0:         }
michael@0:     }
michael@0:     if (strings->size() != 0) {
michael@0:         for (i=0; i<strings->size(); ++i) {
michael@0:             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
michael@0:             //if (s.length() == 0) {
michael@0:             //    // Empty strings match everything
michael@0:             //    return TRUE;
michael@0:             //}
michael@0:             // assert(s.length() != 0); // We enforce this elsewhere
michael@0:             UChar32 c = s.char32At(0);
michael@0:             if ((c & 0xFF) == v) {
michael@0:                 return TRUE;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     return FALSE;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Implementation of UnicodeMatcher::matches().  Always matches the
michael@0:  * longest possible multichar string.
michael@0:  */
michael@0: UMatchDegree UnicodeSet::matches(const Replaceable& text,
michael@0:                                  int32_t& offset,
michael@0:                                  int32_t limit,
michael@0:                                  UBool incremental) {
michael@0:     if (offset == limit) {
michael@0:         // Strings, if any, have length != 0, so we don't worry
michael@0:         // about them here.  If we ever allow zero-length strings
michael@0:         // we much check for them here.
michael@0:         if (contains(U_ETHER)) {
michael@0:             return incremental ? U_PARTIAL_MATCH : U_MATCH;
michael@0:         } else {
michael@0:             return U_MISMATCH;
michael@0:         }
michael@0:     } else {
michael@0:         if (strings->size() != 0) { // try strings first
michael@0: 
michael@0:             // might separate forward and backward loops later
michael@0:             // for now they are combined
michael@0: 
michael@0:             // TODO Improve efficiency of this, at least in the forward
michael@0:             // direction, if not in both.  In the forward direction we
michael@0:             // can assume the strings are sorted.
michael@0: 
michael@0:             int32_t i;
michael@0:             UBool forward = offset < limit;
michael@0: 
michael@0:             // firstChar is the leftmost char to match in the
michael@0:             // forward direction or the rightmost char to match in
michael@0:             // the reverse direction.
michael@0:             UChar firstChar = text.charAt(offset);
michael@0: 
michael@0:             // If there are multiple strings that can match we
michael@0:             // return the longest match.
michael@0:             int32_t highWaterLength = 0;
michael@0: 
michael@0:             for (i=0; i<strings->size(); ++i) {
michael@0:                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
michael@0: 
michael@0:                 //if (trial.length() == 0) {
michael@0:                 //    return U_MATCH; // null-string always matches
michael@0:                 //}
michael@0:                 // assert(trial.length() != 0); // We ensure this elsewhere
michael@0: 
michael@0:                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
michael@0: 
michael@0:                 // Strings are sorted, so we can optimize in the
michael@0:                 // forward direction.
michael@0:                 if (forward && c > firstChar) break;
michael@0:                 if (c != firstChar) continue;
michael@0: 
michael@0:                 int32_t matchLen = matchRest(text, offset, limit, trial);
michael@0: 
michael@0:                 if (incremental) {
michael@0:                     int32_t maxLen = forward ? limit-offset : offset-limit;
michael@0:                     if (matchLen == maxLen) {
michael@0:                         // We have successfully matched but only up to limit.
michael@0:                         return U_PARTIAL_MATCH;
michael@0:                     }
michael@0:                 }
michael@0: 
michael@0:                 if (matchLen == trial.length()) {
michael@0:                     // We have successfully matched the whole string.
michael@0:                     if (matchLen > highWaterLength) {
michael@0:                         highWaterLength = matchLen;
michael@0:                     }
michael@0:                     // In the forward direction we know strings
michael@0:                     // are sorted so we can bail early.
michael@0:                     if (forward && matchLen < highWaterLength) {
michael@0:                         break;
michael@0:                     }
michael@0:                     continue;
michael@0:                 }
michael@0:             }
michael@0: 
michael@0:             // We've checked all strings without a partial match.
michael@0:             // If we have full matches, return the longest one.
michael@0:             if (highWaterLength != 0) {
michael@0:                 offset += forward ? highWaterLength : -highWaterLength;
michael@0:                 return U_MATCH;
michael@0:             }
michael@0:         }
michael@0:         return UnicodeFilter::matches(text, offset, limit, incremental);
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Compares the remainder of s against text.  The caller is expected
michael@0:  * to have matched the first character already; this method never
michael@0:  * examines text.charAt(start).
michael@0:  *
michael@0:  * Forward direction (start < limit): text.charAt(start + i) is
michael@0:  * compared with s.charAt(i) for i = 1, 2, ...
michael@0:  * Backward direction (otherwise): text.charAt(start - i) is compared
michael@0:  * with s.charAt(s.length() - 1 - i) for i = 1, 2, ...
michael@0:  *
michael@0:  * In both directions at most min(|limit - start|, s.length())
michael@0:  * positions are considered.
michael@0:  *
michael@0:  * @param text the text to match against
michael@0:  * @param start offset in text of the already-matched character
michael@0:  * @param limit end of the matchable region: last+1 when going
michael@0:  * forward, last-1 when going backward, where last is the index of
michael@0:  * the last character that may be matched
michael@0:  * @param s the string to match
michael@0:  * @return 0 on the first mismatch; otherwise
michael@0:  * min(|limit - start|, s.length()), i.e. s.length() when all of s
michael@0:  * fit before the limit and |limit - start| when only part of s did
michael@0:  */
michael@0: int32_t UnicodeSet::matchRest(const Replaceable& text,
michael@0:                               int32_t start, int32_t limit,
michael@0:                               const UnicodeString& s) {
michael@0:     const int32_t sLength = s.length();
michael@0:     const UBool forward = start < limit;
michael@0: 
michael@0:     // Number of positions available, capped at the string length.
michael@0:     int32_t span = forward ? (limit - start) : (start - limit);
michael@0:     if (span > sLength) {
michael@0:         span = sLength;
michael@0:     }
michael@0: 
michael@0:     if (forward) {
michael@0:         for (int32_t k = 1; k < span; ++k) {
michael@0:             if (text.charAt(start + k) != s.charAt(k)) {
michael@0:                 return 0;
michael@0:             }
michael@0:         }
michael@0:     } else {
michael@0:         // Walk s from its last character towards its first.
michael@0:         const int32_t lastIndex = sLength - 1;
michael@0:         for (int32_t k = 1; k < span; ++k) {
michael@0:             if (text.charAt(start - k) != s.charAt(lastIndex - k)) {
michael@0:                 return 0;
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     return span;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Implementation of UnicodeMatcher: unions the contents of this set
michael@0:  * into toUnionTo by calling toUnionTo.addAll(*this).
michael@0:  * @param toUnionTo the set that receives the elements of this set
michael@0:  */
michael@0: void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
michael@0:     const UnicodeSet& self = *this;
michael@0:     toUnionTo.addAll(self);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the position of the given code point within this set when
michael@0:  * the set's code points are enumerated in ascending order.  This is
michael@0:  * the inverse of <code>charAt()</code>.
michael@0:  * @param c the code point to look up
michael@0:  * @return an index from 0..size()-1, or -1 if c is outside
michael@0:  * MIN_VALUE..MAX_VALUE or is not contained in this set
michael@0:  */
michael@0: int32_t UnicodeSet::indexOf(UChar32 c) const {
michael@0:     if (!(c >= MIN_VALUE && c <= MAX_VALUE)) {
michael@0:         return -1;
michael@0:     }
michael@0:     // Number of code points in the ranges that lie entirely below c.
michael@0:     int32_t preceding = 0;
michael@0:     // list holds (start, limit) pairs; step through one pair at a time.
michael@0:     for (int32_t pos = 0; ; pos += 2) {
michael@0:         const UChar32 rangeStart = list[pos];
michael@0:         if (c < rangeStart) {
michael@0:             return -1;
michael@0:         }
michael@0:         const UChar32 rangeLimit = list[pos + 1];
michael@0:         if (c < rangeLimit) {
michael@0:             return preceding + c - rangeStart;
michael@0:         }
michael@0:         preceding += rangeLimit - rangeStart;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the code point at the given position within this set when
michael@0:  * the set's code points are enumerated in ascending order.  This is
michael@0:  * the inverse of <code>indexOf()</code>.
michael@0:  * @param index an index from 0..size()-1
michael@0:  * @return the code point at that index, or (UChar32)-1 if the index
michael@0:  * is negative or beyond the last code point of the set
michael@0:  */
michael@0: UChar32 UnicodeSet::charAt(int32_t index) const {
michael@0:     if (index < 0) {
michael@0:         return (UChar32)-1;
michael@0:     }
michael@0:     // Round len down to an even number so that only complete
michael@0:     // (start, limit) pairs are visited; for an odd len the final
michael@0:     // entry is the UNICODESET_HIGH terminator.
michael@0:     const int32_t pairedLen = len & ~1;
michael@0:     int32_t remaining = index;
michael@0:     for (int32_t pos = 0; pos < pairedLen; pos += 2) {
michael@0:         const UChar32 first = list[pos];
michael@0:         const int32_t width = list[pos + 1] - first;
michael@0:         if (remaining < width) {
michael@0:             return (UChar32)(first + remaining);
michael@0:         }
michael@0:         remaining -= width;
michael@0:     }
michael@0:     return (UChar32)-1;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Makes this object represent the range <code>start - end</code>.
michael@0:  * Implemented as clear() followed by complement(start, end), so if
michael@0:  * <code>start > end</code> the set is left empty.
michael@0:  *
michael@0:  * @param start first character in the set, inclusive
michael@0:  * @param end last character in the set, inclusive
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
michael@0:     clear();
michael@0:     // complement() on the emptied set yields exactly [start, end].
michael@0:     return complement(start, end);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Adds the specified range to this set if it is not already
michael@0:  * present.  If this set already contains the specified range,
michael@0:  * the call leaves this set unchanged.  If <code>start > end</code>
michael@0:  * the range is empty and the set is left unchanged.
michael@0:  *
michael@0:  * @param start first character, inclusive, of range to be added
michael@0:  * to this set.
michael@0:  * @param end last character, inclusive, of range to be added
michael@0:  * to this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
michael@0:     const UChar32 first = pinCodePoint(start);
michael@0:     const UChar32 last = pinCodePoint(end);
michael@0:     if (first < last) {
michael@0:         // Multi-character range: hand it to the list-based add as a
michael@0:         // (start, limit) pair followed by the terminator.
michael@0:         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0:         add(range, 2, 0);
michael@0:         return *this;
michael@0:     }
michael@0:     if (start == end) {
michael@0:         // Single code point: use the single-character overload.
michael@0:         add(start);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: // Uncomment to trace UnicodeSet::add(UChar32) on stdout.
michael@0: // #define DEBUG_US_ADD
michael@0: 
michael@0: #ifdef DEBUG_US_ADD
michael@0: #include <stdio.h>
michael@0: // Debug helper: prints c as a single raw character when c <= 0xFF,
michael@0: // otherwise in U+XXXX notation.
michael@0: void dump(UChar32 c) {
michael@0:     if (c > 0xFF) {
michael@0:         printf("U+%04X", c);
michael@0:         return;
michael@0:     }
michael@0:     printf("%c", (char)c);
michael@0: }
michael@0: // Debug helper: prints the first len entries of list as "[a, b, ...]".
michael@0: void dump(const UChar32* list, int32_t len) {
michael@0:     printf("[");
michael@0:     int32_t k = 0;
michael@0:     while (k < len) {
michael@0:         if (k != 0) {
michael@0:             printf(", ");
michael@0:         }
michael@0:         dump(list[k]);
michael@0:         ++k;
michael@0:     }
michael@0:     printf("]");
michael@0: }
michael@0: #endif
michael@0: 
michael@0: /**
michael@0:  * Adds the specified character to this set if it is not already
michael@0:  * present.  If this set already contains the specified character,
michael@0:  * or the set is frozen or bogus, the call leaves this set unchanged.
michael@0:  *
michael@0:  * The inversion list has the shape
michael@0:  *     [start_0, limit_0, start_1, limit_1, ..., HIGH]
michael@0:  * where HIGH is UNICODESET_HIGH and an empty set is just [HIGH].
michael@0:  *
michael@0:  * @param c the character to add
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::add(UChar32 c) {
michael@0:     // pos is the smallest index with c < list[pos].  An odd pos means
michael@0:     // c lies inside a range (already IN the set); an even pos means
michael@0:     // c is OUT of the set and list[pos] is the start of the next
michael@0:     // range (or HIGH).  pos == 0 means c precedes the first range.
michael@0:     int32_t pos = findCodePoint(pinCodePoint(c));
michael@0: 
michael@0:     const UBool alreadyPresent = (pos & 1) != 0;
michael@0:     if (alreadyPresent || isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0: 
michael@0: #ifdef DEBUG_US_ADD
michael@0:     printf("Add of ");
michael@0:     dump(c);
michael@0:     printf(" found at %d", pos);
michael@0:     printf(": ");
michael@0:     dump(list, len);
michael@0:     printf(" => ");
michael@0: #endif
michael@0: 
michael@0:     if (c == list[pos]-1) {
michael@0:         // c sits immediately before the start of the next range:
michael@0:         // extend that range downwards by one.
michael@0:         list[pos] = c;
michael@0:         // If the entry just overwritten was the HIGH terminator,
michael@0:         // append a fresh terminator.
michael@0:         if (c == (UNICODESET_HIGH - 1)) {
michael@0:             UErrorCode status = U_ZERO_ERROR;
michael@0:             ensureCapacity(len+1, status);
michael@0:             if (U_FAILURE(status)) {
michael@0:                 return *this; // There is no way to report this error :-(
michael@0:             }
michael@0:             list[len++] = UNICODESET_HIGH;
michael@0:         }
michael@0:         if (pos > 0 && c == list[pos-1]) {
michael@0:             // The previous range now ends exactly where this one
michael@0:             // begins, so the two ranges merge:
michael@0:             //
michael@0:             //   [..., start_k-1, c, c, limit_k, ..., HIGH]
michael@0:             //                       ^
michael@0:             //                       list[pos]
michael@0:             //
michael@0:             // Drop the duplicated pair by shifting the tail down
michael@0:             // two slots.
michael@0:             for (int32_t k = pos-1; k < len-2; ++k) {
michael@0:                 list[k] = list[k+2];
michael@0:             }
michael@0:             len -= 2;
michael@0:         }
michael@0:     } else if (pos > 0 && c == list[pos-1]) {
michael@0:         // c sits immediately after the end of the previous range:
michael@0:         // extend that range upwards by one.  No merge check is
michael@0:         // needed on this path.
michael@0:         list[pos-1] += 1;
michael@0:     } else {
michael@0:         // c is not adjacent to any existing range, and it is not
michael@0:         // 10FFFF.  Open a two-slot gap at pos and store the new
michael@0:         // single-character range (c, c+1) there:
michael@0:         //
michael@0:         //   before: [..., limit_k-1, start_k, limit_k, ..., HIGH]
michael@0:         //   after:  [..., limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
michael@0:         UErrorCode status = U_ZERO_ERROR;
michael@0:         ensureCapacity(len+2, status);
michael@0:         if (U_FAILURE(status)) {
michael@0:             return *this; // There is no way to report this error :-(
michael@0:         }
michael@0: 
michael@0:         // Shift the tail up two slots, working from the end.
michael@0:         for (int32_t k = len-1; k >= pos; --k) {
michael@0:             list[k+2] = list[k];
michael@0:         }
michael@0: 
michael@0:         list[pos] = c;
michael@0:         list[pos+1] = c+1;
michael@0:         len += 2;
michael@0:     }
michael@0: 
michael@0: #ifdef DEBUG_US_ADD
michael@0:     dump(list, len);
michael@0:     printf("\n");
michael@0: 
michael@0:     // Sanity check: the list must be strictly ascending.
michael@0:     for (int32_t k = 1; k < len; ++k) {
michael@0:         if (list[k] <= list[k-1]) {
michael@0:             // Corrupt array!
michael@0:             printf("ERROR: list has been corrupted\n");
michael@0:             exit(1);
michael@0:         }
michael@0:     }
michael@0: #endif
michael@0: 
michael@0:     releasePattern();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Adds the specified multicharacter string to this set if it is not
michael@0:  * already present.  If this set already contains the string, the
michael@0:  * call leaves this set unchanged.
michael@0:  * Thus "ch" => {"ch"}
michael@0:  * A string that getSingleCP() reports as a single code point is
michael@0:  * added as that code point instead.
michael@0:  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0:  * An empty string is ignored, as is any call on a frozen or bogus set.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
michael@0:     if (s.length() == 0 || isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const int32_t cp = getSingleCP(s);
michael@0:     if (cp >= 0) {
michael@0:         add((UChar32)cp);
michael@0:         return *this;
michael@0:     }
michael@0:     // Genuine multicharacter string: store it unless it is already there.
michael@0:     if (!strings->contains((void*) &s)) {
michael@0:         _add(s);
michael@0:         releasePattern();
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Adds the given string, in order, to 'strings'.  The given string
michael@0:  * must have been checked by the caller to not be empty and to not
michael@0:  * already be in 'strings'.
michael@0:  */
michael@0: void UnicodeSet::_add(const UnicodeString& s) {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return;
michael@0:     }
michael@0:     UnicodeString* t = new UnicodeString(s);
michael@0:     if (t == NULL) { // Check for memory allocation error.
michael@0:         setToBogus();
michael@0:         return;
michael@0:     }
michael@0:     UErrorCode ec = U_ZERO_ERROR;
michael@0:     strings->sortedInsert(t, compareUnicodeString, ec);
michael@0:     if (U_FAILURE(ec)) {
michael@0:         setToBogus();
michael@0:         delete t;
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Determines whether the string consists of exactly one code point.
michael@0:  * @param s the string to test
michael@0:  * @return s.charAt(0) for a string of length 1; for a string of
michael@0:  * length 2 whose char32At(0) is above 0xFFFF (a surrogate pair), that
michael@0:  * supplementary code point; otherwise -1.
michael@0:  */
michael@0: int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
michael@0:     // Zero-length strings are not rejected here; callers filter them.
michael@0:     const int32_t units = s.length();
michael@0:     if (units == 1) {
michael@0:         return s.charAt(0);
michael@0:     }
michael@0:     if (units <= 2) {
michael@0:         // Only a surrogate pair makes the leading code point exceed
michael@0:         // the BMP.
michael@0:         const UChar32 cp = s.char32At(0);
michael@0:         if (cp > 0xFFFF) {
michael@0:             return cp;
michael@0:         }
michael@0:     }
michael@0:     return -1;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Adds each of the code points in the string to this set.
michael@0:  * Thus "ch" => {"c", "h"}
michael@0:  * Code points this set already contains are left as they are.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
michael@0:     int32_t offset = 0;
michael@0:     while (offset < s.length()) {
michael@0:         const UChar32 cp = s.char32At(offset);
michael@0:         add(cp);
michael@0:         // Advance by one or two UTF-16 units depending on cp.
michael@0:         offset += U16_LENGTH(cp);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Retains EACH of the characters in the string. Note: "ch" == {"c", "h"}
michael@0:  * Builds a temporary set from the string's code points and applies
michael@0:  * retainAll(const UnicodeSet&) with it.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
michael@0:     UnicodeSet chars;
michael@0:     chars.addAll(s);
michael@0:     return retainAll(chars);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Complements EACH of the characters in the string. Note: "ch" == {"c", "h"}
michael@0:  * Builds a temporary set from the string's code points and applies
michael@0:  * complementAll(const UnicodeSet&) with it.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
michael@0:     UnicodeSet chars;
michael@0:     chars.addAll(s);
michael@0:     return complementAll(chars);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes EACH of the characters in the string. Note: "ch" == {"c", "h"}
michael@0:  * Builds a temporary set from the string's code points and applies
michael@0:  * removeAll(const UnicodeSet&) with it.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
michael@0:     UnicodeSet chars;
michael@0:     chars.addAll(s);
michael@0:     return removeAll(chars);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes all multicharacter strings from this set; the code points
michael@0:  * held in the inversion list are not touched.
michael@0:  * Like the other mutators, this is a no-op on a frozen set.  It is
michael@0:  * also a no-op when the strings vector is missing (clear() guards
michael@0:  * against the same condition) or already empty.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::removeAllStrings() {
michael@0:     if (isFrozen() || strings == NULL) {
michael@0:         return *this;
michael@0:     }
michael@0:     if (strings->size() > 0) {
michael@0:         strings->removeAllElements();
michael@0:         // The set's contents changed, so release the pattern as the
michael@0:         // other mutators do.
michael@0:         releasePattern();
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
michael@0:  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0:  * @param the source string
michael@0:  * @return a newly created set containing the given string
michael@0:  */
michael@0: UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
michael@0:     UnicodeSet *set = new UnicodeSet();
michael@0:     if (set != NULL) { // Check for memory allocation error.
michael@0:         set->add(s);
michael@0:     }
michael@0:     return set;
michael@0: }
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
michael@0:  * @param the source string
michael@0:  * @return a newly created set containing the given characters
michael@0:  */
michael@0: UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
michael@0:     UnicodeSet *set = new UnicodeSet();
michael@0:     if (set != NULL) { // Check for memory allocation error.
michael@0:         set->addAll(s);
michael@0:     }
michael@0:     return set;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Retains only the elements in this set that are contained in the
michael@0:  * specified range.  If <code>start > end</code> the range is empty,
michael@0:  * so clear() is called and the set is left empty.
michael@0:  *
michael@0:  * @param start first character, inclusive, of range to be retained
michael@0:  * in this set.
michael@0:  * @param end last character, inclusive, of range to be retained
michael@0:  * in this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
michael@0:     if (pinCodePoint(start) > pinCodePoint(end)) {
michael@0:         clear();
michael@0:         return *this;
michael@0:     }
michael@0:     // Intersect with the one-range list (start, end+1, terminator).
michael@0:     UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0:     retain(range, 2, 0);
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Retains only the specified character, by delegating to
michael@0:  * retain(c, c).
michael@0:  * @param c the character to retain
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::retain(UChar32 c) {
michael@0:     retain(c, c);
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes the specified range from this set if it is present.
michael@0:  * The set will not contain the specified range once the call
michael@0:  * returns.  If <code>start > end</code> the range is empty and the
michael@0:  * set is left unchanged.
michael@0:  *
michael@0:  * @param start first character, inclusive, of range to be removed
michael@0:  * from this set.
michael@0:  * @param end last character, inclusive, of range to be removed
michael@0:  * from this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
michael@0:     if (pinCodePoint(start) > pinCodePoint(end)) {
michael@0:         return *this;
michael@0:     }
michael@0:     // Delegates to the list-based retain() with polarity argument 2
michael@0:     // (the range form of retain() above passes 0).
michael@0:     UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0:     retain(range, 2, 2);
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes the specified character from this set if it is present,
michael@0:  * by delegating to remove(c, c).  The set will not contain the
michael@0:  * specified character once the call returns.
michael@0:  * @param c the character to remove
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::remove(UChar32 c) {
michael@0:     remove(c, c);
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes the specified string from this set if it is present.
michael@0:  * The set will not contain the specified string once the call
michael@0:  * returns.  A string that getSingleCP() reports as a single code
michael@0:  * point is removed as that code point.  Empty strings are ignored,
michael@0:  * as is any call on a frozen or bogus set.
michael@0:  * @param s the source string
michael@0:  * @return the modified set, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
michael@0:     if (s.length() == 0 || isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const int32_t cp = getSingleCP(s);
michael@0:     if (cp >= 0) {
michael@0:         remove((UChar32)cp, (UChar32)cp);
michael@0:         return *this;
michael@0:     }
michael@0:     strings->removeElement((void*) &s);
michael@0:     releasePattern();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Complements the specified range in this set.  Any character in
michael@0:  * the range will be removed if it is in this set, or will be
michael@0:  * added if it is not in this set.  If <code>start > end</code>
michael@0:  * the range is empty and nothing is xor'ed.  Does nothing on a
michael@0:  * frozen or bogus set.
michael@0:  *
michael@0:  * @param start first character, inclusive, of range to be complemented
michael@0:  * in this set.
michael@0:  * @param end last character, inclusive, of range to be complemented
michael@0:  * in this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const UBool nonEmptyRange = pinCodePoint(start) <= pinCodePoint(end);
michael@0:     if (nonEmptyRange) {
michael@0:         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
michael@0:         exclusiveOr(range, 2, 0);
michael@0:     }
michael@0:     // The pattern is released even when the range was empty.
michael@0:     releasePattern();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Complements the specified character in this set, by delegating to
michael@0:  * complement(c, c).
michael@0:  * @param c the character to complement
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complement(UChar32 c) {
michael@0:     complement(c, c);
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Inverts the whole set.  This is equivalent to
michael@0:  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
michael@0:  * The inverted list is built in the scratch buffer, either by
michael@0:  * dropping a leading UNICODESET_LOW entry or by prepending one, and
michael@0:  * then swapped in.  Does nothing on a frozen or bogus set, or if the
michael@0:  * buffer cannot be grown.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complement(void) {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const UBool startsAtLow = (list[0] == UNICODESET_LOW);
michael@0:     const int32_t newLen = startsAtLow ? (len-1) : (len+1);
michael@0: 
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     ensureBufferCapacity(newLen, status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return *this;
michael@0:     }
michael@0:     if (startsAtLow) {
michael@0:         // Drop the leading LOW entry.
michael@0:         uprv_memcpy(buffer, list + 1, newLen*sizeof(UChar32));
michael@0:     } else {
michael@0:         // Prepend a LOW entry in front of the existing list.
michael@0:         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
michael@0:         buffer[0] = UNICODESET_LOW;
michael@0:     }
michael@0:     len = newLen;
michael@0:     swapBuffers();
michael@0:     releasePattern();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Complements the specified string in this set: the string is
michael@0:  * removed if this set contains it, and added otherwise.  A string
michael@0:  * that getSingleCP() reports as a single code point is complemented
michael@0:  * as that code point.  Empty strings are ignored, as is any call on
michael@0:  * a frozen or bogus set.
michael@0:  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
michael@0:  * @param s the string to complement
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
michael@0:     if (s.length() == 0 || isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const int32_t cp = getSingleCP(s);
michael@0:     if (cp >= 0) {
michael@0:         complement((UChar32)cp, (UChar32)cp);
michael@0:         return *this;
michael@0:     }
michael@0:     // Toggle membership of the multicharacter string.
michael@0:     if (strings->contains((void*) &s)) {
michael@0:         strings->removeElement((void*) &s);
michael@0:     } else {
michael@0:         _add(s);
michael@0:     }
michael@0:     releasePattern();
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Adds all of the elements in the specified set to this set if
michael@0:  * they're not already present.  This operation effectively
michael@0:  * modifies this set so that its value is the <i>union</i> of the two
michael@0:  * sets.  The behavior of this operation is unspecified if the specified
michael@0:  * set is modified while the operation is in progress.
michael@0:  *
michael@0:  * @param c set whose elements are to be added to this set.
michael@0:  * @return this object, for chaining
michael@0:  * @see #add(UChar32, UChar32)
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
michael@0:     // Code points: merge the other inversion list, if it has one.
michael@0:     if ( c.len>0 && c.list!=NULL ) {
michael@0:         add(c.list, c.len, 0);
michael@0:     }
michael@0: 
michael@0:     if ( c.strings==NULL ) {
michael@0:         return *this;
michael@0:     }
michael@0:     // Strings: visit them in order and copy the ones we lack.
michael@0:     for (int32_t k=0; k<c.strings->size(); ++k) {
michael@0:         const UnicodeString* str = (const UnicodeString*)c.strings->elementAt(k);
michael@0:         if (strings->contains((void*) str)) {
michael@0:             continue;
michael@0:         }
michael@0:         _add(*str);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Retains only the elements in this set that are contained in the
michael@0:  * specified set.  In other words, removes from this set all of
michael@0:  * its elements that are not contained in the specified set.  This
michael@0:  * operation effectively modifies this set so that its value is
michael@0:  * the <i>intersection</i> of the two sets.  Does nothing on a
michael@0:  * frozen or bogus set.
michael@0:  *
michael@0:  * @param c set that defines which elements this set will retain.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
michael@0:     if (!isFrozen() && !isBogus()) {
michael@0:         // Code points first, then the multicharacter strings.
michael@0:         retain(c.list, c.len, 0);
michael@0:         strings->retainAll(*c.strings);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes from this set all of its elements that are contained in the
michael@0:  * specified set.  This operation effectively modifies this
michael@0:  * set so that its value is the <i>asymmetric set difference</i> of
michael@0:  * the two sets.  Does nothing on a frozen or bogus set.
michael@0:  *
michael@0:  * @param c set that defines which elements will be removed from
michael@0:  *          this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
michael@0:     if (!isFrozen() && !isBogus()) {
michael@0:         // Code points first, then the multicharacter strings.
michael@0:         retain(c.list, c.len, 2);
michael@0:         strings->removeAll(*c.strings);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Complements in this set all elements contained in the specified
michael@0:  * set.  Any element of the other set will be removed if it is
michael@0:  * in this set, or will be added if it is not in this set.  Does
michael@0:  * nothing on a frozen or bogus set.
michael@0:  *
michael@0:  * @param c set that defines which elements will be xor'ed into
michael@0:  *          this set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     exclusiveOr(c.list, c.len, 0);
michael@0: 
michael@0:     // Toggle each string of c: a successful removal means we had it;
michael@0:     // otherwise it has to be added.
michael@0:     for (int32_t k=0; k<c.strings->size(); ++k) {
michael@0:         void* element = c.strings->elementAt(k);
michael@0:         if (strings->removeElement(element)) {
michael@0:             continue;
michael@0:         }
michael@0:         _add(*(const UnicodeString*)element);
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Removes all of the elements from this set.  This set will be
michael@0:  * empty after this call returns.  A frozen set is left untouched.
michael@0:  * When both the list and the strings vector are present, fFlags is
michael@0:  * reset to 0, which also removes the bogus state.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::clear(void) {
michael@0:     if (isFrozen()) {
michael@0:         return *this;
michael@0:     }
michael@0:     const UBool haveList = (list != NULL);
michael@0:     const UBool haveStrings = (strings != NULL);
michael@0: 
michael@0:     // An empty inversion list consists of the terminator alone.
michael@0:     if (haveList) {
michael@0:         list[0] = UNICODESET_HIGH;
michael@0:     }
michael@0:     len = 1;
michael@0:     releasePattern();
michael@0:     if (haveStrings) {
michael@0:         strings->removeAllElements();
michael@0:     }
michael@0:     if (haveList && haveStrings) {
michael@0:         // Remove bogus
michael@0:         fFlags = 0;
michael@0:     }
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Iteration method that returns the number of ranges contained in
michael@0:  * this set, computed as half the inversion list length (rounded
michael@0:  * down).
michael@0:  * @see #getRangeStart
michael@0:  * @see #getRangeEnd
michael@0:  */
michael@0: int32_t UnicodeSet::getRangeCount() const {
michael@0:     const int32_t pairs = len / 2;
michael@0:     return pairs;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Iteration method that returns the first character in the
michael@0:  * specified range of this set.  The index is not validated here.
michael@0:  * @param index the range index, 0..getRangeCount()-1
michael@0:  * @see #getRangeCount
michael@0:  * @see #getRangeEnd
michael@0:  */
michael@0: UChar32 UnicodeSet::getRangeStart(int32_t index) const {
michael@0:     const int32_t startSlot = index*2;
michael@0:     return list[startSlot];
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Iteration method that returns the last character in the
michael@0:  * specified range of this set.  The list stores an exclusive limit,
michael@0:  * so the inclusive end is one less.  The index is not validated here.
michael@0:  * @param index the range index, 0..getRangeCount()-1
michael@0:  * @see #getRangeCount
michael@0:  * @see #getRangeStart
michael@0:  */
michael@0: UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
michael@0:     const int32_t limitSlot = index*2 + 1;
michael@0:     return list[limitSlot] - 1;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the number of multicharacter strings stored in this set.
michael@0:  * @see #getString
michael@0:  */
michael@0: int32_t UnicodeSet::getStringCount() const {
michael@0:     const int32_t count = strings->size();
michael@0:     return count;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Returns the multicharacter string stored at the given position of
michael@0:  * the strings vector.
michael@0:  * @param index the string index, 0..getStringCount()-1
michael@0:  * @see #getStringCount
michael@0:  */
michael@0: const UnicodeString* UnicodeSet::getString(int32_t index) const {
michael@0:     void* element = strings->elementAt(index);
michael@0:     return (const UnicodeString*) element;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Reallocates this object's internal structures to take up the least
michael@0:  * possible space, without changing this object's value.  Does
michael@0:  * nothing on a frozen or bogus set.
michael@0:  * @return this object, for chaining
michael@0:  */
michael@0: UnicodeSet& UnicodeSet::compact() {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return *this;
michael@0:     }
michael@0:     // Release the scratch buffer before shrinking the list, to
michael@0:     // fragment memory less.
michael@0:     if (buffer != NULL) {
michael@0:         uprv_free(buffer);
michael@0:         buffer = NULL;
michael@0:     }
michael@0:     if (len >= capacity) {
michael@0:         return *this;
michael@0:     }
michael@0:     // Shrink to exactly len entries, but never request a zero-sized
michael@0:     // reallocation.
michael@0:     const int32_t newCapacity = (len == 0) ? 1 : len;
michael@0:     UChar32* shrunk = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
michael@0:     if (shrunk) {
michael@0:         list = shrunk;
michael@0:         capacity = newCapacity;
michael@0:     }
michael@0:     // If even the smaller reallocation failed, keep the original array.
michael@0:     return *this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Writes the inversion list (without its final UNICODESET_HIGH
michael@0:  * entry) to dest as 16-bit units.  Strings are not written.
michael@0:  *
michael@0:  * Layout: the first unit holds the number of array units (15 bits).
michael@0:  * If any value is above 0xffff, bit 15 of that unit is set and a
michael@0:  * second unit holds the number of BMP values.  BMP values take one
michael@0:  * unit each; every remaining value takes two (high half, low half).
michael@0:  *
michael@0:  * @param dest output buffer; may be NULL only if destCapacity is 0
michael@0:  * @param destCapacity capacity of dest in 16-bit units
michael@0:  * @param ec in/out error code.  Set to U_ILLEGAL_ARGUMENT_ERROR for
michael@0:  * bad arguments, U_INDEX_OUTOFBOUNDS_ERROR if the array needs more
michael@0:  * than 0x7fff units, and U_BUFFER_OVERFLOW_ERROR if dest is too small.
michael@0:  * @return the total number of units required (also on overflow), or
michael@0:  * 0 if ec already held a failure or one of the first two errors
michael@0:  * occurred
michael@0:  */
michael@0: int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
michael@0:     if (U_FAILURE(ec)) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
michael@0:         ec=U_ILLEGAL_ARGUMENT_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     /* number of list entries, ignoring the final UNICODESET_HIGH */
michael@0:     int32_t length=this->len-1;
michael@0:     if (length==0) {
michael@0:         /* empty set: a single zero unit */
michael@0:         if (destCapacity>0) {
michael@0:             *dest=0;
michael@0:         } else {
michael@0:             ec=U_BUFFER_OVERFLOW_ERROR;
michael@0:         }
michael@0:         return 1;
michael@0:     }
michael@0: 
michael@0:     /* split into BMP entries (one unit each) and the rest (two units each) */
michael@0:     int32_t bmpLength;
michael@0:     if (this->list[length-1]<=0xffff) {
michael@0:         /* all BMP */
michael@0:         bmpLength=length;
michael@0:     } else if (this->list[0]>=0x10000) {
michael@0:         /* all supplementary */
michael@0:         bmpLength=0;
michael@0:         length*=2;
michael@0:     } else {
michael@0:         /* some BMP, some supplementary */
michael@0:         bmpLength=0;
michael@0:         while (bmpLength<length && this->list[bmpLength]<=0xffff) {
michael@0:             ++bmpLength;
michael@0:         }
michael@0:         length=bmpLength+2*(length-bmpLength);
michael@0:     }
michael@0: 
michael@0:     /* length is now the number of 16-bit array units */
michael@0:     if (length>0x7fff) {
michael@0:         /* there are only 15 bits for the length in the first serialized word */
michael@0:         ec=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     /*
michael@0:      * total serialized length:
michael@0:      * the array units, plus the length unit (always), plus the
michael@0:      * bmpLength unit (only if there are supplementary values)
michael@0:      */
michael@0:     const UBool hasSupplementary=length>bmpLength;
michael@0:     const int32_t destLength=length+(hasSupplementary?2:1);
michael@0:     if (destLength>destCapacity) {
michael@0:         ec=U_BUFFER_OVERFLOW_ERROR;
michael@0:         return destLength;
michael@0:     }
michael@0: 
michael@0:     /* header */
michael@0:     uint16_t *out=dest;
michael@0:     if (hasSupplementary) {
michael@0:         *out++=(uint16_t)(length|0x8000);
michael@0:         *out++=(uint16_t)bmpLength;
michael@0:     } else {
michael@0:         *out++=(uint16_t)length;
michael@0:     }
michael@0: 
michael@0:     /* write the BMP part of the array */
michael@0:     int32_t src=0;
michael@0:     while (src<bmpLength) {
michael@0:         *out++=(uint16_t)this->list[src];
michael@0:         ++src;
michael@0:     }
michael@0: 
michael@0:     /* write the supplementary part of the array, two units per value */
michael@0:     for (int32_t units=bmpLength; units<length; units+=2, ++src) {
michael@0:         const UChar32 value=this->list[src];
michael@0:         *out++=(uint16_t)(value>>16);
michael@0:         *out++=(uint16_t)value;
michael@0:     }
michael@0:     return destLength;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------
michael@0: // Implementation: Utility methods
michael@0: //----------------------------------------------------------------
michael@0: 
michael@0: /**
michael@0:  * Allocate our strings vector and return TRUE if successful.
michael@0:  */
michael@0: UBool UnicodeSet::allocateStrings(UErrorCode &status) {
michael@0:     if (U_FAILURE(status)) {
michael@0:         return FALSE;
michael@0:     }
michael@0:     strings = new UVector(uprv_deleteUObject,
michael@0:                           uhash_compareUnicodeString, 1, status);
michael@0:     if (strings == NULL) { // Check for memory allocation error.
michael@0:         status = U_MEMORY_ALLOCATION_ERROR;
michael@0:         return FALSE;
michael@0:     }
michael@0:     if (U_FAILURE(status)) {
michael@0:         delete strings;
michael@0:         strings = NULL;
michael@0:         return FALSE;
michael@0:     } 
michael@0:     return TRUE;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Makes sure that list has room for at least newLen entries, growing the
michael@0:  * allocation to newLen + GROW_EXTRA entries when it is too small.
michael@0:  *
michael@0:  * @param newLen Required number of entries.
michael@0:  * @param ec     Set to U_MEMORY_ALLOCATION_ERROR if the array cannot be
michael@0:  *               grown. In that case the set is also marked bogus via
michael@0:  *               setToBogus(), while list and capacity keep referring to
michael@0:  *               the old allocation.
michael@0:  */
michael@0: void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
michael@0:     if (newLen > capacity) {
michael@0:         const int32_t grownCapacity = newLen + GROW_EXTRA;
michael@0:         UChar32* grown = (UChar32*) uprv_realloc(list, sizeof(UChar32) * grownCapacity);
michael@0:         if (grown != NULL) {
michael@0:             list = grown;
michael@0:             capacity = grownCapacity;
michael@0:         } else {
michael@0:             ec = U_MEMORY_ALLOCATION_ERROR;
michael@0:             setToBogus();
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Makes sure that buffer exists and has room for at least newLen entries,
michael@0:  * (re)allocating it to newLen + GROW_EXTRA entries when necessary.
michael@0:  *
michael@0:  * @param newLen Required number of entries.
michael@0:  * @param ec     Set to U_MEMORY_ALLOCATION_ERROR if the allocation fails.
michael@0:  *               In that case the set is also marked bogus via setToBogus(),
michael@0:  *               while buffer and bufferCapacity are left as they were.
michael@0:  */
michael@0: void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
michael@0:     if (buffer == NULL || newLen > bufferCapacity) {
michael@0:         const int32_t grownCapacity = newLen + GROW_EXTRA;
michael@0:         UChar32* grown = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * grownCapacity);
michael@0:         if (grown != NULL) {
michael@0:             buffer = grown;
michael@0:             bufferCapacity = grownCapacity;
michael@0:         } else {
michael@0:             ec = U_MEMORY_ALLOCATION_ERROR;
michael@0:             setToBogus();
michael@0:         }
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Exchanges list with buffer, and capacity with bufferCapacity, so that
michael@0:  * the array that was the scratch buffer becomes the set's list.
michael@0:  * len is not touched here.
michael@0:  */
michael@0: void UnicodeSet::swapBuffers(void) {
michael@0:     UChar32* const previousList = list;
michael@0:     const int32_t previousCapacity = capacity;
michael@0: 
michael@0:     list = buffer;
michael@0:     capacity = bufferCapacity;
michael@0: 
michael@0:     buffer = previousList;
michael@0:     bufferCapacity = previousCapacity;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Empties this set and then flags it as bogus.
michael@0:  */
michael@0: void UnicodeSet::setToBogus() {
michael@0:     // Keep this order: the flag is written after clear() has run.
michael@0:     // NOTE(review): presumably clear() may reset fFlags - confirm.
michael@0:     this->clear();
michael@0:     this->fFlags = kIsBogus;
michael@0: }
michael@0: 
michael@0: //----------------------------------------------------------------
michael@0: // Implementation: Fundamental operators
michael@0: //----------------------------------------------------------------
michael@0: 
michael@0: /** Returns the larger of a and b (b when they are equal). */
michael@0: static inline UChar32 max(UChar32 a, UChar32 b) {
michael@0:     if (a > b) {
michael@0:         return a;
michael@0:     }
michael@0:     return b;
michael@0: }
michael@0: 
michael@0: // Replaces this set's list with the symmetric difference (xor) of the list
michael@0: // and the inversion list other[0..otherLen). Both inputs are read up to
michael@0: // their UNICODESET_HIGH terminator.
michael@0: //
michael@0: // polarity = 0, 3 is normal: x xor y
michael@0: // polarity = 1, 2: x xor ~y, which is the same as ~(x xor y)
michael@0: //
michael@0: // Nothing happens if the set is frozen or bogus, or if other is NULL
michael@0: // (same guard as add()). If the merge buffer cannot be grown,
michael@0: // ensureBufferCapacity() marks the set bogus and no merge is performed.
michael@0: 
michael@0: void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0:     if (isFrozen() || isBogus() || other==NULL) {
michael@0:         return;
michael@0:     }
michael@0:     UErrorCode status = U_ZERO_ERROR;
michael@0:     ensureBufferCapacity(len + otherLen, status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     // i, j, k index list, other and buffer; a and b hold the current
michael@0:     // boundary value taken from list and other respectively.
michael@0:     int32_t i = 0, j = 0, k = 0;
michael@0:     UChar32 a = list[i++];
michael@0:     UChar32 b;
michael@0:     if (polarity == 1 || polarity == 2) {
michael@0:         // Complementing other toggles a boundary at UNICODESET_LOW:
michael@0:         // prepend LOW, or drop it if other already starts with it.
michael@0:         b = UNICODESET_LOW;
michael@0:         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
michael@0:             ++j;
michael@0:             // Consume other[1] here. Without advancing j the loop below
michael@0:             // would fetch the same element a second time and emit it twice.
michael@0:             b = other[j++];
michael@0:         }
michael@0:     } else {
michael@0:         b = other[j++];
michael@0:     }
michael@0:     // simplest of all the routines
michael@0:     // sort the values, discarding identicals!
michael@0:     for (;;) {
michael@0:         if (a < b) {
michael@0:             buffer[k++] = a;
michael@0:             a = list[i++];
michael@0:         } else if (b < a) {
michael@0:             buffer[k++] = b;
michael@0:             b = other[j++];
michael@0:         } else if (a != UNICODESET_HIGH) { // at this point, a == b
michael@0:             // discard both values!
michael@0:             a = list[i++];
michael@0:             b = other[j++];
michael@0:         } else { // DONE!
michael@0:             buffer[k++] = UNICODESET_HIGH;
michael@0:             len = k;
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     swapBuffers();    // the merged buffer becomes list
michael@0:     releasePattern(); // the cached pattern no longer describes the set
michael@0: }
michael@0: 
michael@0: // Unions this set's list (x) with the inversion list other[0..otherLen) (y); the merged result replaces list.
michael@0: // polarity = 0 is normal: x union y;   polarity = 2: x union ~y;
michael@0: // polarity = 1: ~x union y;   polarity = 3: ~x union ~y.
michael@0: // Both inputs are read up to their UNICODESET_HIGH terminator. Nothing happens if the set is frozen or bogus, or if other is NULL.
michael@0: // If the merge buffer cannot be grown, ensureBufferCapacity() marks the set bogus and no merge is performed.
michael@0: void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0:     if (isFrozen() || isBogus() || other==NULL) {
michael@0:         return;
michael@0:     }
michael@0:     UErrorCode status = U_ZERO_ERROR; // local only; it is not passed back to the caller
michael@0:     ensureBufferCapacity(len + otherLen, status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0: // i, j, k index list, other and buffer; a and b hold the current boundary value taken from list and other.
michael@0:     int32_t i = 0, j = 0, k = 0;
michael@0:     UChar32 a = list[i++];
michael@0:     UChar32 b = other[j++];
michael@0:     // change from xor is that we have to check overlapping pairs
michael@0:     // polarity bit 1 means a is second, bit 2 means b is. ("second" = the second value of a pair; the bit is toggled each time a or b advances.)
michael@0:     for (;;) {
michael@0:         switch (polarity) {
michael@0:           case 0: // both first; take lower if unequal
michael@0:             if (a < b) { // take a
michael@0:                 // Back up over overlapping ranges in buffer[]
michael@0:                 if (k > 0 && a <= buffer[k-1]) {
michael@0:                     // Pick latter end value in buffer[] vs. list[]
michael@0:                     a = max(list[i], buffer[--k]);
michael@0:                 } else {
michael@0:                     // No overlap
michael@0:                     buffer[k++] = a;
michael@0:                     a = list[i];
michael@0:                 }
michael@0:                 i++; // Common if/else code factored out
michael@0:                 polarity ^= 1;
michael@0:             } else if (b < a) { // take b
michael@0:                 if (k > 0 && b <= buffer[k-1]) {
michael@0:                     b = max(other[j], buffer[--k]);
michael@0:                 } else {
michael@0:                     buffer[k++] = b;
michael@0:                     b = other[j];
michael@0:                 }
michael@0:                 j++;
michael@0:                 polarity ^= 2;
michael@0:             } else { // a == b, take a, drop b
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 // This is symmetrical; it doesn't matter if
michael@0:                 // we backtrack with a or b. - liu
michael@0:                 if (k > 0 && a <= buffer[k-1]) {
michael@0:                     a = max(list[i], buffer[--k]);
michael@0:                 } else {
michael@0:                     // No overlap
michael@0:                     buffer[k++] = a;
michael@0:                     a = list[i];
michael@0:                 }
michael@0:                 i++;
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:           case 3: // both second; take higher if unequal, and drop other
michael@0:             if (b <= a) { // take a
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 buffer[k++] = a;
michael@0:             } else { // take b
michael@0:                 if (b == UNICODESET_HIGH) goto loop_end;
michael@0:                 buffer[k++] = b;
michael@0:             }
michael@0:             a = list[i++];
michael@0:             polarity ^= 1;   // factored common code
michael@0:             b = other[j++];
michael@0:             polarity ^= 2;
michael@0:             break;
michael@0:           case 1: // a second, b first; if b < a, overlap
michael@0:             if (a < b) { // no overlap, take a
michael@0:                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
michael@0:             } else if (b < a) { // OVERLAP, drop b
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else { // a == b, drop both!
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:           case 2: // a first, b second; if a < b, overlap
michael@0:             if (b < a) { // no overlap, take b
michael@0:                 buffer[k++] = b;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else  if (a < b) { // OVERLAP, drop a
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:             } else { // a == b, drop both!
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:  loop_end:
michael@0:     buffer[k++] = UNICODESET_HIGH;    // terminate
michael@0:     len = k;
michael@0:     swapBuffers(); // the merged buffer becomes list
michael@0:     releasePattern(); // the cached pattern no longer describes the set
michael@0: }
michael@0: 
michael@0: // Intersects this set's list (x) with the inversion list other[0..otherLen) (y); the merged result replaces list.
michael@0: // polarity = 0 is normal: x intersect y;   polarity = 2: x intersect ~y == set-minus;
michael@0: // polarity = 1: ~x intersect y;   polarity = 3: ~x intersect ~y.
michael@0: // Both inputs are read up to their UNICODESET_HIGH terminator. Nothing happens if the set is frozen or bogus; other is not checked for NULL here.
michael@0: // If the merge buffer cannot be grown, ensureBufferCapacity() marks the set bogus and no merge is performed.
michael@0: void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
michael@0:     if (isFrozen() || isBogus()) {
michael@0:         return;
michael@0:     }
michael@0:     UErrorCode status = U_ZERO_ERROR; // local only; it is not passed back to the caller
michael@0:     ensureBufferCapacity(len + otherLen, status);
michael@0:     if (U_FAILURE(status)) {
michael@0:         return;
michael@0:     }
michael@0: // i, j, k index list, other and buffer; a and b hold the current boundary value taken from list and other.
michael@0:     int32_t i = 0, j = 0, k = 0;
michael@0:     UChar32 a = list[i++];
michael@0:     UChar32 b = other[j++];
michael@0:     // change from xor is that we have to check overlapping pairs
michael@0:     // polarity bit 1 means a is second, bit 2 means b is. ("second" = the second value of a pair; the bit is toggled each time a or b advances.)
michael@0:     for (;;) {
michael@0:         switch (polarity) {
michael@0:           case 0: // both first; drop the smaller
michael@0:             if (a < b) { // drop a
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:             } else if (b < a) { // drop b
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else { // a == b, take one, drop other
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 buffer[k++] = a;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:           case 3: // both second; take lower if unequal
michael@0:             if (a < b) { // take a
michael@0:                 buffer[k++] = a;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:             } else if (b < a) { // take b
michael@0:                 buffer[k++] = b;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else { // a == b, take one, drop other
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 buffer[k++] = a;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:           case 1: // a second, b first;
michael@0:             if (a < b) { // NO OVERLAP, drop a
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:             } else if (b < a) { // OVERLAP, take b
michael@0:                 buffer[k++] = b;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else { // a == b, drop both!
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:           case 2: // a first, b second; if a < b, overlap
michael@0:             if (b < a) { // no overlap, drop b
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             } else  if (a < b) { // OVERLAP, take a
michael@0:                 buffer[k++] = a;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:             } else { // a == b, drop both!
michael@0:                 if (a == UNICODESET_HIGH) goto loop_end;
michael@0:                 a = list[i++];
michael@0:                 polarity ^= 1;
michael@0:                 b = other[j++];
michael@0:                 polarity ^= 2;
michael@0:             }
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:  loop_end:
michael@0:     buffer[k++] = UNICODESET_HIGH;    // terminate
michael@0:     len = k;
michael@0:     swapBuffers(); // the merged buffer becomes list
michael@0:     releasePattern(); // the cached pattern no longer describes the set
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Appends the pattern form of every code point of s to buf, by passing
michael@0:  * each code point to the single-character _appendToPat() overload.
michael@0:  *
michael@0:  * @param buf               Receives the output.
michael@0:  * @param s                 String to append, read one code point at a time.
michael@0:  * @param escapeUnprintable Forwarded unchanged for every code point.
michael@0:  */
michael@0: void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s,
michael@0:                               UBool escapeUnprintable) {
michael@0:     int32_t offset = 0;
michael@0:     while (offset < s.length()) {
michael@0:         const UChar32 codePoint = s.char32At(offset);
michael@0:         _appendToPat(buf, codePoint, escapeUnprintable);
michael@0:         // Step over one or two UTF-16 code units.
michael@0:         offset += U16_LENGTH(codePoint);
michael@0:     }
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Appends the pattern form of the code point c to buf.
michael@0:  *
michael@0:  * If escapeUnprintable is set and c is unprintable, a hex escape
michael@0:  * (\uxxxx or \Uxxxxxxxx) is written when ICU_Utility::escapeUnprintable()
michael@0:  * succeeds. Otherwise c is appended as is, preceded by a backslash when it
michael@0:  * is one of the pattern syntax characters listed below (this includes
michael@0:  * ':') or pattern white space.
michael@0:  *
michael@0:  * @param buf               Receives the output.
michael@0:  * @param c                 Code point to append.
michael@0:  * @param escapeUnprintable Whether unprintable characters get hex escapes.
michael@0:  */
michael@0: void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c,
michael@0:                               UBool escapeUnprintable) {
michael@0:     if (escapeUnprintable && ICU_Utility::isUnprintable(c) &&
michael@0:             ICU_Utility::escapeUnprintable(buf, c)) {
michael@0:         // The hex escape has been written; nothing more to do.
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     UBool needsBackslash;
michael@0:     switch (c) {
michael@0:     case SET_OPEN:
michael@0:     case SET_CLOSE:
michael@0:     case HYPHEN:
michael@0:     case COMPLEMENT:
michael@0:     case INTERSECTION:
michael@0:     case BACKSLASH:
michael@0:     case OPEN_BRACE:
michael@0:     case CLOSE_BRACE:
michael@0:     case COLON:
michael@0:     case SymbolTable::SYMBOL_REF:
michael@0:         // Characters with a meaning in set patterns
michael@0:         needsBackslash = TRUE;
michael@0:         break;
michael@0:     default:
michael@0:         // Anything else is escaped only if it is white space
michael@0:         needsBackslash = PatternProps::isWhiteSpace(c);
michael@0:         break;
michael@0:     }
michael@0: 
michael@0:     if (needsBackslash) {
michael@0:         buf.append(BACKSLASH);
michael@0:     }
michael@0:     buf.append(c);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Appends a pattern string for this set to result.
michael@0:  *
michael@0:  * When a cached pattern (pat) exists, it is copied code point by code
michael@0:  * point; with escapeUnprintable set, unprintable characters are replaced
michael@0:  * by hex escapes. Without a cached pattern the work is delegated to
michael@0:  * _generatePattern().
michael@0:  *
michael@0:  * @param result            String to append to; also the return value.
michael@0:  * @param escapeUnprintable Whether unprintable characters get hex escapes.
michael@0:  * @return result
michael@0:  */
michael@0: UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
michael@0:                                       UBool escapeUnprintable) const
michael@0: {
michael@0:     if (pat == NULL) {
michael@0:         return _generatePattern(result, escapeUnprintable);
michael@0:     }
michael@0: 
michael@0:     // Number of consecutive backslashes copied immediately before the
michael@0:     // current position.
michael@0:     int32_t trailingBackslashes = 0;
michael@0:     int32_t pos = 0;
michael@0:     while (pos < patLen) {
michael@0:         UChar32 c;
michael@0:         U16_NEXT(pat, pos, patLen, c);
michael@0: 
michael@0:         if (!escapeUnprintable || !ICU_Utility::isUnprintable(c)) {
michael@0:             result.append(c);
michael@0:             if (c == BACKSLASH) {
michael@0:                 ++trailingBackslashes;
michael@0:             } else {
michael@0:                 trailingBackslashes = 0;
michael@0:             }
michael@0:             continue;
michael@0:         }
michael@0: 
michael@0:         // An odd run of backslashes means the unprintable character was
michael@0:         // itself escaped; drop that last backslash before writing the
michael@0:         // hex escape in its place.
michael@0:         if ((trailingBackslashes % 2) == 1) {
michael@0:             result.truncate(result.length() - 1);
michael@0:         }
michael@0:         ICU_Utility::escapeUnprintable(result, c);
michael@0:         trailingBackslashes = 0;
michael@0:     }
michael@0:     return result;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Writes a pattern string for this set into result, replacing whatever
michael@0:  * result held before. Passing the resulting pattern to a UnicodeSet
michael@0:  * constructor is meant to produce a set equal to this one.
michael@0:  *
michael@0:  * @param result            Receives the pattern; also the return value.
michael@0:  * @param escapeUnprintable Forwarded to _toPattern().
michael@0:  * @return result
michael@0:  */
michael@0: UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
michael@0:                                      UBool escapeUnprintable) const
michael@0: {
michael@0:     // _toPattern() only appends, so start from an empty string.
michael@0:     result.truncate(0);
michael@0:     return this->_toPattern(result, escapeUnprintable);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Generate and append a string representation of this set to result.
michael@0:  * This does not use this.pat, the cleaned up copy of the string
michael@0:  * passed to applyPattern().
michael@0:  *
michael@0:  * The output is '[', the code point ranges (written as the complement,
michael@0:  * after '^', when that form is shorter), each string wrapped in braces,
michael@0:  * and ']'.
michael@0:  *
michael@0:  * @param result            String to append to; also the return value.
michael@0:  * @param escapeUnprintable Forwarded to _appendToPat().
michael@0:  * @return result
michael@0:  */
michael@0: UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
michael@0:                                             UBool escapeUnprintable) const
michael@0: {
michael@0:     result.append(SET_OPEN);
michael@0: 
michael@0: //  // Check against the predefined categories.  We implicitly build
michael@0: //  // up ALL category sets the first time toPattern() is called.
michael@0: //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
michael@0: //      if (*this == getCategorySet(cat)) {
michael@0: //          result.append(COLON);
michael@0: //          result.append(CATEGORY_NAMES, cat*2, 2);
michael@0: //          return result.append(CATEGORY_CLOSE);
michael@0: //      }
michael@0: //  }
michael@0: 
michael@0:     int32_t count = getRangeCount();
michael@0: 
michael@0:     // If the set contains at least 2 intervals and includes both
michael@0:     // MIN_VALUE and MAX_VALUE, then the inverse representation will
michael@0:     // be more economical.
michael@0:     const UBool emitInverse = (UBool)(count > 1 &&
michael@0:         getRangeStart(0) == MIN_VALUE &&
michael@0:         getRangeEnd(count-1) == MAX_VALUE);
michael@0:     if (emitInverse) {
michael@0:         result.append(COMPLEMENT);
michael@0:     }
michael@0: 
michael@0:     // Inverse form: emit the count-1 gaps between neighboring ranges.
michael@0:     // Default form: emit the count ranges themselves.
michael@0:     for (int32_t i = emitInverse ? 1 : 0; i < count; ++i) {
michael@0:         UChar32 start;
michael@0:         UChar32 end;
michael@0:         if (emitInverse) {
michael@0:             start = getRangeEnd(i-1)+1;
michael@0:             end = getRangeStart(i)-1;
michael@0:         } else {
michael@0:             start = getRangeStart(i);
michael@0:             end = getRangeEnd(i);
michael@0:         }
michael@0:         _appendToPat(result, start, escapeUnprintable);
michael@0:         if (start != end) {
michael@0:             // Two adjacent code points are written without a hyphen.
michael@0:             if ((start+1) != end) {
michael@0:                 result.append(HYPHEN);
michael@0:             }
michael@0:             _appendToPat(result, end, escapeUnprintable);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // strings is NULL if allocateStrings() failed; do not dereference it then.
michael@0:     if (strings != NULL) {
michael@0:         for (int32_t i = 0; i<strings->size(); ++i) {
michael@0:             result.append(OPEN_BRACE);
michael@0:             _appendToPat(result,
michael@0:                          *(const UnicodeString*) strings->elementAt(i),
michael@0:                          escapeUnprintable);
michael@0:             result.append(CLOSE_BRACE);
michael@0:         }
michael@0:     }
michael@0:     return result.append(SET_CLOSE);
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Frees the cached pattern string, if there is one, and resets pat and
michael@0:  * patLen. Does nothing when no pattern is cached.
michael@0:  */
michael@0: void UnicodeSet::releasePattern() {
michael@0:     if (pat == NULL) {
michael@0:         return;
michael@0:     }
michael@0:     uprv_free(pat);
michael@0:     patLen = 0;
michael@0:     pat = NULL;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Replaces the cached pattern with a NUL-terminated copy of newPat.
michael@0:  *
michael@0:  * The previous cache is always released first. If memory for the copy
michael@0:  * cannot be obtained, the set is simply left without a cached pattern;
michael@0:  * this is not treated as an error because the cache is only a convenience
michael@0:  * and an equivalent pattern can be generated on request.
michael@0:  *
michael@0:  * @param newPat Pattern text to cache.
michael@0:  */
michael@0: void UnicodeSet::setPattern(const UnicodeString& newPat) {
michael@0:     releasePattern();
michael@0: 
michael@0:     const int32_t newPatLen = newPat.length();
michael@0:     // One extra unit for the terminating NUL.
michael@0:     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
michael@0:     if (pat == NULL) {
michael@0:         return;
michael@0:     }
michael@0: 
michael@0:     patLen = newPatLen;
michael@0:     newPat.extractBetween(0, patLen, pat);
michael@0:     pat[patLen] = 0;
michael@0: }
michael@0: 
michael@0: UnicodeFunctor *UnicodeSet::freeze() {
michael@0:     if(!isFrozen() && !isBogus()) {
michael@0:         // Do most of what compact() does before freezing because
michael@0:         // compact() will not work when the set is frozen.
michael@0:         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
michael@0: 
michael@0:         // Delete buffer first to defragment memory less.
michael@0:         if (buffer != NULL) {
michael@0:             uprv_free(buffer);
michael@0:             buffer = NULL;
michael@0:         }
michael@0:         if (capacity > (len + GROW_EXTRA)) {
michael@0:             // Make the capacity equal to len or 1.
michael@0:             // We don't want to realloc of 0 size.
michael@0:             capacity = len + (len == 0);
michael@0:             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
michael@0:             if (list == NULL) { // Check for memory allocation error.
michael@0:                 setToBogus();
michael@0:                 return this;
michael@0:             }
michael@0:         }
michael@0: 
michael@0:         // Optimize contains() and span() and similar functions.
michael@0:         if (!strings->isEmpty()) {
michael@0:             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
michael@0:             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
michael@0:                 // All strings are irrelevant for span() etc. because
michael@0:                 // all of each string's code points are contained in this set.
michael@0:                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
michael@0:                 // many relevant strings as UTF-16.
michael@0:                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
michael@0:                 delete stringSpan;
michael@0:                 stringSpan = NULL;
michael@0:             }
michael@0:         }
michael@0:         if (stringSpan == NULL) {
michael@0:             // No span-relevant strings: Optimize for code point spans.
michael@0:             bmpSet=new BMPSet(list, len);
michael@0:             if (bmpSet == NULL) { // Check for memory allocation error.
michael@0:                 setToBogus();
michael@0:             }
michael@0:         }
michael@0:     }
michael@0:     return this;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Scans the UTF-16 string s forward from its start and returns the index
michael@0:  * at which the span selected by spanCondition ends.
michael@0:  *
michael@0:  * @param s             UTF-16 text.
michael@0:  * @param length        Number of code units in s; if negative, the length
michael@0:  *                      is taken from u_strlen(s).
michael@0:  * @param spanCondition USET_SPAN_NOT_CONTAINED, or a "contained" condition.
michael@0:  * @return End index of the span; 0 for an empty string.
michael@0:  */
michael@0: int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0:     // Fast path when a BMPSet is available and the length is known.
michael@0:     if (bmpSet != NULL && length > 0) {
michael@0:         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
michael@0:     }
michael@0:     if (length < 0) {
michael@0:         length = u_strlen(s);
michael@0:     }
michael@0:     if (length == 0) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // String-aware spanning: use the prebuilt helper if there is one,
michael@0:     // otherwise build a temporary one limited to the direction needed.
michael@0:     if (stringSpan != NULL) {
michael@0:         return stringSpan->span(s, length, spanCondition);
michael@0:     }
michael@0:     if (!strings->isEmpty()) {
michael@0:         uint32_t which;
michael@0:         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
michael@0:             which = UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED;
michael@0:         } else {
michael@0:             which = UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
michael@0:         }
michael@0:         UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0:         if (strSpan.needsStringSpanUTF16()) {
michael@0:             return strSpan.span(s, length, spanCondition);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Plain code point loop.
michael@0:     if (spanCondition != USET_SPAN_NOT_CONTAINED) {
michael@0:         spanCondition = USET_SPAN_CONTAINED;  // Pin to 0/1 values.
michael@0:     }
michael@0: 
michael@0:     int32_t pos = 0;        // read position
michael@0:     int32_t spanLimit = 0;  // end of the last code point accepted
michael@0:     for (;;) {
michael@0:         UChar32 c;
michael@0:         U16_NEXT(s, pos, length, c);
michael@0:         if (spanCondition != contains(c)) {
michael@0:             break;
michael@0:         }
michael@0:         spanLimit = pos;
michael@0:         if (pos >= length) {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     return spanLimit;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Scans the UTF-16 string s backward from its end and returns the index
michael@0:  * at which the span selected by spanCondition starts.
michael@0:  *
michael@0:  * @param s             UTF-16 text.
michael@0:  * @param length        Number of code units in s; if negative, the length
michael@0:  *                      is taken from u_strlen(s).
michael@0:  * @param spanCondition USET_SPAN_NOT_CONTAINED, or a "contained" condition.
michael@0:  * @return Start index of the span; 0 for an empty string.
michael@0:  */
michael@0: int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0:     // Fast path when a BMPSet is available and the length is known.
michael@0:     if (bmpSet != NULL && length > 0) {
michael@0:         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
michael@0:     }
michael@0:     if (length < 0) {
michael@0:         length = u_strlen(s);
michael@0:     }
michael@0:     if (length == 0) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // String-aware spanning: use the prebuilt helper if there is one,
michael@0:     // otherwise build a temporary one limited to the direction needed.
michael@0:     if (stringSpan != NULL) {
michael@0:         return stringSpan->spanBack(s, length, spanCondition);
michael@0:     }
michael@0:     if (!strings->isEmpty()) {
michael@0:         uint32_t which;
michael@0:         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
michael@0:             which = UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED;
michael@0:         } else {
michael@0:             which = UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
michael@0:         }
michael@0:         UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0:         if (strSpan.needsStringSpanUTF16()) {
michael@0:             return strSpan.spanBack(s, length, spanCondition);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Plain code point loop.
michael@0:     if (spanCondition != USET_SPAN_NOT_CONTAINED) {
michael@0:         spanCondition = USET_SPAN_CONTAINED;  // Pin to 0/1 values.
michael@0:     }
michael@0: 
michael@0:     int32_t pos = length;        // read position, moving toward 0
michael@0:     int32_t spanStart = length;  // start of the last code point accepted
michael@0:     for (;;) {
michael@0:         UChar32 c;
michael@0:         U16_PREV(s, 0, pos, c);
michael@0:         if (spanCondition != contains(c)) {
michael@0:             break;
michael@0:         }
michael@0:         spanStart = pos;
michael@0:         if (pos <= 0) {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     return spanStart;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Scans the UTF-8 string s forward from its start and returns the byte
michael@0:  * index at which the span selected by spanCondition ends.
michael@0:  *
michael@0:  * @param s             UTF-8 text.
michael@0:  * @param length        Number of bytes in s; if negative, the length is
michael@0:  *                      taken from uprv_strlen(s).
michael@0:  * @param spanCondition USET_SPAN_NOT_CONTAINED, or a "contained" condition.
michael@0:  * @return End index of the span; 0 for an empty string.
michael@0:  */
michael@0: int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0:     // Fast path when a BMPSet is available and the length is known.
michael@0:     if (bmpSet != NULL && length > 0) {
michael@0:         const uint8_t *bytes = (const uint8_t *)s;
michael@0:         return (int32_t)(bmpSet->spanUTF8(bytes, length, spanCondition)-bytes);
michael@0:     }
michael@0:     if (length < 0) {
michael@0:         length = (int32_t)uprv_strlen(s);
michael@0:     }
michael@0:     if (length == 0) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // String-aware spanning: use the prebuilt helper if there is one,
michael@0:     // otherwise build a temporary one limited to the direction needed.
michael@0:     if (stringSpan != NULL) {
michael@0:         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
michael@0:     }
michael@0:     if (!strings->isEmpty()) {
michael@0:         uint32_t which;
michael@0:         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
michael@0:             which = UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED;
michael@0:         } else {
michael@0:             which = UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
michael@0:         }
michael@0:         UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0:         if (strSpan.needsStringSpanUTF8()) {
michael@0:             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Plain code point loop.
michael@0:     if (spanCondition != USET_SPAN_NOT_CONTAINED) {
michael@0:         spanCondition = USET_SPAN_CONTAINED;  // Pin to 0/1 values.
michael@0:     }
michael@0: 
michael@0:     int32_t pos = 0;        // read position
michael@0:     int32_t spanLimit = 0;  // end of the last code point accepted
michael@0:     for (;;) {
michael@0:         UChar32 c;
michael@0:         U8_NEXT_OR_FFFD(s, pos, length, c);
michael@0:         if (spanCondition != contains(c)) {
michael@0:             break;
michael@0:         }
michael@0:         spanLimit = pos;
michael@0:         if (pos >= length) {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     return spanLimit;
michael@0: }
michael@0: 
michael@0: /**
michael@0:  * Scans the UTF-8 string s backward from its end and returns the byte
michael@0:  * index at which the span selected by spanCondition starts.
michael@0:  *
michael@0:  * @param s             UTF-8 text.
michael@0:  * @param length        Number of bytes in s; if negative, the length is
michael@0:  *                      taken from uprv_strlen(s).
michael@0:  * @param spanCondition USET_SPAN_NOT_CONTAINED, or a "contained" condition.
michael@0:  * @return Start index of the span; 0 for an empty string.
michael@0:  */
michael@0: int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0:     // Fast path when a BMPSet is available and the length is known.
michael@0:     if (bmpSet != NULL && length > 0) {
michael@0:         const uint8_t *bytes = (const uint8_t *)s;
michael@0:         return bmpSet->spanBackUTF8(bytes, length, spanCondition);
michael@0:     }
michael@0:     if (length < 0) {
michael@0:         length = (int32_t)uprv_strlen(s);
michael@0:     }
michael@0:     if (length == 0) {
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     // String-aware spanning: use the prebuilt helper if there is one,
michael@0:     // otherwise build a temporary one limited to the direction needed.
michael@0:     if (stringSpan != NULL) {
michael@0:         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
michael@0:     }
michael@0:     if (!strings->isEmpty()) {
michael@0:         uint32_t which;
michael@0:         if (spanCondition == USET_SPAN_NOT_CONTAINED) {
michael@0:             which = UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED;
michael@0:         } else {
michael@0:             which = UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
michael@0:         }
michael@0:         UnicodeSetStringSpan strSpan(*this, *strings, which);
michael@0:         if (strSpan.needsStringSpanUTF8()) {
michael@0:             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
michael@0:         }
michael@0:     }
michael@0: 
michael@0:     // Plain code point loop.
michael@0:     if (spanCondition != USET_SPAN_NOT_CONTAINED) {
michael@0:         spanCondition = USET_SPAN_CONTAINED;  // Pin to 0/1 values.
michael@0:     }
michael@0: 
michael@0:     int32_t pos = length;        // read position, moving toward 0
michael@0:     int32_t spanStart = length;  // start of the last code point accepted
michael@0:     for (;;) {
michael@0:         UChar32 c;
michael@0:         U8_PREV_OR_FFFD(s, 0, pos, c);
michael@0:         if (spanCondition != contains(c)) {
michael@0:             break;
michael@0:         }
michael@0:         spanStart = pos;
michael@0:         if (pos <= 0) {
michael@0:             break;
michael@0:         }
michael@0:     }
michael@0:     return spanStart;
michael@0: }
michael@0: 
michael@0: U_NAMESPACE_END