michael@0: /* michael@0: ******************************************************************************* michael@0: * michael@0: * Copyright (C) 2002-2011, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ******************************************************************************* michael@0: * file name: uset.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2002mar07 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * There are functions to efficiently serialize a USet into an array of uint16_t michael@0: * and functions to use such a serialized form efficiently without michael@0: * instantiating a new USet. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uobject.h" michael@0: #include "unicode/uset.h" michael@0: #include "unicode/uniset.h" michael@0: #include "cmemory.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/parsepos.h" michael@0: michael@0: U_NAMESPACE_USE michael@0: michael@0: U_CAPI USet* U_EXPORT2 michael@0: uset_openEmpty() { michael@0: return (USet*) new UnicodeSet(); michael@0: } michael@0: michael@0: U_CAPI USet* U_EXPORT2 michael@0: uset_open(UChar32 start, UChar32 end) { michael@0: return (USet*) new UnicodeSet(start, end); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_close(USet* set) { michael@0: delete (UnicodeSet*) set; michael@0: } michael@0: michael@0: U_CAPI USet * U_EXPORT2 michael@0: uset_clone(const USet *set) { michael@0: return (USet*) (((UnicodeSet*) set)->UnicodeSet::clone()); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_isFrozen(const USet *set) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::isFrozen(); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_freeze(USet *set) { michael@0: ((UnicodeSet*) set)->UnicodeSet::freeze(); michael@0: } michael@0: michael@0: U_CAPI USet * U_EXPORT2 michael@0: uset_cloneAsThawed(const USet *set) { michael@0: return (USet*) (((UnicodeSet*) set)->UnicodeSet::cloneAsThawed()); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_set(USet* set, michael@0: UChar32 start, UChar32 end) { michael@0: ((UnicodeSet*) set)->UnicodeSet::set(start, end); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_addAll(USet* set, const USet *additionalSet) { michael@0: ((UnicodeSet*) set)->UnicodeSet::addAll(*((const UnicodeSet*)additionalSet)); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_add(USet* set, UChar32 c) { michael@0: ((UnicodeSet*) set)->UnicodeSet::add(c); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_addRange(USet* set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet*) set)->UnicodeSet::add(start, end); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_addString(USet* set, const UChar* str, int32_t strLen) { michael@0: // UnicodeString handles -1 for strLen michael@0: UnicodeString s(strLen<0, str, strLen); michael@0: ((UnicodeSet*) set)->UnicodeSet::add(s); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen) { michael@0: // UnicodeString handles -1 for strLen michael@0: UnicodeString s(str, strLen); michael@0: ((UnicodeSet*) set)->UnicodeSet::addAll(s); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_remove(USet* set, UChar32 c) { michael@0: ((UnicodeSet*) set)->UnicodeSet::remove(c); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_removeRange(USet* set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet*) set)->UnicodeSet::remove(start, end); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_removeString(USet* set, const UChar* str, int32_t strLen) { michael@0: UnicodeString s(strLen==-1, str, strLen); michael@0: ((UnicodeSet*) set)->UnicodeSet::remove(s); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_removeAll(USet* set, const USet* remove) { michael@0: ((UnicodeSet*) set)->UnicodeSet::removeAll(*(const UnicodeSet*)remove); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_retain(USet* set, UChar32 start, UChar32 end) { michael@0: ((UnicodeSet*) set)->UnicodeSet::retain(start, end); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_retainAll(USet* set, const USet* retain) { michael@0: ((UnicodeSet*) set)->UnicodeSet::retainAll(*(const UnicodeSet*)retain); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_compact(USet* set) { michael@0: ((UnicodeSet*) set)->UnicodeSet::compact(); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_complement(USet* set) { michael@0: ((UnicodeSet*) set)->UnicodeSet::complement(); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_complementAll(USet* set, const USet* complement) { michael@0: ((UnicodeSet*) set)->UnicodeSet::complementAll(*(const UnicodeSet*)complement); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_clear(USet* set) { michael@0: ((UnicodeSet*) set)->UnicodeSet::clear(); michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_removeAllStrings(USet* set) { michael@0: ((UnicodeSet*) set)->UnicodeSet::removeAllStrings(); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_isEmpty(const USet* set) { michael@0: return ((const UnicodeSet*) set)->UnicodeSet::isEmpty(); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_contains(const USet* set, UChar32 c) { michael@0: return ((const UnicodeSet*) set)->UnicodeSet::contains(c); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsRange(const USet* set, UChar32 start, UChar32 end) { michael@0: return ((const UnicodeSet*) set)->UnicodeSet::contains(start, end); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsString(const USet* set, const UChar* str, int32_t strLen) { michael@0: UnicodeString s(strLen==-1, str, strLen); michael@0: return ((const UnicodeSet*) set)->UnicodeSet::contains(s); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsAll(const USet* set1, const USet* set2) { michael@0: return ((const UnicodeSet*) set1)->UnicodeSet::containsAll(* (const UnicodeSet*) set2); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen) { michael@0: // Create a string alias, since nothing is being added to the set. michael@0: UnicodeString s(strLen==-1, str, strLen); michael@0: return ((const UnicodeSet*) set)->UnicodeSet::containsAll(s); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsNone(const USet* set1, const USet* set2) { michael@0: return ((const UnicodeSet*) set1)->UnicodeSet::containsNone(* (const UnicodeSet*) set2); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_containsSome(const USet* set1, const USet* set2) { michael@0: return ((const UnicodeSet*) set1)->UnicodeSet::containsSome(* (const UnicodeSet*) set2); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::span(s, length, spanCondition); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::spanBack(s, length, spanCondition); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::spanUTF8(s, length, spanCondition); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::spanBackUTF8(s, length, spanCondition); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_equals(const USet* set1, const USet* set2) { michael@0: return *(const UnicodeSet*)set1 == *(const UnicodeSet*)set2; michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_indexOf(const USet* set, UChar32 c) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::indexOf(c); michael@0: } michael@0: michael@0: U_CAPI UChar32 U_EXPORT2 michael@0: uset_charAt(const USet* set, int32_t index) { michael@0: return ((UnicodeSet*) set)->UnicodeSet::charAt(index); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_size(const USet* set) { michael@0: return ((const UnicodeSet*) set)->UnicodeSet::size(); michael@0: } michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: /** michael@0: * This class only exists to provide access to the UnicodeSet private michael@0: * USet support API. Declaring a class a friend is more portable than michael@0: * trying to declare extern "C" functions as friends. michael@0: */ michael@0: class USetAccess /* not : public UObject because all methods are static */ { michael@0: public: michael@0: /* Try to have the compiler inline these*/ michael@0: inline static int32_t getStringCount(const UnicodeSet& set) { michael@0: return set.getStringCount(); michael@0: } michael@0: inline static const UnicodeString* getString(const UnicodeSet& set, michael@0: int32_t i) { michael@0: return set.getString(i); michael@0: } michael@0: private: michael@0: /* do not instantiate*/ michael@0: USetAccess(); michael@0: }; michael@0: U_NAMESPACE_END michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_getItemCount(const USet* uset) { michael@0: const UnicodeSet& set = *(const UnicodeSet*)uset; michael@0: return set.getRangeCount() + USetAccess::getStringCount(set); michael@0: } michael@0: michael@0: U_CAPI int32_t U_EXPORT2 michael@0: uset_getItem(const USet* uset, int32_t itemIndex, michael@0: UChar32* start, UChar32* end, michael@0: UChar* str, int32_t strCapacity, michael@0: UErrorCode* ec) { michael@0: if (U_FAILURE(*ec)) return 0; michael@0: const UnicodeSet& set = *(const UnicodeSet*)uset; michael@0: int32_t rangeCount; michael@0: michael@0: if (itemIndex < 0) { michael@0: *ec = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return -1; michael@0: } else if (itemIndex < (rangeCount = set.getRangeCount())) { michael@0: *start = set.getRangeStart(itemIndex); michael@0: *end = set.getRangeEnd(itemIndex); michael@0: return 0; michael@0: } else { michael@0: itemIndex -= rangeCount; michael@0: if (itemIndex < USetAccess::getStringCount(set)) { michael@0: const UnicodeString* s = USetAccess::getString(set, itemIndex); michael@0: return s->extract(str, strCapacity, *ec); michael@0: } else { michael@0: *ec = U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return -1; michael@0: } michael@0: } michael@0: } michael@0: michael@0: //U_CAPI int32_t U_EXPORT2 michael@0: //uset_getRangeCount(const USet* set) { michael@0: // return ((const UnicodeSet*) set)->getRangeCount(); michael@0: //} michael@0: // michael@0: //U_CAPI UBool U_EXPORT2 michael@0: //uset_getRange(const USet* set, int32_t rangeIndex, michael@0: // UChar32* pStart, UChar32* pEnd) { michael@0: // if ((uint32_t) rangeIndex >= (uint32_t) uset_getRangeCount(set)) { michael@0: // return FALSE; michael@0: // } michael@0: // const UnicodeSet* us = (const UnicodeSet*) set; michael@0: // *pStart = us->getRangeStart(rangeIndex); michael@0: // *pEnd = us->getRangeEnd(rangeIndex); michael@0: // return TRUE; michael@0: //} michael@0: michael@0: /* michael@0: * Serialize a USet into 16-bit units. michael@0: * Store BMP code points as themselves with one 16-bit unit each. michael@0: * michael@0: * Important: the code points in the array are in ascending order, michael@0: * therefore all BMP code points precede all supplementary code points. michael@0: * michael@0: * Store each supplementary code point in 2 16-bit units, michael@0: * simply with higher-then-lower 16-bit halfs. michael@0: * michael@0: * Precede the entire list with the length. michael@0: * If there are supplementary code points, then set bit 15 in the length michael@0: * and add the bmpLength between it and the array. michael@0: * michael@0: * In other words: michael@0: * - all BMP: (length=bmpLength) BMP, .., BMP michael@0: * - some supplementary: (length|0x8000) (bmpLengthUnicodeSet::serialize(dest, destCapacity,* ec); michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength) { michael@0: int32_t length; michael@0: michael@0: if(fillSet==NULL) { michael@0: return FALSE; michael@0: } michael@0: if(src==NULL || srcLength<=0) { michael@0: fillSet->length=fillSet->bmpLength=0; michael@0: return FALSE; michael@0: } michael@0: michael@0: length=*src++; michael@0: if(length&0x8000) { michael@0: /* there are supplementary values */ michael@0: length&=0x7fff; michael@0: if(srcLength<(2+length)) { michael@0: fillSet->length=fillSet->bmpLength=0; michael@0: return FALSE; michael@0: } michael@0: fillSet->bmpLength=*src++; michael@0: } else { michael@0: /* only BMP values */ michael@0: if(srcLength<(1+length)) { michael@0: fillSet->length=fillSet->bmpLength=0; michael@0: return FALSE; michael@0: } michael@0: fillSet->bmpLength=length; michael@0: } michael@0: fillSet->array=src; michael@0: fillSet->length=length; michael@0: return TRUE; michael@0: } michael@0: michael@0: U_CAPI void U_EXPORT2 michael@0: uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c) { michael@0: if(fillSet==NULL || (uint32_t)c>0x10ffff) { michael@0: return; michael@0: } michael@0: michael@0: fillSet->array=fillSet->staticArray; michael@0: if(c<0xffff) { michael@0: fillSet->bmpLength=fillSet->length=2; michael@0: fillSet->staticArray[0]=(uint16_t)c; michael@0: fillSet->staticArray[1]=(uint16_t)c+1; michael@0: } else if(c==0xffff) { michael@0: fillSet->bmpLength=1; michael@0: fillSet->length=3; michael@0: fillSet->staticArray[0]=0xffff; michael@0: fillSet->staticArray[1]=1; michael@0: fillSet->staticArray[2]=0; michael@0: } else if(c<0x10ffff) { michael@0: fillSet->bmpLength=0; michael@0: fillSet->length=4; michael@0: fillSet->staticArray[0]=(uint16_t)(c>>16); michael@0: fillSet->staticArray[1]=(uint16_t)c; michael@0: ++c; michael@0: fillSet->staticArray[2]=(uint16_t)(c>>16); michael@0: fillSet->staticArray[3]=(uint16_t)c; michael@0: } else /* c==0x10ffff */ { michael@0: fillSet->bmpLength=0; michael@0: fillSet->length=2; michael@0: fillSet->staticArray[0]=0x10; michael@0: fillSet->staticArray[1]=0xffff; michael@0: } michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_serializedContains(const USerializedSet* set, UChar32 c) { michael@0: const uint16_t* array; michael@0: michael@0: if(set==NULL || (uint32_t)c>0x10ffff) { michael@0: return FALSE; michael@0: } michael@0: michael@0: array=set->array; michael@0: if(c<=0xffff) { michael@0: /* find c in the BMP part */ michael@0: int32_t lo = 0; michael@0: int32_t hi = set->bmpLength-1; michael@0: if (c < array[0]) { michael@0: hi = 0; michael@0: } else if (c < array[hi]) { michael@0: for(;;) { michael@0: int32_t i = (lo + hi) >> 1; michael@0: if (i == lo) { michael@0: break; // Done! michael@0: } else if (c < array[i]) { michael@0: hi = i; michael@0: } else { michael@0: lo = i; michael@0: } michael@0: } michael@0: } else { michael@0: hi += 1; michael@0: } michael@0: return (UBool)(hi&1); michael@0: } else { michael@0: /* find c in the supplementary part */ michael@0: uint16_t high=(uint16_t)(c>>16), low=(uint16_t)c; michael@0: int32_t base = set->bmpLength; michael@0: int32_t lo = 0; michael@0: int32_t hi = set->length - 2 - base; michael@0: if (high < array[base] || (high==array[base] && low> 1) & ~1; // Guarantee even result michael@0: int32_t iabs = i + base; michael@0: if (i == lo) { michael@0: break; // Done! michael@0: } else if (high < array[iabs] || (high==array[iabs] && lowbmpLength+(set->length-set->bmpLength)/2+1)/2; michael@0: } michael@0: michael@0: U_CAPI UBool U_EXPORT2 michael@0: uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, michael@0: UChar32* pStart, UChar32* pEnd) { michael@0: const uint16_t* array; michael@0: int32_t bmpLength, length; michael@0: michael@0: if(set==NULL || rangeIndex<0 || pStart==NULL || pEnd==NULL) { michael@0: return FALSE; michael@0: } michael@0: michael@0: array=set->array; michael@0: length=set->length; michael@0: bmpLength=set->bmpLength; michael@0: michael@0: rangeIndex*=2; /* address start/limit pairs */ michael@0: if(rangeIndex0) { michael@0: // if(c>=array[length-1]) { michael@0: // return length; michael@0: // } michael@0: // michael@0: // /* do not check the last range limit again in the loop below */ michael@0: // --length; michael@0: // } michael@0: // michael@0: // for(i=0; i=array[i]; ++i) {} michael@0: // return i; michael@0: // } michael@0: // michael@0: // static UBool michael@0: // addRemove(USet* set, UChar32 c, int32_t doRemove) { michael@0: // int32_t i, length, more; michael@0: // michael@0: // if(set==NULL || (uint32_t)c>0x10ffff) { michael@0: // return FALSE; michael@0: // } michael@0: // michael@0: // length=set->length; michael@0: // i=findChar(set->array, length, c); michael@0: // if((i&1)^doRemove) { michael@0: // /* c is already in the set */ michael@0: // return TRUE; michael@0: // } michael@0: // michael@0: // /* how many more array items do we need? */ michael@0: // if(iarray[i]) { michael@0: // /* c is just before the following range, extend that in-place by one */ michael@0: // set->array[i]=c; michael@0: // if(i>0) { michael@0: // --i; michael@0: // if(c==set->array[i]) { michael@0: // /* the previous range collapsed, remove it */ michael@0: // set->length=length-=2; michael@0: // if(iarray+i, set->array+i+2, (length-i)*4); michael@0: // } michael@0: // } michael@0: // } michael@0: // return TRUE; michael@0: // } else if(i>0 && c==set->array[i-1]) { michael@0: // /* c is just after the previous range, extend that in-place by one */ michael@0: // if(++c<=0x10ffff) { michael@0: // set->array[i-1]=c; michael@0: // if(iarray[i]) { michael@0: // /* the following range collapsed, remove it */ michael@0: // --i; michael@0: // set->length=length-=2; michael@0: // if(iarray+i, set->array+i+2, (length-i)*4); michael@0: // } michael@0: // } michael@0: // } else { michael@0: // /* extend the previous range (had limit 0x10ffff) to the end of Unicode */ michael@0: // set->length=i-1; michael@0: // } michael@0: // return TRUE; michael@0: // } else if(i==length && c==0x10ffff) { michael@0: // /* insert one range limit c */ michael@0: // more=1; michael@0: // } else { michael@0: // /* insert two range limits c, c+1 */ michael@0: // more=2; michael@0: // } michael@0: // michael@0: // /* insert range limits */ michael@0: // if(length+more>set->capacity) { michael@0: // /* reallocate */ michael@0: // int32_t newCapacity=set->capacity+set->capacity/2+USET_GROW_DELTA; michael@0: // UChar32* newArray=(UChar32* )uprv_malloc(newCapacity*4); michael@0: // if(newArray==NULL) { michael@0: // return FALSE; michael@0: // } michael@0: // set->capacity=newCapacity; michael@0: // uprv_memcpy(newArray, set->array, length*4); michael@0: // michael@0: // if(set->array!=set->staticBuffer) { michael@0: // uprv_free(set->array); michael@0: // } michael@0: // set->array=newArray; michael@0: // } michael@0: // michael@0: // if(iarray+i+more, set->array+i, (length-i)*4); michael@0: // } michael@0: // set->array[i]=c; michael@0: // if(more==2) { michael@0: // set->array[i+1]=c+1; michael@0: // } michael@0: // set->length+=more; michael@0: // michael@0: // return TRUE; michael@0: // } michael@0: // michael@0: // U_CAPI UBool U_EXPORT2 michael@0: // uset_add(USet* set, UChar32 c) { michael@0: // return addRemove(set, c, 0); michael@0: // } michael@0: // michael@0: // U_CAPI void U_EXPORT2 michael@0: // uset_remove(USet* set, UChar32 c) { michael@0: // addRemove(set, c, 1); michael@0: // }