michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2007-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: bmpset.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2007jan29 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/utf8.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "bmpset.h" michael@0: #include "uassert.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : michael@0: list(parentList), listLength(parentListLength) { michael@0: uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); michael@0: uprv_memset(table7FF, 0, sizeof(table7FF)); michael@0: uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); michael@0: michael@0: /* michael@0: * Set the list indexes for binary searches for michael@0: * U+0800, U+1000, U+2000, .., U+F000, U+10000. michael@0: * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are michael@0: * looked up in the bit tables. michael@0: * The last pair of indexes is for finding supplementary code points. michael@0: */ michael@0: list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); michael@0: int32_t i; michael@0: for(i=1; i<=0x10; ++i) { michael@0: list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); michael@0: } michael@0: list4kStarts[0x11]=listLength-1; michael@0: michael@0: initBits(); michael@0: overrideIllegal(); michael@0: } michael@0: michael@0: BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : michael@0: list(newParentList), listLength(newParentListLength) { michael@0: uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); michael@0: uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); michael@0: uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); michael@0: uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); michael@0: } michael@0: michael@0: BMPSet::~BMPSet() { michael@0: } michael@0: michael@0: /* michael@0: * Set bits in a bit rectangle in "vertical" bit organization. michael@0: * start>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. michael@0: int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. michael@0: michael@0: // Set one bit indicating an all-one block. michael@0: uint32_t bits=(uint32_t)1<>6; michael@0: int32_t limitTrail=limit&0x3f; michael@0: michael@0: if(lead==limitLead) { michael@0: // Partial vertical bit column. michael@0: while(trail0) { michael@0: do { michael@0: table[trail++]|=bits; michael@0: } while(trail<64); michael@0: ++lead; michael@0: } michael@0: if(lead=0x80) { michael@0: break; michael@0: } michael@0: do { michael@0: asciiBytes[start++]=1; michael@0: } while(start0x800) { michael@0: start=0x800; michael@0: break; michael@0: } michael@0: michael@0: start=list[listIndex++]; michael@0: if(listIndex0x10000) { michael@0: limit=0x10000; michael@0: } michael@0: michael@0: if(start>=6; michael@0: bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); michael@0: start=(start+1)<<6; // Round up to the next block boundary. michael@0: minStart=start; // Ignore further ranges in this block. michael@0: } michael@0: if(start>6, limit>>6); michael@0: } michael@0: michael@0: if(limit&0x3f) { michael@0: // Mixed-value block of 64 code points. michael@0: limit>>=6; michael@0: bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); michael@0: limit=(limit+1)<<6; // Round up to the next block boundary. michael@0: minStart=limit; // Ignore further ranges in this block. michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(limit==0x10000) { michael@0: break; michael@0: } michael@0: michael@0: start=list[listIndex++]; michael@0: if(listIndex= hi || c >= list[hi-1]) michael@0: return hi; michael@0: // invariant: c >= list[lo] michael@0: // invariant: c < list[hi] michael@0: for (;;) { michael@0: int32_t i = (lo + hi) >> 1; michael@0: if (i == lo) { michael@0: break; // Found! michael@0: } else if (c < list[i]) { michael@0: hi = i; michael@0: } else { michael@0: lo = i; michael@0: } michael@0: } michael@0: return hi; michael@0: } michael@0: michael@0: UBool michael@0: BMPSet::contains(UChar32 c) const { michael@0: if((uint32_t)c<=0x7f) { michael@0: return (UBool)asciiBytes[c]; michael@0: } else if((uint32_t)c<=0x7ff) { michael@0: return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); michael@0: } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: return (UBool)twoBits; michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); michael@0: } michael@0: } else if((uint32_t)c<=0x10ffff) { michael@0: // surrogate or supplementary code point michael@0: return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); michael@0: } else { michael@0: // Out-of-range code points get FALSE, consistent with long-standing michael@0: // behavior of UnicodeSet::contains(c). michael@0: return FALSE; michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Check for sufficient length for trail unit for each surrogate pair. michael@0: * Handle single surrogates as surrogate code points as usual in ICU. michael@0: */ michael@0: const UChar * michael@0: BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { michael@0: UChar c, c2; michael@0: michael@0: if(spanCondition) { michael@0: // span michael@0: do { michael@0: c=*s; michael@0: if(c<=0x7f) { michael@0: if(!asciiBytes[c]) { michael@0: break; michael@0: } michael@0: } else if(c<=0x7ff) { michael@0: if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { michael@0: break; michael@0: } michael@0: } else if(c<0xd800 || c>=0xe000) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: if(twoBits==0) { michael@0: break; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { michael@0: break; michael@0: } michael@0: } michael@0: } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { michael@0: // surrogate code point michael@0: if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { michael@0: break; michael@0: } michael@0: } else { michael@0: // surrogate pair michael@0: if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { michael@0: break; michael@0: } michael@0: ++s; michael@0: } michael@0: } while(++s>6)))!=0) { michael@0: break; michael@0: } michael@0: } else if(c<0xd800 || c>=0xe000) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: if(twoBits!=0) { michael@0: break; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { michael@0: break; michael@0: } michael@0: } michael@0: } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { michael@0: // surrogate code point michael@0: if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { michael@0: break; michael@0: } michael@0: } else { michael@0: // surrogate pair michael@0: if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { michael@0: break; michael@0: } michael@0: ++s; michael@0: } michael@0: } while(++s>6)))==0) { michael@0: break; michael@0: } michael@0: } else if(c<0xd800 || c>=0xe000) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: if(twoBits==0) { michael@0: break; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { michael@0: break; michael@0: } michael@0: } michael@0: } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { michael@0: // surrogate code point michael@0: if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { michael@0: break; michael@0: } michael@0: } else { michael@0: // surrogate pair michael@0: if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { michael@0: break; michael@0: } michael@0: --limit; michael@0: } michael@0: if(s==limit) { michael@0: return s; michael@0: } michael@0: } michael@0: } else { michael@0: // span not michael@0: for(;;) { michael@0: c=*(--limit); michael@0: if(c<=0x7f) { michael@0: if(asciiBytes[c]) { michael@0: break; michael@0: } michael@0: } else if(c<=0x7ff) { michael@0: if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { michael@0: break; michael@0: } michael@0: } else if(c<0xd800 || c>=0xe000) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: if(twoBits!=0) { michael@0: break; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { michael@0: break; michael@0: } michael@0: } michael@0: } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { michael@0: // surrogate code point michael@0: if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { michael@0: break; michael@0: } michael@0: } else { michael@0: // surrogate pair michael@0: if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { michael@0: break; michael@0: } michael@0: --limit; michael@0: } michael@0: if(s==limit) { michael@0: return s; michael@0: } michael@0: } michael@0: } michael@0: return limit+1; michael@0: } michael@0: michael@0: /* michael@0: * Precheck for sufficient trail bytes at end of string only once per span. michael@0: * Check validity. michael@0: */ michael@0: const uint8_t * michael@0: BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: const uint8_t *limit=s+length; michael@0: uint8_t b=*s; michael@0: if((int8_t)b>=0) { michael@0: // Initial all-ASCII span. michael@0: if(spanCondition) { michael@0: do { michael@0: if(!asciiBytes[b] || ++s==limit) { michael@0: return s; michael@0: } michael@0: b=*s; michael@0: } while((int8_t)b>=0); michael@0: } else { michael@0: do { michael@0: if(asciiBytes[b] || ++s==limit) { michael@0: return s; michael@0: } michael@0: b=*s; michael@0: } while((int8_t)b>=0); michael@0: } michael@0: length=(int32_t)(limit-s); michael@0: } michael@0: michael@0: if(spanCondition!=USET_SPAN_NOT_CONTAINED) { michael@0: spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. michael@0: } michael@0: michael@0: const uint8_t *limit0=limit; michael@0: michael@0: /* michael@0: * Make sure that the last 1/2/3/4-byte sequence before limit is complete michael@0: * or runs into a lead byte. michael@0: * In the span loop compare s with limit only once michael@0: * per multi-byte character. michael@0: * michael@0: * Give a trailing illegal sequence the same value as the result of contains(FFFD), michael@0: * including it if that is part of the span, otherwise set limit0 to before michael@0: * the truncated sequence. michael@0: */ michael@0: b=*(limit-1); michael@0: if((int8_t)b<0) { michael@0: // b>=0x80: lead or trail byte michael@0: if(b<0xc0) { michael@0: // single trail byte, check for preceding 3- or 4-byte lead byte michael@0: if(length>=2 && (b=*(limit-2))>=0xe0) { michael@0: limit-=2; michael@0: if(asciiBytes[0x80]!=spanCondition) { michael@0: limit0=limit; michael@0: } michael@0: } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { michael@0: // 4-byte lead byte with only two trail bytes michael@0: limit-=3; michael@0: if(asciiBytes[0x80]!=spanCondition) { michael@0: limit0=limit; michael@0: } michael@0: } michael@0: } else { michael@0: // lead byte with no trail bytes michael@0: --limit; michael@0: if(asciiBytes[0x80]!=spanCondition) { michael@0: limit0=limit; michael@0: } michael@0: } michael@0: } michael@0: michael@0: uint8_t t1, t2, t3; michael@0: michael@0: while(s=0xe0) { michael@0: if(b<0xf0) { michael@0: if( /* handle U+0000..U+FFFF inline */ michael@0: (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && michael@0: (t2=(uint8_t)(s[1]-0x80)) <= 0x3f michael@0: ) { michael@0: b&=0xf; michael@0: uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with this lead byte and middle trail byte michael@0: // are either in the set or not. michael@0: if(twoBits!=(uint32_t)spanCondition) { michael@0: return s-1; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: UChar32 c=(b<<12)|(t1<<6)|t2; michael@0: if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { michael@0: return s-1; michael@0: } michael@0: } michael@0: s+=2; michael@0: continue; michael@0: } michael@0: } else if( /* handle U+10000..U+10FFFF inline */ michael@0: (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && michael@0: (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && michael@0: (t3=(uint8_t)(s[2]-0x80)) <= 0x3f michael@0: ) { michael@0: // Give an illegal sequence the same value as the result of contains(FFFD). michael@0: UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; michael@0: if( ( (0x10000<=c && c<=0x10ffff) ? michael@0: containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : michael@0: asciiBytes[0x80] michael@0: ) != spanCondition michael@0: ) { michael@0: return s-1; michael@0: } michael@0: s+=3; michael@0: continue; michael@0: } michael@0: } else /* 0xc0<=b<0xe0 */ { michael@0: if( /* handle U+0000..U+07FF inline */ michael@0: (t1=(uint8_t)(*s-0x80)) <= 0x3f michael@0: ) { michael@0: if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { michael@0: return s-1; michael@0: } michael@0: ++s; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: // Give an illegal sequence the same value as the result of contains(FFFD). michael@0: // Handle each byte of an illegal sequence separately to simplify the code; michael@0: // no need to optimize error handling. michael@0: if(asciiBytes[0x80]!=spanCondition) { michael@0: return s-1; michael@0: } michael@0: } michael@0: michael@0: return limit0; michael@0: } michael@0: michael@0: /* michael@0: * While going backwards through UTF-8 optimize only for ASCII. michael@0: * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not michael@0: * possible to tell from the last byte in a multi-byte sequence how many michael@0: * preceding bytes there should be. Therefore, going backwards through UTF-8 michael@0: * is much harder than going forward. michael@0: */ michael@0: int32_t michael@0: BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: if(spanCondition!=USET_SPAN_NOT_CONTAINED) { michael@0: spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. michael@0: } michael@0: michael@0: uint8_t b; michael@0: michael@0: do { michael@0: b=s[--length]; michael@0: if((int8_t)b>=0) { michael@0: // ASCII sub-span michael@0: if(spanCondition) { michael@0: do { michael@0: if(!asciiBytes[b]) { michael@0: return length+1; michael@0: } else if(length==0) { michael@0: return 0; michael@0: } michael@0: b=s[--length]; michael@0: } while((int8_t)b>=0); michael@0: } else { michael@0: do { michael@0: if(asciiBytes[b]) { michael@0: return length+1; michael@0: } else if(length==0) { michael@0: return 0; michael@0: } michael@0: b=s[--length]; michael@0: } while((int8_t)b>=0); michael@0: } michael@0: } michael@0: michael@0: int32_t prev=length; michael@0: UChar32 c; michael@0: // trail byte: collect a multi-byte character michael@0: // (or lead byte in last-trail position) michael@0: c=utf8_prevCharSafeBody(s, 0, &length, b, -3); michael@0: // c is a valid code point, not ASCII, not a surrogate michael@0: if(c<=0x7ff) { michael@0: if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { michael@0: return prev+1; michael@0: } michael@0: } else if(c<=0xffff) { michael@0: int lead=c>>12; michael@0: uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; michael@0: if(twoBits<=1) { michael@0: // All 64 code points with the same bits 15..6 michael@0: // are either in the set or not. michael@0: if(twoBits!=(uint32_t)spanCondition) { michael@0: return prev+1; michael@0: } michael@0: } else { michael@0: // Look up the code point in its 4k block of code points. michael@0: if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { michael@0: return prev+1; michael@0: } michael@0: } michael@0: } else { michael@0: if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { michael@0: return prev+1; michael@0: } michael@0: } michael@0: } while(length>0); michael@0: return 0; michael@0: } michael@0: michael@0: U_NAMESPACE_END