1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/bmpset.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,725 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2007-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: bmpset.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2007jan29 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/utf8.h" 1.23 +#include "unicode/utf16.h" 1.24 +#include "cmemory.h" 1.25 +#include "bmpset.h" 1.26 +#include "uassert.h" 1.27 + 1.28 +U_NAMESPACE_BEGIN 1.29 + 1.30 +BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : 1.31 + list(parentList), listLength(parentListLength) { 1.32 + uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); 1.33 + uprv_memset(table7FF, 0, sizeof(table7FF)); 1.34 + uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); 1.35 + 1.36 + /* 1.37 + * Set the list indexes for binary searches for 1.38 + * U+0800, U+1000, U+2000, .., U+F000, U+10000. 1.39 + * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are 1.40 + * looked up in the bit tables. 1.41 + * The last pair of indexes is for finding supplementary code points. 1.42 + */ 1.43 + list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); 1.44 + int32_t i; 1.45 + for(i=1; i<=0x10; ++i) { 1.46 + list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); 1.47 + } 1.48 + list4kStarts[0x11]=listLength-1; 1.49 + 1.50 + initBits(); 1.51 + overrideIllegal(); 1.52 +} 1.53 + 1.54 +BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : 1.55 + list(newParentList), listLength(newParentListLength) { 1.56 + uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); 1.57 + uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); 1.58 + uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); 1.59 + uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); 1.60 +} 1.61 + 1.62 +BMPSet::~BMPSet() { 1.63 +} 1.64 + 1.65 +/* 1.66 + * Set bits in a bit rectangle in "vertical" bit organization. 1.67 + * start<limit<=0x800 1.68 + */ 1.69 +static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { 1.70 + U_ASSERT(start<limit); 1.71 + U_ASSERT(limit<=0x800); 1.72 + 1.73 + int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 1.74 + int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 1.75 + 1.76 + // Set one bit indicating an all-one block. 1.77 + uint32_t bits=(uint32_t)1<<lead; 1.78 + if((start+1)==limit) { // Single-character shortcut. 1.79 + table[trail]|=bits; 1.80 + return; 1.81 + } 1.82 + 1.83 + int32_t limitLead=limit>>6; 1.84 + int32_t limitTrail=limit&0x3f; 1.85 + 1.86 + if(lead==limitLead) { 1.87 + // Partial vertical bit column. 1.88 + while(trail<limitTrail) { 1.89 + table[trail++]|=bits; 1.90 + } 1.91 + } else { 1.92 + // Partial vertical bit column, 1.93 + // followed by a bit rectangle, 1.94 + // followed by another partial vertical bit column. 1.95 + if(trail>0) { 1.96 + do { 1.97 + table[trail++]|=bits; 1.98 + } while(trail<64); 1.99 + ++lead; 1.100 + } 1.101 + if(lead<limitLead) { 1.102 + bits=~((1<<lead)-1); 1.103 + if(limitLead<0x20) { 1.104 + bits&=(1<<limitLead)-1; 1.105 + } 1.106 + for(trail=0; trail<64; ++trail) { 1.107 + table[trail]|=bits; 1.108 + } 1.109 + } 1.110 + // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 1.111 + // In that case, bits=1<<limitLead is undefined but the bits value 1.112 + // is not used because trail<limitTrail is already false. 1.113 + bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); 1.114 + for(trail=0; trail<limitTrail; ++trail) { 1.115 + table[trail]|=bits; 1.116 + } 1.117 + } 1.118 +} 1.119 + 1.120 +void BMPSet::initBits() { 1.121 + UChar32 start, limit; 1.122 + int32_t listIndex=0; 1.123 + 1.124 + // Set asciiBytes[]. 1.125 + do { 1.126 + start=list[listIndex++]; 1.127 + if(listIndex<listLength) { 1.128 + limit=list[listIndex++]; 1.129 + } else { 1.130 + limit=0x110000; 1.131 + } 1.132 + if(start>=0x80) { 1.133 + break; 1.134 + } 1.135 + do { 1.136 + asciiBytes[start++]=1; 1.137 + } while(start<limit && start<0x80); 1.138 + } while(limit<=0x80); 1.139 + 1.140 + // Set table7FF[]. 1.141 + while(start<0x800) { 1.142 + set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); 1.143 + if(limit>0x800) { 1.144 + start=0x800; 1.145 + break; 1.146 + } 1.147 + 1.148 + start=list[listIndex++]; 1.149 + if(listIndex<listLength) { 1.150 + limit=list[listIndex++]; 1.151 + } else { 1.152 + limit=0x110000; 1.153 + } 1.154 + } 1.155 + 1.156 + // Set bmpBlockBits[]. 1.157 + int32_t minStart=0x800; 1.158 + while(start<0x10000) { 1.159 + if(limit>0x10000) { 1.160 + limit=0x10000; 1.161 + } 1.162 + 1.163 + if(start<minStart) { 1.164 + start=minStart; 1.165 + } 1.166 + if(start<limit) { // Else: Another range entirely in a known mixed-value block. 1.167 + if(start&0x3f) { 1.168 + // Mixed-value block of 64 code points. 1.169 + start>>=6; 1.170 + bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); 1.171 + start=(start+1)<<6; // Round up to the next block boundary. 1.172 + minStart=start; // Ignore further ranges in this block. 1.173 + } 1.174 + if(start<limit) { 1.175 + if(start<(limit&~0x3f)) { 1.176 + // Multiple all-ones blocks of 64 code points each. 1.177 + set32x64Bits(bmpBlockBits, start>>6, limit>>6); 1.178 + } 1.179 + 1.180 + if(limit&0x3f) { 1.181 + // Mixed-value block of 64 code points. 1.182 + limit>>=6; 1.183 + bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); 1.184 + limit=(limit+1)<<6; // Round up to the next block boundary. 1.185 + minStart=limit; // Ignore further ranges in this block. 1.186 + } 1.187 + } 1.188 + } 1.189 + 1.190 + if(limit==0x10000) { 1.191 + break; 1.192 + } 1.193 + 1.194 + start=list[listIndex++]; 1.195 + if(listIndex<listLength) { 1.196 + limit=list[listIndex++]; 1.197 + } else { 1.198 + limit=0x110000; 1.199 + } 1.200 + } 1.201 +} 1.202 + 1.203 +/* 1.204 + * Override some bits and bytes to the result of contains(FFFD) 1.205 + * for faster validity checking at runtime. 1.206 + * No need to set 0 values where they were reset to 0 in the constructor 1.207 + * and not modified by initBits(). 1.208 + * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) 1.209 + * Need to set 0 values for surrogates D800..DFFF. 1.210 + */ 1.211 +void BMPSet::overrideIllegal() { 1.212 + uint32_t bits, mask; 1.213 + int32_t i; 1.214 + 1.215 + if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { 1.216 + // contains(FFFD)==TRUE 1.217 + for(i=0x80; i<0xc0; ++i) { 1.218 + asciiBytes[i]=1; 1.219 + } 1.220 + 1.221 + bits=3; // Lead bytes 0xC0 and 0xC1. 1.222 + for(i=0; i<64; ++i) { 1.223 + table7FF[i]|=bits; 1.224 + } 1.225 + 1.226 + bits=1; // Lead byte 0xE0. 1.227 + for(i=0; i<32; ++i) { // First half of 4k block. 1.228 + bmpBlockBits[i]|=bits; 1.229 + } 1.230 + 1.231 + mask=~(0x10001<<0xd); // Lead byte 0xED. 1.232 + bits=1<<0xd; 1.233 + for(i=32; i<64; ++i) { // Second half of 4k block. 1.234 + bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; 1.235 + } 1.236 + } else { 1.237 + // contains(FFFD)==FALSE 1.238 + mask=~(0x10001<<0xd); // Lead byte 0xED. 1.239 + for(i=32; i<64; ++i) { // Second half of 4k block. 1.240 + bmpBlockBits[i]&=mask; 1.241 + } 1.242 + } 1.243 +} 1.244 + 1.245 +int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { 1.246 + /* Examples: 1.247 + findCodePoint(c) 1.248 + set list[] c=0 1 3 4 7 8 1.249 + === ============== =========== 1.250 + [] [110000] 0 0 0 0 0 0 1.251 + [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 1.252 + [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 1.253 + [:Any:] [0, 110000] 1 1 1 1 1 1 1.254 + */ 1.255 + 1.256 + // Return the smallest i such that c < list[i]. Assume 1.257 + // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 1.258 + if (c < list[lo]) 1.259 + return lo; 1.260 + // High runner test. c is often after the last range, so an 1.261 + // initial check for this condition pays off. 1.262 + if (lo >= hi || c >= list[hi-1]) 1.263 + return hi; 1.264 + // invariant: c >= list[lo] 1.265 + // invariant: c < list[hi] 1.266 + for (;;) { 1.267 + int32_t i = (lo + hi) >> 1; 1.268 + if (i == lo) { 1.269 + break; // Found! 1.270 + } else if (c < list[i]) { 1.271 + hi = i; 1.272 + } else { 1.273 + lo = i; 1.274 + } 1.275 + } 1.276 + return hi; 1.277 +} 1.278 + 1.279 +UBool 1.280 +BMPSet::contains(UChar32 c) const { 1.281 + if((uint32_t)c<=0x7f) { 1.282 + return (UBool)asciiBytes[c]; 1.283 + } else if((uint32_t)c<=0x7ff) { 1.284 + return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); 1.285 + } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { 1.286 + int lead=c>>12; 1.287 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.288 + if(twoBits<=1) { 1.289 + // All 64 code points with the same bits 15..6 1.290 + // are either in the set or not. 1.291 + return (UBool)twoBits; 1.292 + } else { 1.293 + // Look up the code point in its 4k block of code points. 1.294 + return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); 1.295 + } 1.296 + } else if((uint32_t)c<=0x10ffff) { 1.297 + // surrogate or supplementary code point 1.298 + return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 1.299 + } else { 1.300 + // Out-of-range code points get FALSE, consistent with long-standing 1.301 + // behavior of UnicodeSet::contains(c). 1.302 + return FALSE; 1.303 + } 1.304 +} 1.305 + 1.306 +/* 1.307 + * Check for sufficient length for trail unit for each surrogate pair. 1.308 + * Handle single surrogates as surrogate code points as usual in ICU. 1.309 + */ 1.310 +const UChar * 1.311 +BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 1.312 + UChar c, c2; 1.313 + 1.314 + if(spanCondition) { 1.315 + // span 1.316 + do { 1.317 + c=*s; 1.318 + if(c<=0x7f) { 1.319 + if(!asciiBytes[c]) { 1.320 + break; 1.321 + } 1.322 + } else if(c<=0x7ff) { 1.323 + if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 1.324 + break; 1.325 + } 1.326 + } else if(c<0xd800 || c>=0xe000) { 1.327 + int lead=c>>12; 1.328 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.329 + if(twoBits<=1) { 1.330 + // All 64 code points with the same bits 15..6 1.331 + // are either in the set or not. 1.332 + if(twoBits==0) { 1.333 + break; 1.334 + } 1.335 + } else { 1.336 + // Look up the code point in its 4k block of code points. 1.337 + if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 1.338 + break; 1.339 + } 1.340 + } 1.341 + } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 1.342 + // surrogate code point 1.343 + if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 1.344 + break; 1.345 + } 1.346 + } else { 1.347 + // surrogate pair 1.348 + if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 1.349 + break; 1.350 + } 1.351 + ++s; 1.352 + } 1.353 + } while(++s<limit); 1.354 + } else { 1.355 + // span not 1.356 + do { 1.357 + c=*s; 1.358 + if(c<=0x7f) { 1.359 + if(asciiBytes[c]) { 1.360 + break; 1.361 + } 1.362 + } else if(c<=0x7ff) { 1.363 + if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 1.364 + break; 1.365 + } 1.366 + } else if(c<0xd800 || c>=0xe000) { 1.367 + int lead=c>>12; 1.368 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.369 + if(twoBits<=1) { 1.370 + // All 64 code points with the same bits 15..6 1.371 + // are either in the set or not. 1.372 + if(twoBits!=0) { 1.373 + break; 1.374 + } 1.375 + } else { 1.376 + // Look up the code point in its 4k block of code points. 1.377 + if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 1.378 + break; 1.379 + } 1.380 + } 1.381 + } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { 1.382 + // surrogate code point 1.383 + if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 1.384 + break; 1.385 + } 1.386 + } else { 1.387 + // surrogate pair 1.388 + if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { 1.389 + break; 1.390 + } 1.391 + ++s; 1.392 + } 1.393 + } while(++s<limit); 1.394 + } 1.395 + return s; 1.396 +} 1.397 + 1.398 +/* Symmetrical with span(). */ 1.399 +const UChar * 1.400 +BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { 1.401 + UChar c, c2; 1.402 + 1.403 + if(spanCondition) { 1.404 + // span 1.405 + for(;;) { 1.406 + c=*(--limit); 1.407 + if(c<=0x7f) { 1.408 + if(!asciiBytes[c]) { 1.409 + break; 1.410 + } 1.411 + } else if(c<=0x7ff) { 1.412 + if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { 1.413 + break; 1.414 + } 1.415 + } else if(c<0xd800 || c>=0xe000) { 1.416 + int lead=c>>12; 1.417 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.418 + if(twoBits<=1) { 1.419 + // All 64 code points with the same bits 15..6 1.420 + // are either in the set or not. 1.421 + if(twoBits==0) { 1.422 + break; 1.423 + } 1.424 + } else { 1.425 + // Look up the code point in its 4k block of code points. 1.426 + if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 1.427 + break; 1.428 + } 1.429 + } 1.430 + } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 1.431 + // surrogate code point 1.432 + if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 1.433 + break; 1.434 + } 1.435 + } else { 1.436 + // surrogate pair 1.437 + if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 1.438 + break; 1.439 + } 1.440 + --limit; 1.441 + } 1.442 + if(s==limit) { 1.443 + return s; 1.444 + } 1.445 + } 1.446 + } else { 1.447 + // span not 1.448 + for(;;) { 1.449 + c=*(--limit); 1.450 + if(c<=0x7f) { 1.451 + if(asciiBytes[c]) { 1.452 + break; 1.453 + } 1.454 + } else if(c<=0x7ff) { 1.455 + if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { 1.456 + break; 1.457 + } 1.458 + } else if(c<0xd800 || c>=0xe000) { 1.459 + int lead=c>>12; 1.460 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.461 + if(twoBits<=1) { 1.462 + // All 64 code points with the same bits 15..6 1.463 + // are either in the set or not. 1.464 + if(twoBits!=0) { 1.465 + break; 1.466 + } 1.467 + } else { 1.468 + // Look up the code point in its 4k block of code points. 1.469 + if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { 1.470 + break; 1.471 + } 1.472 + } 1.473 + } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { 1.474 + // surrogate code point 1.475 + if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { 1.476 + break; 1.477 + } 1.478 + } else { 1.479 + // surrogate pair 1.480 + if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { 1.481 + break; 1.482 + } 1.483 + --limit; 1.484 + } 1.485 + if(s==limit) { 1.486 + return s; 1.487 + } 1.488 + } 1.489 + } 1.490 + return limit+1; 1.491 +} 1.492 + 1.493 +/* 1.494 + * Precheck for sufficient trail bytes at end of string only once per span. 1.495 + * Check validity. 1.496 + */ 1.497 +const uint8_t * 1.498 +BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1.499 + const uint8_t *limit=s+length; 1.500 + uint8_t b=*s; 1.501 + if((int8_t)b>=0) { 1.502 + // Initial all-ASCII span. 1.503 + if(spanCondition) { 1.504 + do { 1.505 + if(!asciiBytes[b] || ++s==limit) { 1.506 + return s; 1.507 + } 1.508 + b=*s; 1.509 + } while((int8_t)b>=0); 1.510 + } else { 1.511 + do { 1.512 + if(asciiBytes[b] || ++s==limit) { 1.513 + return s; 1.514 + } 1.515 + b=*s; 1.516 + } while((int8_t)b>=0); 1.517 + } 1.518 + length=(int32_t)(limit-s); 1.519 + } 1.520 + 1.521 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.522 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.523 + } 1.524 + 1.525 + const uint8_t *limit0=limit; 1.526 + 1.527 + /* 1.528 + * Make sure that the last 1/2/3/4-byte sequence before limit is complete 1.529 + * or runs into a lead byte. 1.530 + * In the span loop compare s with limit only once 1.531 + * per multi-byte character. 1.532 + * 1.533 + * Give a trailing illegal sequence the same value as the result of contains(FFFD), 1.534 + * including it if that is part of the span, otherwise set limit0 to before 1.535 + * the truncated sequence. 1.536 + */ 1.537 + b=*(limit-1); 1.538 + if((int8_t)b<0) { 1.539 + // b>=0x80: lead or trail byte 1.540 + if(b<0xc0) { 1.541 + // single trail byte, check for preceding 3- or 4-byte lead byte 1.542 + if(length>=2 && (b=*(limit-2))>=0xe0) { 1.543 + limit-=2; 1.544 + if(asciiBytes[0x80]!=spanCondition) { 1.545 + limit0=limit; 1.546 + } 1.547 + } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { 1.548 + // 4-byte lead byte with only two trail bytes 1.549 + limit-=3; 1.550 + if(asciiBytes[0x80]!=spanCondition) { 1.551 + limit0=limit; 1.552 + } 1.553 + } 1.554 + } else { 1.555 + // lead byte with no trail bytes 1.556 + --limit; 1.557 + if(asciiBytes[0x80]!=spanCondition) { 1.558 + limit0=limit; 1.559 + } 1.560 + } 1.561 + } 1.562 + 1.563 + uint8_t t1, t2, t3; 1.564 + 1.565 + while(s<limit) { 1.566 + b=*s; 1.567 + if(b<0xc0) { 1.568 + // ASCII; or trail bytes with the result of contains(FFFD). 1.569 + if(spanCondition) { 1.570 + do { 1.571 + if(!asciiBytes[b]) { 1.572 + return s; 1.573 + } else if(++s==limit) { 1.574 + return limit0; 1.575 + } 1.576 + b=*s; 1.577 + } while(b<0xc0); 1.578 + } else { 1.579 + do { 1.580 + if(asciiBytes[b]) { 1.581 + return s; 1.582 + } else if(++s==limit) { 1.583 + return limit0; 1.584 + } 1.585 + b=*s; 1.586 + } while(b<0xc0); 1.587 + } 1.588 + } 1.589 + ++s; // Advance past the lead byte. 1.590 + if(b>=0xe0) { 1.591 + if(b<0xf0) { 1.592 + if( /* handle U+0000..U+FFFF inline */ 1.593 + (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 1.594 + (t2=(uint8_t)(s[1]-0x80)) <= 0x3f 1.595 + ) { 1.596 + b&=0xf; 1.597 + uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; 1.598 + if(twoBits<=1) { 1.599 + // All 64 code points with this lead byte and middle trail byte 1.600 + // are either in the set or not. 1.601 + if(twoBits!=(uint32_t)spanCondition) { 1.602 + return s-1; 1.603 + } 1.604 + } else { 1.605 + // Look up the code point in its 4k block of code points. 1.606 + UChar32 c=(b<<12)|(t1<<6)|t2; 1.607 + if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { 1.608 + return s-1; 1.609 + } 1.610 + } 1.611 + s+=2; 1.612 + continue; 1.613 + } 1.614 + } else if( /* handle U+10000..U+10FFFF inline */ 1.615 + (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && 1.616 + (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && 1.617 + (t3=(uint8_t)(s[2]-0x80)) <= 0x3f 1.618 + ) { 1.619 + // Give an illegal sequence the same value as the result of contains(FFFD). 1.620 + UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; 1.621 + if( ( (0x10000<=c && c<=0x10ffff) ? 1.622 + containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : 1.623 + asciiBytes[0x80] 1.624 + ) != spanCondition 1.625 + ) { 1.626 + return s-1; 1.627 + } 1.628 + s+=3; 1.629 + continue; 1.630 + } 1.631 + } else /* 0xc0<=b<0xe0 */ { 1.632 + if( /* handle U+0000..U+07FF inline */ 1.633 + (t1=(uint8_t)(*s-0x80)) <= 0x3f 1.634 + ) { 1.635 + if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { 1.636 + return s-1; 1.637 + } 1.638 + ++s; 1.639 + continue; 1.640 + } 1.641 + } 1.642 + 1.643 + // Give an illegal sequence the same value as the result of contains(FFFD). 1.644 + // Handle each byte of an illegal sequence separately to simplify the code; 1.645 + // no need to optimize error handling. 1.646 + if(asciiBytes[0x80]!=spanCondition) { 1.647 + return s-1; 1.648 + } 1.649 + } 1.650 + 1.651 + return limit0; 1.652 +} 1.653 + 1.654 +/* 1.655 + * While going backwards through UTF-8 optimize only for ASCII. 1.656 + * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not 1.657 + * possible to tell from the last byte in a multi-byte sequence how many 1.658 + * preceding bytes there should be. Therefore, going backwards through UTF-8 1.659 + * is much harder than going forward. 1.660 + */ 1.661 +int32_t 1.662 +BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1.663 + if(spanCondition!=USET_SPAN_NOT_CONTAINED) { 1.664 + spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. 1.665 + } 1.666 + 1.667 + uint8_t b; 1.668 + 1.669 + do { 1.670 + b=s[--length]; 1.671 + if((int8_t)b>=0) { 1.672 + // ASCII sub-span 1.673 + if(spanCondition) { 1.674 + do { 1.675 + if(!asciiBytes[b]) { 1.676 + return length+1; 1.677 + } else if(length==0) { 1.678 + return 0; 1.679 + } 1.680 + b=s[--length]; 1.681 + } while((int8_t)b>=0); 1.682 + } else { 1.683 + do { 1.684 + if(asciiBytes[b]) { 1.685 + return length+1; 1.686 + } else if(length==0) { 1.687 + return 0; 1.688 + } 1.689 + b=s[--length]; 1.690 + } while((int8_t)b>=0); 1.691 + } 1.692 + } 1.693 + 1.694 + int32_t prev=length; 1.695 + UChar32 c; 1.696 + // trail byte: collect a multi-byte character 1.697 + // (or lead byte in last-trail position) 1.698 + c=utf8_prevCharSafeBody(s, 0, &length, b, -3); 1.699 + // c is a valid code point, not ASCII, not a surrogate 1.700 + if(c<=0x7ff) { 1.701 + if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { 1.702 + return prev+1; 1.703 + } 1.704 + } else if(c<=0xffff) { 1.705 + int lead=c>>12; 1.706 + uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; 1.707 + if(twoBits<=1) { 1.708 + // All 64 code points with the same bits 15..6 1.709 + // are either in the set or not. 1.710 + if(twoBits!=(uint32_t)spanCondition) { 1.711 + return prev+1; 1.712 + } 1.713 + } else { 1.714 + // Look up the code point in its 4k block of code points. 1.715 + if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { 1.716 + return prev+1; 1.717 + } 1.718 + } 1.719 + } else { 1.720 + if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { 1.721 + return prev+1; 1.722 + } 1.723 + } 1.724 + } while(length>0); 1.725 + return 0; 1.726 +} 1.727 + 1.728 +U_NAMESPACE_END