intl/icu/source/common/bmpset.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2007-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: bmpset.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2007jan29
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/utf8.h"
michael@0 20 #include "unicode/utf16.h"
michael@0 21 #include "cmemory.h"
michael@0 22 #include "bmpset.h"
michael@0 23 #include "uassert.h"
michael@0 24
michael@0 25 U_NAMESPACE_BEGIN
michael@0 26
michael@0 27 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
michael@0 28 list(parentList), listLength(parentListLength) {
michael@0 29 uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
michael@0 30 uprv_memset(table7FF, 0, sizeof(table7FF));
michael@0 31 uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
michael@0 32
michael@0 33 /*
michael@0 34 * Set the list indexes for binary searches for
michael@0 35 * U+0800, U+1000, U+2000, .., U+F000, U+10000.
michael@0 36 * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
michael@0 37 * looked up in the bit tables.
michael@0 38 * The last pair of indexes is for finding supplementary code points.
michael@0 39 */
michael@0 40 list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
michael@0 41 int32_t i;
michael@0 42 for(i=1; i<=0x10; ++i) {
michael@0 43 list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
michael@0 44 }
michael@0 45 list4kStarts[0x11]=listLength-1;
michael@0 46
michael@0 47 initBits();
michael@0 48 overrideIllegal();
michael@0 49 }
michael@0 50
michael@0 51 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
michael@0 52 list(newParentList), listLength(newParentListLength) {
michael@0 53 uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
michael@0 54 uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
michael@0 55 uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
michael@0 56 uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
michael@0 57 }
michael@0 58
michael@0 59 BMPSet::~BMPSet() {
michael@0 60 }
michael@0 61
michael@0 62 /*
michael@0 63 * Set bits in a bit rectangle in "vertical" bit organization.
michael@0 64 * start<limit<=0x800
michael@0 65 */
michael@0 66 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
michael@0 67 U_ASSERT(start<limit);
michael@0 68 U_ASSERT(limit<=0x800);
michael@0 69
michael@0 70 int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits.
michael@0 71 int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits.
michael@0 72
michael@0 73 // Set one bit indicating an all-one block.
michael@0 74 uint32_t bits=(uint32_t)1<<lead;
michael@0 75 if((start+1)==limit) { // Single-character shortcut.
michael@0 76 table[trail]|=bits;
michael@0 77 return;
michael@0 78 }
michael@0 79
michael@0 80 int32_t limitLead=limit>>6;
michael@0 81 int32_t limitTrail=limit&0x3f;
michael@0 82
michael@0 83 if(lead==limitLead) {
michael@0 84 // Partial vertical bit column.
michael@0 85 while(trail<limitTrail) {
michael@0 86 table[trail++]|=bits;
michael@0 87 }
michael@0 88 } else {
michael@0 89 // Partial vertical bit column,
michael@0 90 // followed by a bit rectangle,
michael@0 91 // followed by another partial vertical bit column.
michael@0 92 if(trail>0) {
michael@0 93 do {
michael@0 94 table[trail++]|=bits;
michael@0 95 } while(trail<64);
michael@0 96 ++lead;
michael@0 97 }
michael@0 98 if(lead<limitLead) {
michael@0 99 bits=~((1<<lead)-1);
michael@0 100 if(limitLead<0x20) {
michael@0 101 bits&=(1<<limitLead)-1;
michael@0 102 }
michael@0 103 for(trail=0; trail<64; ++trail) {
michael@0 104 table[trail]|=bits;
michael@0 105 }
michael@0 106 }
michael@0 107 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
michael@0 108 // In that case, bits=1<<limitLead is undefined but the bits value
michael@0 109 // is not used because trail<limitTrail is already false.
michael@0 110 bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
michael@0 111 for(trail=0; trail<limitTrail; ++trail) {
michael@0 112 table[trail]|=bits;
michael@0 113 }
michael@0 114 }
michael@0 115 }
michael@0 116
michael@0 117 void BMPSet::initBits() {
michael@0 118 UChar32 start, limit;
michael@0 119 int32_t listIndex=0;
michael@0 120
michael@0 121 // Set asciiBytes[].
michael@0 122 do {
michael@0 123 start=list[listIndex++];
michael@0 124 if(listIndex<listLength) {
michael@0 125 limit=list[listIndex++];
michael@0 126 } else {
michael@0 127 limit=0x110000;
michael@0 128 }
michael@0 129 if(start>=0x80) {
michael@0 130 break;
michael@0 131 }
michael@0 132 do {
michael@0 133 asciiBytes[start++]=1;
michael@0 134 } while(start<limit && start<0x80);
michael@0 135 } while(limit<=0x80);
michael@0 136
michael@0 137 // Set table7FF[].
michael@0 138 while(start<0x800) {
michael@0 139 set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
michael@0 140 if(limit>0x800) {
michael@0 141 start=0x800;
michael@0 142 break;
michael@0 143 }
michael@0 144
michael@0 145 start=list[listIndex++];
michael@0 146 if(listIndex<listLength) {
michael@0 147 limit=list[listIndex++];
michael@0 148 } else {
michael@0 149 limit=0x110000;
michael@0 150 }
michael@0 151 }
michael@0 152
michael@0 153 // Set bmpBlockBits[].
michael@0 154 int32_t minStart=0x800;
michael@0 155 while(start<0x10000) {
michael@0 156 if(limit>0x10000) {
michael@0 157 limit=0x10000;
michael@0 158 }
michael@0 159
michael@0 160 if(start<minStart) {
michael@0 161 start=minStart;
michael@0 162 }
michael@0 163 if(start<limit) { // Else: Another range entirely in a known mixed-value block.
michael@0 164 if(start&0x3f) {
michael@0 165 // Mixed-value block of 64 code points.
michael@0 166 start>>=6;
michael@0 167 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
michael@0 168 start=(start+1)<<6; // Round up to the next block boundary.
michael@0 169 minStart=start; // Ignore further ranges in this block.
michael@0 170 }
michael@0 171 if(start<limit) {
michael@0 172 if(start<(limit&~0x3f)) {
michael@0 173 // Multiple all-ones blocks of 64 code points each.
michael@0 174 set32x64Bits(bmpBlockBits, start>>6, limit>>6);
michael@0 175 }
michael@0 176
michael@0 177 if(limit&0x3f) {
michael@0 178 // Mixed-value block of 64 code points.
michael@0 179 limit>>=6;
michael@0 180 bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
michael@0 181 limit=(limit+1)<<6; // Round up to the next block boundary.
michael@0 182 minStart=limit; // Ignore further ranges in this block.
michael@0 183 }
michael@0 184 }
michael@0 185 }
michael@0 186
michael@0 187 if(limit==0x10000) {
michael@0 188 break;
michael@0 189 }
michael@0 190
michael@0 191 start=list[listIndex++];
michael@0 192 if(listIndex<listLength) {
michael@0 193 limit=list[listIndex++];
michael@0 194 } else {
michael@0 195 limit=0x110000;
michael@0 196 }
michael@0 197 }
michael@0 198 }
michael@0 199
michael@0 200 /*
michael@0 201 * Override some bits and bytes to the result of contains(FFFD)
michael@0 202 * for faster validity checking at runtime.
michael@0 203 * No need to set 0 values where they were reset to 0 in the constructor
michael@0 204 * and not modified by initBits().
michael@0 205 * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
michael@0 206 * Need to set 0 values for surrogates D800..DFFF.
michael@0 207 */
michael@0 208 void BMPSet::overrideIllegal() {
michael@0 209 uint32_t bits, mask;
michael@0 210 int32_t i;
michael@0 211
michael@0 212 if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
michael@0 213 // contains(FFFD)==TRUE
michael@0 214 for(i=0x80; i<0xc0; ++i) {
michael@0 215 asciiBytes[i]=1;
michael@0 216 }
michael@0 217
michael@0 218 bits=3; // Lead bytes 0xC0 and 0xC1.
michael@0 219 for(i=0; i<64; ++i) {
michael@0 220 table7FF[i]|=bits;
michael@0 221 }
michael@0 222
michael@0 223 bits=1; // Lead byte 0xE0.
michael@0 224 for(i=0; i<32; ++i) { // First half of 4k block.
michael@0 225 bmpBlockBits[i]|=bits;
michael@0 226 }
michael@0 227
michael@0 228 mask=~(0x10001<<0xd); // Lead byte 0xED.
michael@0 229 bits=1<<0xd;
michael@0 230 for(i=32; i<64; ++i) { // Second half of 4k block.
michael@0 231 bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
michael@0 232 }
michael@0 233 } else {
michael@0 234 // contains(FFFD)==FALSE
michael@0 235 mask=~(0x10001<<0xd); // Lead byte 0xED.
michael@0 236 for(i=32; i<64; ++i) { // Second half of 4k block.
michael@0 237 bmpBlockBits[i]&=mask;
michael@0 238 }
michael@0 239 }
michael@0 240 }
michael@0 241
michael@0 242 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
michael@0 243 /* Examples:
michael@0 244 findCodePoint(c)
michael@0 245 set list[] c=0 1 3 4 7 8
michael@0 246 === ============== ===========
michael@0 247 [] [110000] 0 0 0 0 0 0
michael@0 248 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
michael@0 249 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
michael@0 250 [:Any:] [0, 110000] 1 1 1 1 1 1
michael@0 251 */
michael@0 252
michael@0 253 // Return the smallest i such that c < list[i]. Assume
michael@0 254 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
michael@0 255 if (c < list[lo])
michael@0 256 return lo;
michael@0 257 // High runner test. c is often after the last range, so an
michael@0 258 // initial check for this condition pays off.
michael@0 259 if (lo >= hi || c >= list[hi-1])
michael@0 260 return hi;
michael@0 261 // invariant: c >= list[lo]
michael@0 262 // invariant: c < list[hi]
michael@0 263 for (;;) {
michael@0 264 int32_t i = (lo + hi) >> 1;
michael@0 265 if (i == lo) {
michael@0 266 break; // Found!
michael@0 267 } else if (c < list[i]) {
michael@0 268 hi = i;
michael@0 269 } else {
michael@0 270 lo = i;
michael@0 271 }
michael@0 272 }
michael@0 273 return hi;
michael@0 274 }
michael@0 275
michael@0 276 UBool
michael@0 277 BMPSet::contains(UChar32 c) const {
michael@0 278 if((uint32_t)c<=0x7f) {
michael@0 279 return (UBool)asciiBytes[c];
michael@0 280 } else if((uint32_t)c<=0x7ff) {
michael@0 281 return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
michael@0 282 } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
michael@0 283 int lead=c>>12;
michael@0 284 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 285 if(twoBits<=1) {
michael@0 286 // All 64 code points with the same bits 15..6
michael@0 287 // are either in the set or not.
michael@0 288 return (UBool)twoBits;
michael@0 289 } else {
michael@0 290 // Look up the code point in its 4k block of code points.
michael@0 291 return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
michael@0 292 }
michael@0 293 } else if((uint32_t)c<=0x10ffff) {
michael@0 294 // surrogate or supplementary code point
michael@0 295 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
michael@0 296 } else {
michael@0 297 // Out-of-range code points get FALSE, consistent with long-standing
michael@0 298 // behavior of UnicodeSet::contains(c).
michael@0 299 return FALSE;
michael@0 300 }
michael@0 301 }
michael@0 302
michael@0 303 /*
michael@0 304 * Check for sufficient length for trail unit for each surrogate pair.
michael@0 305 * Handle single surrogates as surrogate code points as usual in ICU.
michael@0 306 */
michael@0 307 const UChar *
michael@0 308 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
michael@0 309 UChar c, c2;
michael@0 310
michael@0 311 if(spanCondition) {
michael@0 312 // span
michael@0 313 do {
michael@0 314 c=*s;
michael@0 315 if(c<=0x7f) {
michael@0 316 if(!asciiBytes[c]) {
michael@0 317 break;
michael@0 318 }
michael@0 319 } else if(c<=0x7ff) {
michael@0 320 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
michael@0 321 break;
michael@0 322 }
michael@0 323 } else if(c<0xd800 || c>=0xe000) {
michael@0 324 int lead=c>>12;
michael@0 325 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 326 if(twoBits<=1) {
michael@0 327 // All 64 code points with the same bits 15..6
michael@0 328 // are either in the set or not.
michael@0 329 if(twoBits==0) {
michael@0 330 break;
michael@0 331 }
michael@0 332 } else {
michael@0 333 // Look up the code point in its 4k block of code points.
michael@0 334 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
michael@0 335 break;
michael@0 336 }
michael@0 337 }
michael@0 338 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
michael@0 339 // surrogate code point
michael@0 340 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
michael@0 341 break;
michael@0 342 }
michael@0 343 } else {
michael@0 344 // surrogate pair
michael@0 345 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
michael@0 346 break;
michael@0 347 }
michael@0 348 ++s;
michael@0 349 }
michael@0 350 } while(++s<limit);
michael@0 351 } else {
michael@0 352 // span not
michael@0 353 do {
michael@0 354 c=*s;
michael@0 355 if(c<=0x7f) {
michael@0 356 if(asciiBytes[c]) {
michael@0 357 break;
michael@0 358 }
michael@0 359 } else if(c<=0x7ff) {
michael@0 360 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
michael@0 361 break;
michael@0 362 }
michael@0 363 } else if(c<0xd800 || c>=0xe000) {
michael@0 364 int lead=c>>12;
michael@0 365 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 366 if(twoBits<=1) {
michael@0 367 // All 64 code points with the same bits 15..6
michael@0 368 // are either in the set or not.
michael@0 369 if(twoBits!=0) {
michael@0 370 break;
michael@0 371 }
michael@0 372 } else {
michael@0 373 // Look up the code point in its 4k block of code points.
michael@0 374 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
michael@0 375 break;
michael@0 376 }
michael@0 377 }
michael@0 378 } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
michael@0 379 // surrogate code point
michael@0 380 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
michael@0 381 break;
michael@0 382 }
michael@0 383 } else {
michael@0 384 // surrogate pair
michael@0 385 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
michael@0 386 break;
michael@0 387 }
michael@0 388 ++s;
michael@0 389 }
michael@0 390 } while(++s<limit);
michael@0 391 }
michael@0 392 return s;
michael@0 393 }
michael@0 394
michael@0 395 /* Symmetrical with span(). */
michael@0 396 const UChar *
michael@0 397 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
michael@0 398 UChar c, c2;
michael@0 399
michael@0 400 if(spanCondition) {
michael@0 401 // span
michael@0 402 for(;;) {
michael@0 403 c=*(--limit);
michael@0 404 if(c<=0x7f) {
michael@0 405 if(!asciiBytes[c]) {
michael@0 406 break;
michael@0 407 }
michael@0 408 } else if(c<=0x7ff) {
michael@0 409 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
michael@0 410 break;
michael@0 411 }
michael@0 412 } else if(c<0xd800 || c>=0xe000) {
michael@0 413 int lead=c>>12;
michael@0 414 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 415 if(twoBits<=1) {
michael@0 416 // All 64 code points with the same bits 15..6
michael@0 417 // are either in the set or not.
michael@0 418 if(twoBits==0) {
michael@0 419 break;
michael@0 420 }
michael@0 421 } else {
michael@0 422 // Look up the code point in its 4k block of code points.
michael@0 423 if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
michael@0 424 break;
michael@0 425 }
michael@0 426 }
michael@0 427 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
michael@0 428 // surrogate code point
michael@0 429 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
michael@0 430 break;
michael@0 431 }
michael@0 432 } else {
michael@0 433 // surrogate pair
michael@0 434 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
michael@0 435 break;
michael@0 436 }
michael@0 437 --limit;
michael@0 438 }
michael@0 439 if(s==limit) {
michael@0 440 return s;
michael@0 441 }
michael@0 442 }
michael@0 443 } else {
michael@0 444 // span not
michael@0 445 for(;;) {
michael@0 446 c=*(--limit);
michael@0 447 if(c<=0x7f) {
michael@0 448 if(asciiBytes[c]) {
michael@0 449 break;
michael@0 450 }
michael@0 451 } else if(c<=0x7ff) {
michael@0 452 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
michael@0 453 break;
michael@0 454 }
michael@0 455 } else if(c<0xd800 || c>=0xe000) {
michael@0 456 int lead=c>>12;
michael@0 457 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 458 if(twoBits<=1) {
michael@0 459 // All 64 code points with the same bits 15..6
michael@0 460 // are either in the set or not.
michael@0 461 if(twoBits!=0) {
michael@0 462 break;
michael@0 463 }
michael@0 464 } else {
michael@0 465 // Look up the code point in its 4k block of code points.
michael@0 466 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
michael@0 467 break;
michael@0 468 }
michael@0 469 }
michael@0 470 } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
michael@0 471 // surrogate code point
michael@0 472 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
michael@0 473 break;
michael@0 474 }
michael@0 475 } else {
michael@0 476 // surrogate pair
michael@0 477 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
michael@0 478 break;
michael@0 479 }
michael@0 480 --limit;
michael@0 481 }
michael@0 482 if(s==limit) {
michael@0 483 return s;
michael@0 484 }
michael@0 485 }
michael@0 486 }
michael@0 487 return limit+1;
michael@0 488 }
michael@0 489
michael@0 490 /*
michael@0 491 * Precheck for sufficient trail bytes at end of string only once per span.
michael@0 492 * Check validity.
michael@0 493 */
michael@0 494 const uint8_t *
michael@0 495 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 496 const uint8_t *limit=s+length;
michael@0 497 uint8_t b=*s;
michael@0 498 if((int8_t)b>=0) {
michael@0 499 // Initial all-ASCII span.
michael@0 500 if(spanCondition) {
michael@0 501 do {
michael@0 502 if(!asciiBytes[b] || ++s==limit) {
michael@0 503 return s;
michael@0 504 }
michael@0 505 b=*s;
michael@0 506 } while((int8_t)b>=0);
michael@0 507 } else {
michael@0 508 do {
michael@0 509 if(asciiBytes[b] || ++s==limit) {
michael@0 510 return s;
michael@0 511 }
michael@0 512 b=*s;
michael@0 513 } while((int8_t)b>=0);
michael@0 514 }
michael@0 515 length=(int32_t)(limit-s);
michael@0 516 }
michael@0 517
michael@0 518 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 519 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 520 }
michael@0 521
michael@0 522 const uint8_t *limit0=limit;
michael@0 523
michael@0 524 /*
michael@0 525 * Make sure that the last 1/2/3/4-byte sequence before limit is complete
michael@0 526 * or runs into a lead byte.
michael@0 527 * In the span loop compare s with limit only once
michael@0 528 * per multi-byte character.
michael@0 529 *
michael@0 530 * Give a trailing illegal sequence the same value as the result of contains(FFFD),
michael@0 531 * including it if that is part of the span, otherwise set limit0 to before
michael@0 532 * the truncated sequence.
michael@0 533 */
michael@0 534 b=*(limit-1);
michael@0 535 if((int8_t)b<0) {
michael@0 536 // b>=0x80: lead or trail byte
michael@0 537 if(b<0xc0) {
michael@0 538 // single trail byte, check for preceding 3- or 4-byte lead byte
michael@0 539 if(length>=2 && (b=*(limit-2))>=0xe0) {
michael@0 540 limit-=2;
michael@0 541 if(asciiBytes[0x80]!=spanCondition) {
michael@0 542 limit0=limit;
michael@0 543 }
michael@0 544 } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
michael@0 545 // 4-byte lead byte with only two trail bytes
michael@0 546 limit-=3;
michael@0 547 if(asciiBytes[0x80]!=spanCondition) {
michael@0 548 limit0=limit;
michael@0 549 }
michael@0 550 }
michael@0 551 } else {
michael@0 552 // lead byte with no trail bytes
michael@0 553 --limit;
michael@0 554 if(asciiBytes[0x80]!=spanCondition) {
michael@0 555 limit0=limit;
michael@0 556 }
michael@0 557 }
michael@0 558 }
michael@0 559
michael@0 560 uint8_t t1, t2, t3;
michael@0 561
michael@0 562 while(s<limit) {
michael@0 563 b=*s;
michael@0 564 if(b<0xc0) {
michael@0 565 // ASCII; or trail bytes with the result of contains(FFFD).
michael@0 566 if(spanCondition) {
michael@0 567 do {
michael@0 568 if(!asciiBytes[b]) {
michael@0 569 return s;
michael@0 570 } else if(++s==limit) {
michael@0 571 return limit0;
michael@0 572 }
michael@0 573 b=*s;
michael@0 574 } while(b<0xc0);
michael@0 575 } else {
michael@0 576 do {
michael@0 577 if(asciiBytes[b]) {
michael@0 578 return s;
michael@0 579 } else if(++s==limit) {
michael@0 580 return limit0;
michael@0 581 }
michael@0 582 b=*s;
michael@0 583 } while(b<0xc0);
michael@0 584 }
michael@0 585 }
michael@0 586 ++s; // Advance past the lead byte.
michael@0 587 if(b>=0xe0) {
michael@0 588 if(b<0xf0) {
michael@0 589 if( /* handle U+0000..U+FFFF inline */
michael@0 590 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
michael@0 591 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
michael@0 592 ) {
michael@0 593 b&=0xf;
michael@0 594 uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
michael@0 595 if(twoBits<=1) {
michael@0 596 // All 64 code points with this lead byte and middle trail byte
michael@0 597 // are either in the set or not.
michael@0 598 if(twoBits!=(uint32_t)spanCondition) {
michael@0 599 return s-1;
michael@0 600 }
michael@0 601 } else {
michael@0 602 // Look up the code point in its 4k block of code points.
michael@0 603 UChar32 c=(b<<12)|(t1<<6)|t2;
michael@0 604 if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
michael@0 605 return s-1;
michael@0 606 }
michael@0 607 }
michael@0 608 s+=2;
michael@0 609 continue;
michael@0 610 }
michael@0 611 } else if( /* handle U+10000..U+10FFFF inline */
michael@0 612 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
michael@0 613 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
michael@0 614 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
michael@0 615 ) {
michael@0 616 // Give an illegal sequence the same value as the result of contains(FFFD).
michael@0 617 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
michael@0 618 if( ( (0x10000<=c && c<=0x10ffff) ?
michael@0 619 containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
michael@0 620 asciiBytes[0x80]
michael@0 621 ) != spanCondition
michael@0 622 ) {
michael@0 623 return s-1;
michael@0 624 }
michael@0 625 s+=3;
michael@0 626 continue;
michael@0 627 }
michael@0 628 } else /* 0xc0<=b<0xe0 */ {
michael@0 629 if( /* handle U+0000..U+07FF inline */
michael@0 630 (t1=(uint8_t)(*s-0x80)) <= 0x3f
michael@0 631 ) {
michael@0 632 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
michael@0 633 return s-1;
michael@0 634 }
michael@0 635 ++s;
michael@0 636 continue;
michael@0 637 }
michael@0 638 }
michael@0 639
michael@0 640 // Give an illegal sequence the same value as the result of contains(FFFD).
michael@0 641 // Handle each byte of an illegal sequence separately to simplify the code;
michael@0 642 // no need to optimize error handling.
michael@0 643 if(asciiBytes[0x80]!=spanCondition) {
michael@0 644 return s-1;
michael@0 645 }
michael@0 646 }
michael@0 647
michael@0 648 return limit0;
michael@0 649 }
michael@0 650
michael@0 651 /*
michael@0 652 * While going backwards through UTF-8 optimize only for ASCII.
michael@0 653 * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
michael@0 654 * possible to tell from the last byte in a multi-byte sequence how many
michael@0 655 * preceding bytes there should be. Therefore, going backwards through UTF-8
michael@0 656 * is much harder than going forward.
michael@0 657 */
michael@0 658 int32_t
michael@0 659 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 660 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
michael@0 661 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
michael@0 662 }
michael@0 663
michael@0 664 uint8_t b;
michael@0 665
michael@0 666 do {
michael@0 667 b=s[--length];
michael@0 668 if((int8_t)b>=0) {
michael@0 669 // ASCII sub-span
michael@0 670 if(spanCondition) {
michael@0 671 do {
michael@0 672 if(!asciiBytes[b]) {
michael@0 673 return length+1;
michael@0 674 } else if(length==0) {
michael@0 675 return 0;
michael@0 676 }
michael@0 677 b=s[--length];
michael@0 678 } while((int8_t)b>=0);
michael@0 679 } else {
michael@0 680 do {
michael@0 681 if(asciiBytes[b]) {
michael@0 682 return length+1;
michael@0 683 } else if(length==0) {
michael@0 684 return 0;
michael@0 685 }
michael@0 686 b=s[--length];
michael@0 687 } while((int8_t)b>=0);
michael@0 688 }
michael@0 689 }
michael@0 690
michael@0 691 int32_t prev=length;
michael@0 692 UChar32 c;
michael@0 693 // trail byte: collect a multi-byte character
michael@0 694 // (or lead byte in last-trail position)
michael@0 695 c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
michael@0 696 // c is a valid code point, not ASCII, not a surrogate
michael@0 697 if(c<=0x7ff) {
michael@0 698 if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
michael@0 699 return prev+1;
michael@0 700 }
michael@0 701 } else if(c<=0xffff) {
michael@0 702 int lead=c>>12;
michael@0 703 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
michael@0 704 if(twoBits<=1) {
michael@0 705 // All 64 code points with the same bits 15..6
michael@0 706 // are either in the set or not.
michael@0 707 if(twoBits!=(uint32_t)spanCondition) {
michael@0 708 return prev+1;
michael@0 709 }
michael@0 710 } else {
michael@0 711 // Look up the code point in its 4k block of code points.
michael@0 712 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
michael@0 713 return prev+1;
michael@0 714 }
michael@0 715 }
michael@0 716 } else {
michael@0 717 if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
michael@0 718 return prev+1;
michael@0 719 }
michael@0 720 }
michael@0 721 } while(length>0);
michael@0 722 return 0;
michael@0 723 }
michael@0 724
michael@0 725 U_NAMESPACE_END

mercurial