Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ****************************************************************************** |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2007-2012, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ****************************************************************************** |
michael@0 | 8 | * file name: bmpset.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2007jan29 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | #include "unicode/uniset.h" |
michael@0 | 19 | #include "unicode/utf8.h" |
michael@0 | 20 | #include "unicode/utf16.h" |
michael@0 | 21 | #include "cmemory.h" |
michael@0 | 22 | #include "bmpset.h" |
michael@0 | 23 | #include "uassert.h" |
michael@0 | 24 | |
michael@0 | 25 | U_NAMESPACE_BEGIN |
michael@0 | 26 | |
michael@0 | 27 | BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : |
michael@0 | 28 | list(parentList), listLength(parentListLength) { |
michael@0 | 29 | uprv_memset(asciiBytes, 0, sizeof(asciiBytes)); |
michael@0 | 30 | uprv_memset(table7FF, 0, sizeof(table7FF)); |
michael@0 | 31 | uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); |
michael@0 | 32 | |
michael@0 | 33 | /* |
michael@0 | 34 | * Set the list indexes for binary searches for |
michael@0 | 35 | * U+0800, U+1000, U+2000, .., U+F000, U+10000. |
michael@0 | 36 | * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are |
michael@0 | 37 | * looked up in the bit tables. |
michael@0 | 38 | * The last pair of indexes is for finding supplementary code points. |
michael@0 | 39 | */ |
michael@0 | 40 | list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); |
michael@0 | 41 | int32_t i; |
michael@0 | 42 | for(i=1; i<=0x10; ++i) { |
michael@0 | 43 | list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); |
michael@0 | 44 | } |
michael@0 | 45 | list4kStarts[0x11]=listLength-1; |
michael@0 | 46 | |
michael@0 | 47 | initBits(); |
michael@0 | 48 | overrideIllegal(); |
michael@0 | 49 | } |
michael@0 | 50 | |
michael@0 | 51 | BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : |
michael@0 | 52 | list(newParentList), listLength(newParentListLength) { |
michael@0 | 53 | uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes)); |
michael@0 | 54 | uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); |
michael@0 | 55 | uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); |
michael@0 | 56 | uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); |
michael@0 | 57 | } |
michael@0 | 58 | |
michael@0 | 59 | BMPSet::~BMPSet() { |
michael@0 | 60 | } |
michael@0 | 61 | |
michael@0 | 62 | /* |
michael@0 | 63 | * Set bits in a bit rectangle in "vertical" bit organization. |
michael@0 | 64 | * start<limit<=0x800 |
michael@0 | 65 | */ |
michael@0 | 66 | static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { |
michael@0 | 67 | U_ASSERT(start<limit); |
michael@0 | 68 | U_ASSERT(limit<=0x800); |
michael@0 | 69 | |
michael@0 | 70 | int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. |
michael@0 | 71 | int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. |
michael@0 | 72 | |
michael@0 | 73 | // Set one bit indicating an all-one block. |
michael@0 | 74 | uint32_t bits=(uint32_t)1<<lead; |
michael@0 | 75 | if((start+1)==limit) { // Single-character shortcut. |
michael@0 | 76 | table[trail]|=bits; |
michael@0 | 77 | return; |
michael@0 | 78 | } |
michael@0 | 79 | |
michael@0 | 80 | int32_t limitLead=limit>>6; |
michael@0 | 81 | int32_t limitTrail=limit&0x3f; |
michael@0 | 82 | |
michael@0 | 83 | if(lead==limitLead) { |
michael@0 | 84 | // Partial vertical bit column. |
michael@0 | 85 | while(trail<limitTrail) { |
michael@0 | 86 | table[trail++]|=bits; |
michael@0 | 87 | } |
michael@0 | 88 | } else { |
michael@0 | 89 | // Partial vertical bit column, |
michael@0 | 90 | // followed by a bit rectangle, |
michael@0 | 91 | // followed by another partial vertical bit column. |
michael@0 | 92 | if(trail>0) { |
michael@0 | 93 | do { |
michael@0 | 94 | table[trail++]|=bits; |
michael@0 | 95 | } while(trail<64); |
michael@0 | 96 | ++lead; |
michael@0 | 97 | } |
michael@0 | 98 | if(lead<limitLead) { |
michael@0 | 99 | bits=~((1<<lead)-1); |
michael@0 | 100 | if(limitLead<0x20) { |
michael@0 | 101 | bits&=(1<<limitLead)-1; |
michael@0 | 102 | } |
michael@0 | 103 | for(trail=0; trail<64; ++trail) { |
michael@0 | 104 | table[trail]|=bits; |
michael@0 | 105 | } |
michael@0 | 106 | } |
michael@0 | 107 | // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. |
michael@0 | 108 | // In that case, bits=1<<limitLead is undefined but the bits value |
michael@0 | 109 | // is not used because trail<limitTrail is already false. |
michael@0 | 110 | bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); |
michael@0 | 111 | for(trail=0; trail<limitTrail; ++trail) { |
michael@0 | 112 | table[trail]|=bits; |
michael@0 | 113 | } |
michael@0 | 114 | } |
michael@0 | 115 | } |
michael@0 | 116 | |
michael@0 | 117 | void BMPSet::initBits() { |
michael@0 | 118 | UChar32 start, limit; |
michael@0 | 119 | int32_t listIndex=0; |
michael@0 | 120 | |
michael@0 | 121 | // Set asciiBytes[]. |
michael@0 | 122 | do { |
michael@0 | 123 | start=list[listIndex++]; |
michael@0 | 124 | if(listIndex<listLength) { |
michael@0 | 125 | limit=list[listIndex++]; |
michael@0 | 126 | } else { |
michael@0 | 127 | limit=0x110000; |
michael@0 | 128 | } |
michael@0 | 129 | if(start>=0x80) { |
michael@0 | 130 | break; |
michael@0 | 131 | } |
michael@0 | 132 | do { |
michael@0 | 133 | asciiBytes[start++]=1; |
michael@0 | 134 | } while(start<limit && start<0x80); |
michael@0 | 135 | } while(limit<=0x80); |
michael@0 | 136 | |
michael@0 | 137 | // Set table7FF[]. |
michael@0 | 138 | while(start<0x800) { |
michael@0 | 139 | set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); |
michael@0 | 140 | if(limit>0x800) { |
michael@0 | 141 | start=0x800; |
michael@0 | 142 | break; |
michael@0 | 143 | } |
michael@0 | 144 | |
michael@0 | 145 | start=list[listIndex++]; |
michael@0 | 146 | if(listIndex<listLength) { |
michael@0 | 147 | limit=list[listIndex++]; |
michael@0 | 148 | } else { |
michael@0 | 149 | limit=0x110000; |
michael@0 | 150 | } |
michael@0 | 151 | } |
michael@0 | 152 | |
michael@0 | 153 | // Set bmpBlockBits[]. |
michael@0 | 154 | int32_t minStart=0x800; |
michael@0 | 155 | while(start<0x10000) { |
michael@0 | 156 | if(limit>0x10000) { |
michael@0 | 157 | limit=0x10000; |
michael@0 | 158 | } |
michael@0 | 159 | |
michael@0 | 160 | if(start<minStart) { |
michael@0 | 161 | start=minStart; |
michael@0 | 162 | } |
michael@0 | 163 | if(start<limit) { // Else: Another range entirely in a known mixed-value block. |
michael@0 | 164 | if(start&0x3f) { |
michael@0 | 165 | // Mixed-value block of 64 code points. |
michael@0 | 166 | start>>=6; |
michael@0 | 167 | bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); |
michael@0 | 168 | start=(start+1)<<6; // Round up to the next block boundary. |
michael@0 | 169 | minStart=start; // Ignore further ranges in this block. |
michael@0 | 170 | } |
michael@0 | 171 | if(start<limit) { |
michael@0 | 172 | if(start<(limit&~0x3f)) { |
michael@0 | 173 | // Multiple all-ones blocks of 64 code points each. |
michael@0 | 174 | set32x64Bits(bmpBlockBits, start>>6, limit>>6); |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | if(limit&0x3f) { |
michael@0 | 178 | // Mixed-value block of 64 code points. |
michael@0 | 179 | limit>>=6; |
michael@0 | 180 | bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); |
michael@0 | 181 | limit=(limit+1)<<6; // Round up to the next block boundary. |
michael@0 | 182 | minStart=limit; // Ignore further ranges in this block. |
michael@0 | 183 | } |
michael@0 | 184 | } |
michael@0 | 185 | } |
michael@0 | 186 | |
michael@0 | 187 | if(limit==0x10000) { |
michael@0 | 188 | break; |
michael@0 | 189 | } |
michael@0 | 190 | |
michael@0 | 191 | start=list[listIndex++]; |
michael@0 | 192 | if(listIndex<listLength) { |
michael@0 | 193 | limit=list[listIndex++]; |
michael@0 | 194 | } else { |
michael@0 | 195 | limit=0x110000; |
michael@0 | 196 | } |
michael@0 | 197 | } |
michael@0 | 198 | } |
michael@0 | 199 | |
michael@0 | 200 | /* |
michael@0 | 201 | * Override some bits and bytes to the result of contains(FFFD) |
michael@0 | 202 | * for faster validity checking at runtime. |
michael@0 | 203 | * No need to set 0 values where they were reset to 0 in the constructor |
michael@0 | 204 | * and not modified by initBits(). |
michael@0 | 205 | * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF) |
michael@0 | 206 | * Need to set 0 values for surrogates D800..DFFF. |
michael@0 | 207 | */ |
michael@0 | 208 | void BMPSet::overrideIllegal() { |
michael@0 | 209 | uint32_t bits, mask; |
michael@0 | 210 | int32_t i; |
michael@0 | 211 | |
michael@0 | 212 | if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) { |
michael@0 | 213 | // contains(FFFD)==TRUE |
michael@0 | 214 | for(i=0x80; i<0xc0; ++i) { |
michael@0 | 215 | asciiBytes[i]=1; |
michael@0 | 216 | } |
michael@0 | 217 | |
michael@0 | 218 | bits=3; // Lead bytes 0xC0 and 0xC1. |
michael@0 | 219 | for(i=0; i<64; ++i) { |
michael@0 | 220 | table7FF[i]|=bits; |
michael@0 | 221 | } |
michael@0 | 222 | |
michael@0 | 223 | bits=1; // Lead byte 0xE0. |
michael@0 | 224 | for(i=0; i<32; ++i) { // First half of 4k block. |
michael@0 | 225 | bmpBlockBits[i]|=bits; |
michael@0 | 226 | } |
michael@0 | 227 | |
michael@0 | 228 | mask=~(0x10001<<0xd); // Lead byte 0xED. |
michael@0 | 229 | bits=1<<0xd; |
michael@0 | 230 | for(i=32; i<64; ++i) { // Second half of 4k block. |
michael@0 | 231 | bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; |
michael@0 | 232 | } |
michael@0 | 233 | } else { |
michael@0 | 234 | // contains(FFFD)==FALSE |
michael@0 | 235 | mask=~(0x10001<<0xd); // Lead byte 0xED. |
michael@0 | 236 | for(i=32; i<64; ++i) { // Second half of 4k block. |
michael@0 | 237 | bmpBlockBits[i]&=mask; |
michael@0 | 238 | } |
michael@0 | 239 | } |
michael@0 | 240 | } |
michael@0 | 241 | |
michael@0 | 242 | int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { |
michael@0 | 243 | /* Examples: |
michael@0 | 244 | findCodePoint(c) |
michael@0 | 245 | set list[] c=0 1 3 4 7 8 |
michael@0 | 246 | === ============== =========== |
michael@0 | 247 | [] [110000] 0 0 0 0 0 0 |
michael@0 | 248 | [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 |
michael@0 | 249 | [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 |
michael@0 | 250 | [:Any:] [0, 110000] 1 1 1 1 1 1 |
michael@0 | 251 | */ |
michael@0 | 252 | |
michael@0 | 253 | // Return the smallest i such that c < list[i]. Assume |
michael@0 | 254 | // list[len - 1] == HIGH and that c is legal (0..HIGH-1). |
michael@0 | 255 | if (c < list[lo]) |
michael@0 | 256 | return lo; |
michael@0 | 257 | // High runner test. c is often after the last range, so an |
michael@0 | 258 | // initial check for this condition pays off. |
michael@0 | 259 | if (lo >= hi || c >= list[hi-1]) |
michael@0 | 260 | return hi; |
michael@0 | 261 | // invariant: c >= list[lo] |
michael@0 | 262 | // invariant: c < list[hi] |
michael@0 | 263 | for (;;) { |
michael@0 | 264 | int32_t i = (lo + hi) >> 1; |
michael@0 | 265 | if (i == lo) { |
michael@0 | 266 | break; // Found! |
michael@0 | 267 | } else if (c < list[i]) { |
michael@0 | 268 | hi = i; |
michael@0 | 269 | } else { |
michael@0 | 270 | lo = i; |
michael@0 | 271 | } |
michael@0 | 272 | } |
michael@0 | 273 | return hi; |
michael@0 | 274 | } |
michael@0 | 275 | |
michael@0 | 276 | UBool |
michael@0 | 277 | BMPSet::contains(UChar32 c) const { |
michael@0 | 278 | if((uint32_t)c<=0x7f) { |
michael@0 | 279 | return (UBool)asciiBytes[c]; |
michael@0 | 280 | } else if((uint32_t)c<=0x7ff) { |
michael@0 | 281 | return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); |
michael@0 | 282 | } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { |
michael@0 | 283 | int lead=c>>12; |
michael@0 | 284 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 285 | if(twoBits<=1) { |
michael@0 | 286 | // All 64 code points with the same bits 15..6 |
michael@0 | 287 | // are either in the set or not. |
michael@0 | 288 | return (UBool)twoBits; |
michael@0 | 289 | } else { |
michael@0 | 290 | // Look up the code point in its 4k block of code points. |
michael@0 | 291 | return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); |
michael@0 | 292 | } |
michael@0 | 293 | } else if((uint32_t)c<=0x10ffff) { |
michael@0 | 294 | // surrogate or supplementary code point |
michael@0 | 295 | return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); |
michael@0 | 296 | } else { |
michael@0 | 297 | // Out-of-range code points get FALSE, consistent with long-standing |
michael@0 | 298 | // behavior of UnicodeSet::contains(c). |
michael@0 | 299 | return FALSE; |
michael@0 | 300 | } |
michael@0 | 301 | } |
michael@0 | 302 | |
michael@0 | 303 | /* |
michael@0 | 304 | * Check for sufficient length for trail unit for each surrogate pair. |
michael@0 | 305 | * Handle single surrogates as surrogate code points as usual in ICU. |
michael@0 | 306 | */ |
michael@0 | 307 | const UChar * |
michael@0 | 308 | BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { |
michael@0 | 309 | UChar c, c2; |
michael@0 | 310 | |
michael@0 | 311 | if(spanCondition) { |
michael@0 | 312 | // span |
michael@0 | 313 | do { |
michael@0 | 314 | c=*s; |
michael@0 | 315 | if(c<=0x7f) { |
michael@0 | 316 | if(!asciiBytes[c]) { |
michael@0 | 317 | break; |
michael@0 | 318 | } |
michael@0 | 319 | } else if(c<=0x7ff) { |
michael@0 | 320 | if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { |
michael@0 | 321 | break; |
michael@0 | 322 | } |
michael@0 | 323 | } else if(c<0xd800 || c>=0xe000) { |
michael@0 | 324 | int lead=c>>12; |
michael@0 | 325 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 326 | if(twoBits<=1) { |
michael@0 | 327 | // All 64 code points with the same bits 15..6 |
michael@0 | 328 | // are either in the set or not. |
michael@0 | 329 | if(twoBits==0) { |
michael@0 | 330 | break; |
michael@0 | 331 | } |
michael@0 | 332 | } else { |
michael@0 | 333 | // Look up the code point in its 4k block of code points. |
michael@0 | 334 | if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
michael@0 | 335 | break; |
michael@0 | 336 | } |
michael@0 | 337 | } |
michael@0 | 338 | } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { |
michael@0 | 339 | // surrogate code point |
michael@0 | 340 | if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
michael@0 | 341 | break; |
michael@0 | 342 | } |
michael@0 | 343 | } else { |
michael@0 | 344 | // surrogate pair |
michael@0 | 345 | if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { |
michael@0 | 346 | break; |
michael@0 | 347 | } |
michael@0 | 348 | ++s; |
michael@0 | 349 | } |
michael@0 | 350 | } while(++s<limit); |
michael@0 | 351 | } else { |
michael@0 | 352 | // span not |
michael@0 | 353 | do { |
michael@0 | 354 | c=*s; |
michael@0 | 355 | if(c<=0x7f) { |
michael@0 | 356 | if(asciiBytes[c]) { |
michael@0 | 357 | break; |
michael@0 | 358 | } |
michael@0 | 359 | } else if(c<=0x7ff) { |
michael@0 | 360 | if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { |
michael@0 | 361 | break; |
michael@0 | 362 | } |
michael@0 | 363 | } else if(c<0xd800 || c>=0xe000) { |
michael@0 | 364 | int lead=c>>12; |
michael@0 | 365 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 366 | if(twoBits<=1) { |
michael@0 | 367 | // All 64 code points with the same bits 15..6 |
michael@0 | 368 | // are either in the set or not. |
michael@0 | 369 | if(twoBits!=0) { |
michael@0 | 370 | break; |
michael@0 | 371 | } |
michael@0 | 372 | } else { |
michael@0 | 373 | // Look up the code point in its 4k block of code points. |
michael@0 | 374 | if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
michael@0 | 375 | break; |
michael@0 | 376 | } |
michael@0 | 377 | } |
michael@0 | 378 | } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { |
michael@0 | 379 | // surrogate code point |
michael@0 | 380 | if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
michael@0 | 381 | break; |
michael@0 | 382 | } |
michael@0 | 383 | } else { |
michael@0 | 384 | // surrogate pair |
michael@0 | 385 | if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { |
michael@0 | 386 | break; |
michael@0 | 387 | } |
michael@0 | 388 | ++s; |
michael@0 | 389 | } |
michael@0 | 390 | } while(++s<limit); |
michael@0 | 391 | } |
michael@0 | 392 | return s; |
michael@0 | 393 | } |
michael@0 | 394 | |
michael@0 | 395 | /* Symmetrical with span(). */ |
michael@0 | 396 | const UChar * |
michael@0 | 397 | BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { |
michael@0 | 398 | UChar c, c2; |
michael@0 | 399 | |
michael@0 | 400 | if(spanCondition) { |
michael@0 | 401 | // span |
michael@0 | 402 | for(;;) { |
michael@0 | 403 | c=*(--limit); |
michael@0 | 404 | if(c<=0x7f) { |
michael@0 | 405 | if(!asciiBytes[c]) { |
michael@0 | 406 | break; |
michael@0 | 407 | } |
michael@0 | 408 | } else if(c<=0x7ff) { |
michael@0 | 409 | if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { |
michael@0 | 410 | break; |
michael@0 | 411 | } |
michael@0 | 412 | } else if(c<0xd800 || c>=0xe000) { |
michael@0 | 413 | int lead=c>>12; |
michael@0 | 414 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 415 | if(twoBits<=1) { |
michael@0 | 416 | // All 64 code points with the same bits 15..6 |
michael@0 | 417 | // are either in the set or not. |
michael@0 | 418 | if(twoBits==0) { |
michael@0 | 419 | break; |
michael@0 | 420 | } |
michael@0 | 421 | } else { |
michael@0 | 422 | // Look up the code point in its 4k block of code points. |
michael@0 | 423 | if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
michael@0 | 424 | break; |
michael@0 | 425 | } |
michael@0 | 426 | } |
michael@0 | 427 | } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { |
michael@0 | 428 | // surrogate code point |
michael@0 | 429 | if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
michael@0 | 430 | break; |
michael@0 | 431 | } |
michael@0 | 432 | } else { |
michael@0 | 433 | // surrogate pair |
michael@0 | 434 | if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { |
michael@0 | 435 | break; |
michael@0 | 436 | } |
michael@0 | 437 | --limit; |
michael@0 | 438 | } |
michael@0 | 439 | if(s==limit) { |
michael@0 | 440 | return s; |
michael@0 | 441 | } |
michael@0 | 442 | } |
michael@0 | 443 | } else { |
michael@0 | 444 | // span not |
michael@0 | 445 | for(;;) { |
michael@0 | 446 | c=*(--limit); |
michael@0 | 447 | if(c<=0x7f) { |
michael@0 | 448 | if(asciiBytes[c]) { |
michael@0 | 449 | break; |
michael@0 | 450 | } |
michael@0 | 451 | } else if(c<=0x7ff) { |
michael@0 | 452 | if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { |
michael@0 | 453 | break; |
michael@0 | 454 | } |
michael@0 | 455 | } else if(c<0xd800 || c>=0xe000) { |
michael@0 | 456 | int lead=c>>12; |
michael@0 | 457 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 458 | if(twoBits<=1) { |
michael@0 | 459 | // All 64 code points with the same bits 15..6 |
michael@0 | 460 | // are either in the set or not. |
michael@0 | 461 | if(twoBits!=0) { |
michael@0 | 462 | break; |
michael@0 | 463 | } |
michael@0 | 464 | } else { |
michael@0 | 465 | // Look up the code point in its 4k block of code points. |
michael@0 | 466 | if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { |
michael@0 | 467 | break; |
michael@0 | 468 | } |
michael@0 | 469 | } |
michael@0 | 470 | } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { |
michael@0 | 471 | // surrogate code point |
michael@0 | 472 | if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { |
michael@0 | 473 | break; |
michael@0 | 474 | } |
michael@0 | 475 | } else { |
michael@0 | 476 | // surrogate pair |
michael@0 | 477 | if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { |
michael@0 | 478 | break; |
michael@0 | 479 | } |
michael@0 | 480 | --limit; |
michael@0 | 481 | } |
michael@0 | 482 | if(s==limit) { |
michael@0 | 483 | return s; |
michael@0 | 484 | } |
michael@0 | 485 | } |
michael@0 | 486 | } |
michael@0 | 487 | return limit+1; |
michael@0 | 488 | } |
michael@0 | 489 | |
michael@0 | 490 | /* |
michael@0 | 491 | * Precheck for sufficient trail bytes at end of string only once per span. |
michael@0 | 492 | * Check validity. |
michael@0 | 493 | */ |
michael@0 | 494 | const uint8_t * |
michael@0 | 495 | BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 496 | const uint8_t *limit=s+length; |
michael@0 | 497 | uint8_t b=*s; |
michael@0 | 498 | if((int8_t)b>=0) { |
michael@0 | 499 | // Initial all-ASCII span. |
michael@0 | 500 | if(spanCondition) { |
michael@0 | 501 | do { |
michael@0 | 502 | if(!asciiBytes[b] || ++s==limit) { |
michael@0 | 503 | return s; |
michael@0 | 504 | } |
michael@0 | 505 | b=*s; |
michael@0 | 506 | } while((int8_t)b>=0); |
michael@0 | 507 | } else { |
michael@0 | 508 | do { |
michael@0 | 509 | if(asciiBytes[b] || ++s==limit) { |
michael@0 | 510 | return s; |
michael@0 | 511 | } |
michael@0 | 512 | b=*s; |
michael@0 | 513 | } while((int8_t)b>=0); |
michael@0 | 514 | } |
michael@0 | 515 | length=(int32_t)(limit-s); |
michael@0 | 516 | } |
michael@0 | 517 | |
michael@0 | 518 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 519 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 520 | } |
michael@0 | 521 | |
michael@0 | 522 | const uint8_t *limit0=limit; |
michael@0 | 523 | |
michael@0 | 524 | /* |
michael@0 | 525 | * Make sure that the last 1/2/3/4-byte sequence before limit is complete |
michael@0 | 526 | * or runs into a lead byte. |
michael@0 | 527 | * In the span loop compare s with limit only once |
michael@0 | 528 | * per multi-byte character. |
michael@0 | 529 | * |
michael@0 | 530 | * Give a trailing illegal sequence the same value as the result of contains(FFFD), |
michael@0 | 531 | * including it if that is part of the span, otherwise set limit0 to before |
michael@0 | 532 | * the truncated sequence. |
michael@0 | 533 | */ |
michael@0 | 534 | b=*(limit-1); |
michael@0 | 535 | if((int8_t)b<0) { |
michael@0 | 536 | // b>=0x80: lead or trail byte |
michael@0 | 537 | if(b<0xc0) { |
michael@0 | 538 | // single trail byte, check for preceding 3- or 4-byte lead byte |
michael@0 | 539 | if(length>=2 && (b=*(limit-2))>=0xe0) { |
michael@0 | 540 | limit-=2; |
michael@0 | 541 | if(asciiBytes[0x80]!=spanCondition) { |
michael@0 | 542 | limit0=limit; |
michael@0 | 543 | } |
michael@0 | 544 | } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { |
michael@0 | 545 | // 4-byte lead byte with only two trail bytes |
michael@0 | 546 | limit-=3; |
michael@0 | 547 | if(asciiBytes[0x80]!=spanCondition) { |
michael@0 | 548 | limit0=limit; |
michael@0 | 549 | } |
michael@0 | 550 | } |
michael@0 | 551 | } else { |
michael@0 | 552 | // lead byte with no trail bytes |
michael@0 | 553 | --limit; |
michael@0 | 554 | if(asciiBytes[0x80]!=spanCondition) { |
michael@0 | 555 | limit0=limit; |
michael@0 | 556 | } |
michael@0 | 557 | } |
michael@0 | 558 | } |
michael@0 | 559 | |
michael@0 | 560 | uint8_t t1, t2, t3; |
michael@0 | 561 | |
michael@0 | 562 | while(s<limit) { |
michael@0 | 563 | b=*s; |
michael@0 | 564 | if(b<0xc0) { |
michael@0 | 565 | // ASCII; or trail bytes with the result of contains(FFFD). |
michael@0 | 566 | if(spanCondition) { |
michael@0 | 567 | do { |
michael@0 | 568 | if(!asciiBytes[b]) { |
michael@0 | 569 | return s; |
michael@0 | 570 | } else if(++s==limit) { |
michael@0 | 571 | return limit0; |
michael@0 | 572 | } |
michael@0 | 573 | b=*s; |
michael@0 | 574 | } while(b<0xc0); |
michael@0 | 575 | } else { |
michael@0 | 576 | do { |
michael@0 | 577 | if(asciiBytes[b]) { |
michael@0 | 578 | return s; |
michael@0 | 579 | } else if(++s==limit) { |
michael@0 | 580 | return limit0; |
michael@0 | 581 | } |
michael@0 | 582 | b=*s; |
michael@0 | 583 | } while(b<0xc0); |
michael@0 | 584 | } |
michael@0 | 585 | } |
michael@0 | 586 | ++s; // Advance past the lead byte. |
michael@0 | 587 | if(b>=0xe0) { |
michael@0 | 588 | if(b<0xf0) { |
michael@0 | 589 | if( /* handle U+0000..U+FFFF inline */ |
michael@0 | 590 | (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && |
michael@0 | 591 | (t2=(uint8_t)(s[1]-0x80)) <= 0x3f |
michael@0 | 592 | ) { |
michael@0 | 593 | b&=0xf; |
michael@0 | 594 | uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; |
michael@0 | 595 | if(twoBits<=1) { |
michael@0 | 596 | // All 64 code points with this lead byte and middle trail byte |
michael@0 | 597 | // are either in the set or not. |
michael@0 | 598 | if(twoBits!=(uint32_t)spanCondition) { |
michael@0 | 599 | return s-1; |
michael@0 | 600 | } |
michael@0 | 601 | } else { |
michael@0 | 602 | // Look up the code point in its 4k block of code points. |
michael@0 | 603 | UChar32 c=(b<<12)|(t1<<6)|t2; |
michael@0 | 604 | if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { |
michael@0 | 605 | return s-1; |
michael@0 | 606 | } |
michael@0 | 607 | } |
michael@0 | 608 | s+=2; |
michael@0 | 609 | continue; |
michael@0 | 610 | } |
michael@0 | 611 | } else if( /* handle U+10000..U+10FFFF inline */ |
michael@0 | 612 | (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && |
michael@0 | 613 | (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && |
michael@0 | 614 | (t3=(uint8_t)(s[2]-0x80)) <= 0x3f |
michael@0 | 615 | ) { |
michael@0 | 616 | // Give an illegal sequence the same value as the result of contains(FFFD). |
michael@0 | 617 | UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; |
michael@0 | 618 | if( ( (0x10000<=c && c<=0x10ffff) ? |
michael@0 | 619 | containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : |
michael@0 | 620 | asciiBytes[0x80] |
michael@0 | 621 | ) != spanCondition |
michael@0 | 622 | ) { |
michael@0 | 623 | return s-1; |
michael@0 | 624 | } |
michael@0 | 625 | s+=3; |
michael@0 | 626 | continue; |
michael@0 | 627 | } |
michael@0 | 628 | } else /* 0xc0<=b<0xe0 */ { |
michael@0 | 629 | if( /* handle U+0000..U+07FF inline */ |
michael@0 | 630 | (t1=(uint8_t)(*s-0x80)) <= 0x3f |
michael@0 | 631 | ) { |
michael@0 | 632 | if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { |
michael@0 | 633 | return s-1; |
michael@0 | 634 | } |
michael@0 | 635 | ++s; |
michael@0 | 636 | continue; |
michael@0 | 637 | } |
michael@0 | 638 | } |
michael@0 | 639 | |
michael@0 | 640 | // Give an illegal sequence the same value as the result of contains(FFFD). |
michael@0 | 641 | // Handle each byte of an illegal sequence separately to simplify the code; |
michael@0 | 642 | // no need to optimize error handling. |
michael@0 | 643 | if(asciiBytes[0x80]!=spanCondition) { |
michael@0 | 644 | return s-1; |
michael@0 | 645 | } |
michael@0 | 646 | } |
michael@0 | 647 | |
michael@0 | 648 | return limit0; |
michael@0 | 649 | } |
michael@0 | 650 | |
michael@0 | 651 | /* |
michael@0 | 652 | * While going backwards through UTF-8 optimize only for ASCII. |
michael@0 | 653 | * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not |
michael@0 | 654 | * possible to tell from the last byte in a multi-byte sequence how many |
michael@0 | 655 | * preceding bytes there should be. Therefore, going backwards through UTF-8 |
michael@0 | 656 | * is much harder than going forward. |
michael@0 | 657 | */ |
michael@0 | 658 | int32_t |
michael@0 | 659 | BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { |
michael@0 | 660 | if(spanCondition!=USET_SPAN_NOT_CONTAINED) { |
michael@0 | 661 | spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. |
michael@0 | 662 | } |
michael@0 | 663 | |
michael@0 | 664 | uint8_t b; |
michael@0 | 665 | |
michael@0 | 666 | do { |
michael@0 | 667 | b=s[--length]; |
michael@0 | 668 | if((int8_t)b>=0) { |
michael@0 | 669 | // ASCII sub-span |
michael@0 | 670 | if(spanCondition) { |
michael@0 | 671 | do { |
michael@0 | 672 | if(!asciiBytes[b]) { |
michael@0 | 673 | return length+1; |
michael@0 | 674 | } else if(length==0) { |
michael@0 | 675 | return 0; |
michael@0 | 676 | } |
michael@0 | 677 | b=s[--length]; |
michael@0 | 678 | } while((int8_t)b>=0); |
michael@0 | 679 | } else { |
michael@0 | 680 | do { |
michael@0 | 681 | if(asciiBytes[b]) { |
michael@0 | 682 | return length+1; |
michael@0 | 683 | } else if(length==0) { |
michael@0 | 684 | return 0; |
michael@0 | 685 | } |
michael@0 | 686 | b=s[--length]; |
michael@0 | 687 | } while((int8_t)b>=0); |
michael@0 | 688 | } |
michael@0 | 689 | } |
michael@0 | 690 | |
michael@0 | 691 | int32_t prev=length; |
michael@0 | 692 | UChar32 c; |
michael@0 | 693 | // trail byte: collect a multi-byte character |
michael@0 | 694 | // (or lead byte in last-trail position) |
michael@0 | 695 | c=utf8_prevCharSafeBody(s, 0, &length, b, -3); |
michael@0 | 696 | // c is a valid code point, not ASCII, not a surrogate |
michael@0 | 697 | if(c<=0x7ff) { |
michael@0 | 698 | if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { |
michael@0 | 699 | return prev+1; |
michael@0 | 700 | } |
michael@0 | 701 | } else if(c<=0xffff) { |
michael@0 | 702 | int lead=c>>12; |
michael@0 | 703 | uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; |
michael@0 | 704 | if(twoBits<=1) { |
michael@0 | 705 | // All 64 code points with the same bits 15..6 |
michael@0 | 706 | // are either in the set or not. |
michael@0 | 707 | if(twoBits!=(uint32_t)spanCondition) { |
michael@0 | 708 | return prev+1; |
michael@0 | 709 | } |
michael@0 | 710 | } else { |
michael@0 | 711 | // Look up the code point in its 4k block of code points. |
michael@0 | 712 | if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { |
michael@0 | 713 | return prev+1; |
michael@0 | 714 | } |
michael@0 | 715 | } |
michael@0 | 716 | } else { |
michael@0 | 717 | if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { |
michael@0 | 718 | return prev+1; |
michael@0 | 719 | } |
michael@0 | 720 | } |
michael@0 | 721 | } while(length>0); |
michael@0 | 722 | return 0; |
michael@0 | 723 | } |
michael@0 | 724 | |
michael@0 | 725 | U_NAMESPACE_END |