intl/icu/source/common/bmpset.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/bmpset.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,725 @@
     1.4 +/*
     1.5 +******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2007-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +******************************************************************************
    1.11 +*   file name:  bmpset.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2007jan29
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +#include "unicode/uniset.h"
    1.22 +#include "unicode/utf8.h"
    1.23 +#include "unicode/utf16.h"
    1.24 +#include "cmemory.h"
    1.25 +#include "bmpset.h"
    1.26 +#include "uassert.h"
    1.27 +
    1.28 +U_NAMESPACE_BEGIN
    1.29 +
    1.30 +BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
    1.31 +        list(parentList), listLength(parentListLength) {
    1.32 +    uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
    1.33 +    uprv_memset(table7FF, 0, sizeof(table7FF));
    1.34 +    uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    1.35 +
    1.36 +    /*
    1.37 +     * Set the list indexes for binary searches for
    1.38 +     * U+0800, U+1000, U+2000, .., U+F000, U+10000.
    1.39 +     * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
    1.40 +     * looked up in the bit tables.
    1.41 +     * The last pair of indexes is for finding supplementary code points.
    1.42 +     */
    1.43 +    list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
    1.44 +    int32_t i;
    1.45 +    for(i=1; i<=0x10; ++i) {
    1.46 +        list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
    1.47 +    }
    1.48 +    list4kStarts[0x11]=listLength-1;
    1.49 +
    1.50 +    initBits();
    1.51 +    overrideIllegal();
    1.52 +}
    1.53 +
    1.54 +BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
    1.55 +        list(newParentList), listLength(newParentListLength) {
    1.56 +    uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
    1.57 +    uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    1.58 +    uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    1.59 +    uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    1.60 +}
    1.61 +
    1.62 +BMPSet::~BMPSet() {
    1.63 +}
    1.64 +
    1.65 +/*
    1.66 + * Set bits in a bit rectangle in "vertical" bit organization.
    1.67 + * start<limit<=0x800
    1.68 + */
    1.69 +static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    1.70 +    U_ASSERT(start<limit);
    1.71 +    U_ASSERT(limit<=0x800);
    1.72 +
    1.73 +    int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
    1.74 +    int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
    1.75 +
    1.76 +    // Set one bit indicating an all-one block.
    1.77 +    uint32_t bits=(uint32_t)1<<lead;
    1.78 +    if((start+1)==limit) {  // Single-character shortcut.
    1.79 +        table[trail]|=bits;
    1.80 +        return;
    1.81 +    }
    1.82 +
    1.83 +    int32_t limitLead=limit>>6;
    1.84 +    int32_t limitTrail=limit&0x3f;
    1.85 +
    1.86 +    if(lead==limitLead) {
    1.87 +        // Partial vertical bit column.
    1.88 +        while(trail<limitTrail) {
    1.89 +            table[trail++]|=bits;
    1.90 +        }
    1.91 +    } else {
    1.92 +        // Partial vertical bit column,
    1.93 +        // followed by a bit rectangle,
    1.94 +        // followed by another partial vertical bit column.
    1.95 +        if(trail>0) {
    1.96 +            do {
    1.97 +                table[trail++]|=bits;
    1.98 +            } while(trail<64);
    1.99 +            ++lead;
   1.100 +        }
   1.101 +        if(lead<limitLead) {
   1.102 +            bits=~((1<<lead)-1);
   1.103 +            if(limitLead<0x20) {
   1.104 +                bits&=(1<<limitLead)-1;
   1.105 +            }
   1.106 +            for(trail=0; trail<64; ++trail) {
   1.107 +                table[trail]|=bits;
   1.108 +            }
   1.109 +        }
   1.110 +        // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
   1.111 +        // In that case, bits=1<<limitLead is undefined but the bits value
   1.112 +        // is not used because trail<limitTrail is already false.
   1.113 +        bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
   1.114 +        for(trail=0; trail<limitTrail; ++trail) {
   1.115 +            table[trail]|=bits;
   1.116 +        }
   1.117 +    }
   1.118 +}
   1.119 +
   1.120 +void BMPSet::initBits() {
   1.121 +    UChar32 start, limit;
   1.122 +    int32_t listIndex=0;
   1.123 +
   1.124 +    // Set asciiBytes[].
   1.125 +    do {
   1.126 +        start=list[listIndex++];
   1.127 +        if(listIndex<listLength) {
   1.128 +            limit=list[listIndex++];
   1.129 +        } else {
   1.130 +            limit=0x110000;
   1.131 +        }
   1.132 +        if(start>=0x80) {
   1.133 +            break;
   1.134 +        }
   1.135 +        do {
   1.136 +            asciiBytes[start++]=1;
   1.137 +        } while(start<limit && start<0x80);
   1.138 +    } while(limit<=0x80);
   1.139 +
   1.140 +    // Set table7FF[].
   1.141 +    while(start<0x800) {
   1.142 +        set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
   1.143 +        if(limit>0x800) {
   1.144 +            start=0x800;
   1.145 +            break;
   1.146 +        }
   1.147 +
   1.148 +        start=list[listIndex++];
   1.149 +        if(listIndex<listLength) {
   1.150 +            limit=list[listIndex++];
   1.151 +        } else {
   1.152 +            limit=0x110000;
   1.153 +        }
   1.154 +    }
   1.155 +
   1.156 +    // Set bmpBlockBits[].
   1.157 +    int32_t minStart=0x800;
   1.158 +    while(start<0x10000) {
   1.159 +        if(limit>0x10000) {
   1.160 +            limit=0x10000;
   1.161 +        }
   1.162 +
   1.163 +        if(start<minStart) {
   1.164 +            start=minStart;
   1.165 +        }
   1.166 +        if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
   1.167 +            if(start&0x3f) {
   1.168 +                // Mixed-value block of 64 code points.
   1.169 +                start>>=6;
   1.170 +                bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
   1.171 +                start=(start+1)<<6;  // Round up to the next block boundary.
   1.172 +                minStart=start;      // Ignore further ranges in this block.
   1.173 +            }
   1.174 +            if(start<limit) {
   1.175 +                if(start<(limit&~0x3f)) {
   1.176 +                    // Multiple all-ones blocks of 64 code points each.
   1.177 +                    set32x64Bits(bmpBlockBits, start>>6, limit>>6);
   1.178 +                }
   1.179 +
   1.180 +                if(limit&0x3f) {
   1.181 +                    // Mixed-value block of 64 code points.
   1.182 +                    limit>>=6;
   1.183 +                    bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
   1.184 +                    limit=(limit+1)<<6;  // Round up to the next block boundary.
   1.185 +                    minStart=limit;      // Ignore further ranges in this block.
   1.186 +                }
   1.187 +            }
   1.188 +        }
   1.189 +
   1.190 +        if(limit==0x10000) {
   1.191 +            break;
   1.192 +        }
   1.193 +
   1.194 +        start=list[listIndex++];
   1.195 +        if(listIndex<listLength) {
   1.196 +            limit=list[listIndex++];
   1.197 +        } else {
   1.198 +            limit=0x110000;
   1.199 +        }
   1.200 +    }
   1.201 +}
   1.202 +
   1.203 +/*
   1.204 + * Override some bits and bytes to the result of contains(FFFD)
   1.205 + * for faster validity checking at runtime.
   1.206 + * No need to set 0 values where they were reset to 0 in the constructor
   1.207 + * and not modified by initBits().
   1.208 + * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
   1.209 + * Need to set 0 values for surrogates D800..DFFF.
   1.210 + */
   1.211 +void BMPSet::overrideIllegal() {
   1.212 +    uint32_t bits, mask;
   1.213 +    int32_t i;
   1.214 +
   1.215 +    if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
   1.216 +        // contains(FFFD)==TRUE
   1.217 +        for(i=0x80; i<0xc0; ++i) {
   1.218 +            asciiBytes[i]=1;
   1.219 +        }
   1.220 +
   1.221 +        bits=3;                 // Lead bytes 0xC0 and 0xC1.
   1.222 +        for(i=0; i<64; ++i) {
   1.223 +            table7FF[i]|=bits;
   1.224 +        }
   1.225 +
   1.226 +        bits=1;                 // Lead byte 0xE0.
   1.227 +        for(i=0; i<32; ++i) {   // First half of 4k block.
   1.228 +            bmpBlockBits[i]|=bits;
   1.229 +        }
   1.230 +
   1.231 +        mask=~(0x10001<<0xd);   // Lead byte 0xED.
   1.232 +        bits=1<<0xd;
   1.233 +        for(i=32; i<64; ++i) {  // Second half of 4k block.
   1.234 +            bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
   1.235 +        }
   1.236 +    } else {
   1.237 +        // contains(FFFD)==FALSE
   1.238 +        mask=~(0x10001<<0xd);   // Lead byte 0xED.
   1.239 +        for(i=32; i<64; ++i) {  // Second half of 4k block.
   1.240 +            bmpBlockBits[i]&=mask;
   1.241 +        }
   1.242 +    }
   1.243 +}
   1.244 +
   1.245 +int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
   1.246 +    /* Examples:
   1.247 +                                       findCodePoint(c)
   1.248 +       set              list[]         c=0 1 3 4 7 8
   1.249 +       ===              ==============   ===========
   1.250 +       []               [110000]         0 0 0 0 0 0
   1.251 +       [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
   1.252 +       [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
   1.253 +       [:Any:]          [0, 110000]      1 1 1 1 1 1
   1.254 +     */
   1.255 +
   1.256 +    // Return the smallest i such that c < list[i].  Assume
   1.257 +    // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
   1.258 +    if (c < list[lo])
   1.259 +        return lo;
   1.260 +    // High runner test.  c is often after the last range, so an
   1.261 +    // initial check for this condition pays off.
   1.262 +    if (lo >= hi || c >= list[hi-1])
   1.263 +        return hi;
   1.264 +    // invariant: c >= list[lo]
   1.265 +    // invariant: c < list[hi]
   1.266 +    for (;;) {
   1.267 +        int32_t i = (lo + hi) >> 1;
   1.268 +        if (i == lo) {
   1.269 +            break; // Found!
   1.270 +        } else if (c < list[i]) {
   1.271 +            hi = i;
   1.272 +        } else {
   1.273 +            lo = i;
   1.274 +        }
   1.275 +    }
   1.276 +    return hi;
   1.277 +}
   1.278 +
   1.279 +UBool
   1.280 +BMPSet::contains(UChar32 c) const {
   1.281 +    if((uint32_t)c<=0x7f) {
   1.282 +        return (UBool)asciiBytes[c];
   1.283 +    } else if((uint32_t)c<=0x7ff) {
   1.284 +        return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
   1.285 +    } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
   1.286 +        int lead=c>>12;
   1.287 +        uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.288 +        if(twoBits<=1) {
   1.289 +            // All 64 code points with the same bits 15..6
   1.290 +            // are either in the set or not.
   1.291 +            return (UBool)twoBits;
   1.292 +        } else {
   1.293 +            // Look up the code point in its 4k block of code points.
   1.294 +            return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
   1.295 +        }
   1.296 +    } else if((uint32_t)c<=0x10ffff) {
   1.297 +        // surrogate or supplementary code point
   1.298 +        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
   1.299 +    } else {
   1.300 +        // Out-of-range code points get FALSE, consistent with long-standing
   1.301 +        // behavior of UnicodeSet::contains(c).
   1.302 +        return FALSE;
   1.303 +    }
   1.304 +}
   1.305 +
   1.306 +/*
   1.307 + * Check for sufficient length for trail unit for each surrogate pair.
   1.308 + * Handle single surrogates as surrogate code points as usual in ICU.
   1.309 + */
   1.310 +const UChar *
   1.311 +BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
   1.312 +    UChar c, c2;
   1.313 +
   1.314 +    if(spanCondition) {
   1.315 +        // span
   1.316 +        do {
   1.317 +            c=*s;
   1.318 +            if(c<=0x7f) {
   1.319 +                if(!asciiBytes[c]) {
   1.320 +                    break;
   1.321 +                }
   1.322 +            } else if(c<=0x7ff) {
   1.323 +                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
   1.324 +                    break;
   1.325 +                }
   1.326 +            } else if(c<0xd800 || c>=0xe000) {
   1.327 +                int lead=c>>12;
   1.328 +                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.329 +                if(twoBits<=1) {
   1.330 +                    // All 64 code points with the same bits 15..6
   1.331 +                    // are either in the set or not.
   1.332 +                    if(twoBits==0) {
   1.333 +                        break;
   1.334 +                    }
   1.335 +                } else {
   1.336 +                    // Look up the code point in its 4k block of code points.
   1.337 +                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   1.338 +                        break;
   1.339 +                    }
   1.340 +                }
   1.341 +            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
   1.342 +                // surrogate code point
   1.343 +                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   1.344 +                    break;
   1.345 +                }
   1.346 +            } else {
   1.347 +                // surrogate pair
   1.348 +                if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
   1.349 +                    break;
   1.350 +                }
   1.351 +                ++s;
   1.352 +            }
   1.353 +        } while(++s<limit);
   1.354 +    } else {
   1.355 +        // span not
   1.356 +        do {
   1.357 +            c=*s;
   1.358 +            if(c<=0x7f) {
   1.359 +                if(asciiBytes[c]) {
   1.360 +                    break;
   1.361 +                }
   1.362 +            } else if(c<=0x7ff) {
   1.363 +                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
   1.364 +                    break;
   1.365 +                }
   1.366 +            } else if(c<0xd800 || c>=0xe000) {
   1.367 +                int lead=c>>12;
   1.368 +                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.369 +                if(twoBits<=1) {
   1.370 +                    // All 64 code points with the same bits 15..6
   1.371 +                    // are either in the set or not.
   1.372 +                    if(twoBits!=0) {
   1.373 +                        break;
   1.374 +                    }
   1.375 +                } else {
   1.376 +                    // Look up the code point in its 4k block of code points.
   1.377 +                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   1.378 +                        break;
   1.379 +                    }
   1.380 +                }
   1.381 +            } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
   1.382 +                // surrogate code point
   1.383 +                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   1.384 +                    break;
   1.385 +                }
   1.386 +            } else {
   1.387 +                // surrogate pair
   1.388 +                if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
   1.389 +                    break;
   1.390 +                }
   1.391 +                ++s;
   1.392 +            }
   1.393 +        } while(++s<limit);
   1.394 +    }
   1.395 +    return s;
   1.396 +}
   1.397 +
   1.398 +/* Symmetrical with span(). */
   1.399 +const UChar *
   1.400 +BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
   1.401 +    UChar c, c2;
   1.402 +
   1.403 +    if(spanCondition) {
   1.404 +        // span
   1.405 +        for(;;) {
   1.406 +            c=*(--limit);
   1.407 +            if(c<=0x7f) {
   1.408 +                if(!asciiBytes[c]) {
   1.409 +                    break;
   1.410 +                }
   1.411 +            } else if(c<=0x7ff) {
   1.412 +                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
   1.413 +                    break;
   1.414 +                }
   1.415 +            } else if(c<0xd800 || c>=0xe000) {
   1.416 +                int lead=c>>12;
   1.417 +                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.418 +                if(twoBits<=1) {
   1.419 +                    // All 64 code points with the same bits 15..6
   1.420 +                    // are either in the set or not.
   1.421 +                    if(twoBits==0) {
   1.422 +                        break;
   1.423 +                    }
   1.424 +                } else {
   1.425 +                    // Look up the code point in its 4k block of code points.
   1.426 +                    if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   1.427 +                        break;
   1.428 +                    }
   1.429 +                }
   1.430 +            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
   1.431 +                // surrogate code point
   1.432 +                if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   1.433 +                    break;
   1.434 +                }
   1.435 +            } else {
   1.436 +                // surrogate pair
   1.437 +                if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
   1.438 +                    break;
   1.439 +                }
   1.440 +                --limit;
   1.441 +            }
   1.442 +            if(s==limit) {
   1.443 +                return s;
   1.444 +            }
   1.445 +        }
   1.446 +    } else {
   1.447 +        // span not
   1.448 +        for(;;) {
   1.449 +            c=*(--limit);
   1.450 +            if(c<=0x7f) {
   1.451 +                if(asciiBytes[c]) {
   1.452 +                    break;
   1.453 +                }
   1.454 +            } else if(c<=0x7ff) {
   1.455 +                if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
   1.456 +                    break;
   1.457 +                }
   1.458 +            } else if(c<0xd800 || c>=0xe000) {
   1.459 +                int lead=c>>12;
   1.460 +                uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.461 +                if(twoBits<=1) {
   1.462 +                    // All 64 code points with the same bits 15..6
   1.463 +                    // are either in the set or not.
   1.464 +                    if(twoBits!=0) {
   1.465 +                        break;
   1.466 +                    }
   1.467 +                } else {
   1.468 +                    // Look up the code point in its 4k block of code points.
   1.469 +                    if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   1.470 +                        break;
   1.471 +                    }
   1.472 +                }
   1.473 +            } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
   1.474 +                // surrogate code point
   1.475 +                if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   1.476 +                    break;
   1.477 +                }
   1.478 +            } else {
   1.479 +                // surrogate pair
   1.480 +                if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
   1.481 +                    break;
   1.482 +                }
   1.483 +                --limit;
   1.484 +            }
   1.485 +            if(s==limit) {
   1.486 +                return s;
   1.487 +            }
   1.488 +        }
   1.489 +    }
   1.490 +    return limit+1;
   1.491 +}
   1.492 +
   1.493 +/*
   1.494 + * Precheck for sufficient trail bytes at end of string only once per span.
   1.495 + * Check validity.
   1.496 + */
   1.497 +const uint8_t *
   1.498 +BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1.499 +    const uint8_t *limit=s+length;
   1.500 +    uint8_t b=*s;
   1.501 +    if((int8_t)b>=0) {
   1.502 +        // Initial all-ASCII span.
   1.503 +        if(spanCondition) {
   1.504 +            do {
   1.505 +                if(!asciiBytes[b] || ++s==limit) {
   1.506 +                    return s;
   1.507 +                }
   1.508 +                b=*s;
   1.509 +            } while((int8_t)b>=0);
   1.510 +        } else {
   1.511 +            do {
   1.512 +                if(asciiBytes[b] || ++s==limit) {
   1.513 +                    return s;
   1.514 +                }
   1.515 +                b=*s;
   1.516 +            } while((int8_t)b>=0);
   1.517 +        }
   1.518 +        length=(int32_t)(limit-s);
   1.519 +    }
   1.520 +
   1.521 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   1.522 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   1.523 +    }
   1.524 +
   1.525 +    const uint8_t *limit0=limit;
   1.526 +
   1.527 +    /*
   1.528 +     * Make sure that the last 1/2/3/4-byte sequence before limit is complete
   1.529 +     * or runs into a lead byte.
   1.530 +     * In the span loop compare s with limit only once
   1.531 +     * per multi-byte character.
   1.532 +     *
   1.533 +     * Give a trailing illegal sequence the same value as the result of contains(FFFD),
   1.534 +     * including it if that is part of the span, otherwise set limit0 to before
   1.535 +     * the truncated sequence.
   1.536 +     */
   1.537 +    b=*(limit-1);
   1.538 +    if((int8_t)b<0) {
   1.539 +        // b>=0x80: lead or trail byte
   1.540 +        if(b<0xc0) {
   1.541 +            // single trail byte, check for preceding 3- or 4-byte lead byte
   1.542 +            if(length>=2 && (b=*(limit-2))>=0xe0) {
   1.543 +                limit-=2;
   1.544 +                if(asciiBytes[0x80]!=spanCondition) {
   1.545 +                    limit0=limit;
   1.546 +                }
   1.547 +            } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
   1.548 +                // 4-byte lead byte with only two trail bytes
   1.549 +                limit-=3;
   1.550 +                if(asciiBytes[0x80]!=spanCondition) {
   1.551 +                    limit0=limit;
   1.552 +                }
   1.553 +            }
   1.554 +        } else {
   1.555 +            // lead byte with no trail bytes
   1.556 +            --limit;
   1.557 +            if(asciiBytes[0x80]!=spanCondition) {
   1.558 +                limit0=limit;
   1.559 +            }
   1.560 +        }
   1.561 +    }
   1.562 +
   1.563 +    uint8_t t1, t2, t3;
   1.564 +
   1.565 +    while(s<limit) {
   1.566 +        b=*s;
   1.567 +        if(b<0xc0) {
   1.568 +            // ASCII; or trail bytes with the result of contains(FFFD).
   1.569 +            if(spanCondition) {
   1.570 +                do {
   1.571 +                    if(!asciiBytes[b]) {
   1.572 +                        return s;
   1.573 +                    } else if(++s==limit) {
   1.574 +                        return limit0;
   1.575 +                    }
   1.576 +                    b=*s;
   1.577 +                } while(b<0xc0);
   1.578 +            } else {
   1.579 +                do {
   1.580 +                    if(asciiBytes[b]) {
   1.581 +                        return s;
   1.582 +                    } else if(++s==limit) {
   1.583 +                        return limit0;
   1.584 +                    }
   1.585 +                    b=*s;
   1.586 +                } while(b<0xc0);
   1.587 +            }
   1.588 +        }
   1.589 +        ++s;  // Advance past the lead byte.
   1.590 +        if(b>=0xe0) {
   1.591 +            if(b<0xf0) {
   1.592 +                if( /* handle U+0000..U+FFFF inline */
   1.593 +                    (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
   1.594 +                    (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
   1.595 +                ) {
   1.596 +                    b&=0xf;
   1.597 +                    uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
   1.598 +                    if(twoBits<=1) {
   1.599 +                        // All 64 code points with this lead byte and middle trail byte
   1.600 +                        // are either in the set or not.
   1.601 +                        if(twoBits!=(uint32_t)spanCondition) {
   1.602 +                            return s-1;
   1.603 +                        }
   1.604 +                    } else {
   1.605 +                        // Look up the code point in its 4k block of code points.
   1.606 +                        UChar32 c=(b<<12)|(t1<<6)|t2;
   1.607 +                        if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
   1.608 +                            return s-1;
   1.609 +                        }
   1.610 +                    }
   1.611 +                    s+=2;
   1.612 +                    continue;
   1.613 +                }
   1.614 +            } else if( /* handle U+10000..U+10FFFF inline */
   1.615 +                (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
   1.616 +                (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
   1.617 +                (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
   1.618 +            ) {
   1.619 +                // Give an illegal sequence the same value as the result of contains(FFFD).
   1.620 +                UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
   1.621 +                if( (   (0x10000<=c && c<=0x10ffff) ?
   1.622 +                            containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
   1.623 +                            asciiBytes[0x80]
   1.624 +                    ) != spanCondition
   1.625 +                ) {
   1.626 +                    return s-1;
   1.627 +                }
   1.628 +                s+=3;
   1.629 +                continue;
   1.630 +            }
   1.631 +        } else /* 0xc0<=b<0xe0 */ {
   1.632 +            if( /* handle U+0000..U+07FF inline */
   1.633 +                (t1=(uint8_t)(*s-0x80)) <= 0x3f
   1.634 +            ) {
   1.635 +                if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
   1.636 +                    return s-1;
   1.637 +                }
   1.638 +                ++s;
   1.639 +                continue;
   1.640 +            }
   1.641 +        }
   1.642 +
   1.643 +        // Give an illegal sequence the same value as the result of contains(FFFD).
   1.644 +        // Handle each byte of an illegal sequence separately to simplify the code;
   1.645 +        // no need to optimize error handling.
   1.646 +        if(asciiBytes[0x80]!=spanCondition) {
   1.647 +            return s-1;
   1.648 +        }
   1.649 +    }
   1.650 +
   1.651 +    return limit0;
   1.652 +}
   1.653 +
   1.654 +/*
   1.655 + * While going backwards through UTF-8 optimize only for ASCII.
   1.656 + * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
   1.657 + * possible to tell from the last byte in a multi-byte sequence how many
   1.658 + * preceding bytes there should be. Therefore, going backwards through UTF-8
   1.659 + * is much harder than going forward.
   1.660 + */
   1.661 +int32_t
   1.662 +BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   1.663 +    if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   1.664 +        spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   1.665 +    }
   1.666 +
   1.667 +    uint8_t b;
   1.668 +
   1.669 +    do {
   1.670 +        b=s[--length];
   1.671 +        if((int8_t)b>=0) {
   1.672 +            // ASCII sub-span
   1.673 +            if(spanCondition) {
   1.674 +                do {
   1.675 +                    if(!asciiBytes[b]) {
   1.676 +                        return length+1;
   1.677 +                    } else if(length==0) {
   1.678 +                        return 0;
   1.679 +                    }
   1.680 +                    b=s[--length];
   1.681 +                } while((int8_t)b>=0);
   1.682 +            } else {
   1.683 +                do {
   1.684 +                    if(asciiBytes[b]) {
   1.685 +                        return length+1;
   1.686 +                    } else if(length==0) {
   1.687 +                        return 0;
   1.688 +                    }
   1.689 +                    b=s[--length];
   1.690 +                } while((int8_t)b>=0);
   1.691 +            }
   1.692 +        }
   1.693 +
   1.694 +        int32_t prev=length;
   1.695 +        UChar32 c;
   1.696 +        // trail byte: collect a multi-byte character
   1.697 +        // (or  lead byte in last-trail position)
   1.698 +        c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
   1.699 +        // c is a valid code point, not ASCII, not a surrogate
   1.700 +        if(c<=0x7ff) {
   1.701 +            if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
   1.702 +                return prev+1;
   1.703 +            }
   1.704 +        } else if(c<=0xffff) {
   1.705 +            int lead=c>>12;
   1.706 +            uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   1.707 +            if(twoBits<=1) {
   1.708 +                // All 64 code points with the same bits 15..6
   1.709 +                // are either in the set or not.
   1.710 +                if(twoBits!=(uint32_t)spanCondition) {
   1.711 +                    return prev+1;
   1.712 +                }
   1.713 +            } else {
   1.714 +                // Look up the code point in its 4k block of code points.
   1.715 +                if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
   1.716 +                    return prev+1;
   1.717 +                }
   1.718 +            }
   1.719 +        } else {
   1.720 +            if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
   1.721 +                return prev+1;
   1.722 +            }
   1.723 +        }
   1.724 +    } while(length>0);
   1.725 +    return 0;
   1.726 +}
   1.727 +
   1.728 +U_NAMESPACE_END

mercurial