intl/icu/source/common/bmpset.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2007-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  bmpset.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2007jan29
    14 *   created by: Markus W. Scherer
    15 */
    17 #include "unicode/utypes.h"
    18 #include "unicode/uniset.h"
    19 #include "unicode/utf8.h"
    20 #include "unicode/utf16.h"
    21 #include "cmemory.h"
    22 #include "bmpset.h"
    23 #include "uassert.h"
    25 U_NAMESPACE_BEGIN
    27 BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) :
    28         list(parentList), listLength(parentListLength) {
    29     uprv_memset(asciiBytes, 0, sizeof(asciiBytes));
    30     uprv_memset(table7FF, 0, sizeof(table7FF));
    31     uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits));
    33     /*
    34      * Set the list indexes for binary searches for
    35      * U+0800, U+1000, U+2000, .., U+F000, U+10000.
    36      * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are
    37      * looked up in the bit tables.
    38      * The last pair of indexes is for finding supplementary code points.
    39      */
    40     list4kStarts[0]=findCodePoint(0x800, 0, listLength-1);
    41     int32_t i;
    42     for(i=1; i<=0x10; ++i) {
    43         list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1);
    44     }
    45     list4kStarts[0x11]=listLength-1;
    47     initBits();
    48     overrideIllegal();
    49 }
    51 BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) :
    52         list(newParentList), listLength(newParentListLength) {
    53     uprv_memcpy(asciiBytes, otherBMPSet.asciiBytes, sizeof(asciiBytes));
    54     uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF));
    55     uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits));
    56     uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts));
    57 }
    59 BMPSet::~BMPSet() {
    60 }
    62 /*
    63  * Set bits in a bit rectangle in "vertical" bit organization.
    64  * start<limit<=0x800
    65  */
    66 static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) {
    67     U_ASSERT(start<limit);
    68     U_ASSERT(limit<=0x800);
    70     int32_t lead=start>>6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
    71     int32_t trail=start&0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.
    73     // Set one bit indicating an all-one block.
    74     uint32_t bits=(uint32_t)1<<lead;
    75     if((start+1)==limit) {  // Single-character shortcut.
    76         table[trail]|=bits;
    77         return;
    78     }
    80     int32_t limitLead=limit>>6;
    81     int32_t limitTrail=limit&0x3f;
    83     if(lead==limitLead) {
    84         // Partial vertical bit column.
    85         while(trail<limitTrail) {
    86             table[trail++]|=bits;
    87         }
    88     } else {
    89         // Partial vertical bit column,
    90         // followed by a bit rectangle,
    91         // followed by another partial vertical bit column.
    92         if(trail>0) {
    93             do {
    94                 table[trail++]|=bits;
    95             } while(trail<64);
    96             ++lead;
    97         }
    98         if(lead<limitLead) {
    99             bits=~((1<<lead)-1);
   100             if(limitLead<0x20) {
   101                 bits&=(1<<limitLead)-1;
   102             }
   103             for(trail=0; trail<64; ++trail) {
   104                 table[trail]|=bits;
   105             }
   106         }
   107         // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
   108         // In that case, bits=1<<limitLead is undefined but the bits value
   109         // is not used because trail<limitTrail is already false.
   110         bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead);
   111         for(trail=0; trail<limitTrail; ++trail) {
   112             table[trail]|=bits;
   113         }
   114     }
   115 }
   117 void BMPSet::initBits() {
   118     UChar32 start, limit;
   119     int32_t listIndex=0;
   121     // Set asciiBytes[].
   122     do {
   123         start=list[listIndex++];
   124         if(listIndex<listLength) {
   125             limit=list[listIndex++];
   126         } else {
   127             limit=0x110000;
   128         }
   129         if(start>=0x80) {
   130             break;
   131         }
   132         do {
   133             asciiBytes[start++]=1;
   134         } while(start<limit && start<0x80);
   135     } while(limit<=0x80);
   137     // Set table7FF[].
   138     while(start<0x800) {
   139         set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800);
   140         if(limit>0x800) {
   141             start=0x800;
   142             break;
   143         }
   145         start=list[listIndex++];
   146         if(listIndex<listLength) {
   147             limit=list[listIndex++];
   148         } else {
   149             limit=0x110000;
   150         }
   151     }
   153     // Set bmpBlockBits[].
   154     int32_t minStart=0x800;
   155     while(start<0x10000) {
   156         if(limit>0x10000) {
   157             limit=0x10000;
   158         }
   160         if(start<minStart) {
   161             start=minStart;
   162         }
   163         if(start<limit) {  // Else: Another range entirely in a known mixed-value block.
   164             if(start&0x3f) {
   165                 // Mixed-value block of 64 code points.
   166                 start>>=6;
   167                 bmpBlockBits[start&0x3f]|=0x10001<<(start>>6);
   168                 start=(start+1)<<6;  // Round up to the next block boundary.
   169                 minStart=start;      // Ignore further ranges in this block.
   170             }
   171             if(start<limit) {
   172                 if(start<(limit&~0x3f)) {
   173                     // Multiple all-ones blocks of 64 code points each.
   174                     set32x64Bits(bmpBlockBits, start>>6, limit>>6);
   175                 }
   177                 if(limit&0x3f) {
   178                     // Mixed-value block of 64 code points.
   179                     limit>>=6;
   180                     bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6);
   181                     limit=(limit+1)<<6;  // Round up to the next block boundary.
   182                     minStart=limit;      // Ignore further ranges in this block.
   183                 }
   184             }
   185         }
   187         if(limit==0x10000) {
   188             break;
   189         }
   191         start=list[listIndex++];
   192         if(listIndex<listLength) {
   193             limit=list[listIndex++];
   194         } else {
   195             limit=0x110000;
   196         }
   197     }
   198 }
   200 /*
   201  * Override some bits and bytes to the result of contains(FFFD)
   202  * for faster validity checking at runtime.
   203  * No need to set 0 values where they were reset to 0 in the constructor
   204  * and not modified by initBits().
   205  * (asciiBytes[] trail bytes, table7FF[] 0..7F, bmpBlockBits[] 0..7FF)
   206  * Need to set 0 values for surrogates D800..DFFF.
   207  */
   208 void BMPSet::overrideIllegal() {
   209     uint32_t bits, mask;
   210     int32_t i;
   212     if(containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10])) {
   213         // contains(FFFD)==TRUE
   214         for(i=0x80; i<0xc0; ++i) {
   215             asciiBytes[i]=1;
   216         }
   218         bits=3;                 // Lead bytes 0xC0 and 0xC1.
   219         for(i=0; i<64; ++i) {
   220             table7FF[i]|=bits;
   221         }
   223         bits=1;                 // Lead byte 0xE0.
   224         for(i=0; i<32; ++i) {   // First half of 4k block.
   225             bmpBlockBits[i]|=bits;
   226         }
   228         mask=~(0x10001<<0xd);   // Lead byte 0xED.
   229         bits=1<<0xd;
   230         for(i=32; i<64; ++i) {  // Second half of 4k block.
   231             bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits;
   232         }
   233     } else {
   234         // contains(FFFD)==FALSE
   235         mask=~(0x10001<<0xd);   // Lead byte 0xED.
   236         for(i=32; i<64; ++i) {  // Second half of 4k block.
   237             bmpBlockBits[i]&=mask;
   238         }
   239     }
   240 }
   242 int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const {
   243     /* Examples:
   244                                        findCodePoint(c)
   245        set              list[]         c=0 1 3 4 7 8
   246        ===              ==============   ===========
   247        []               [110000]         0 0 0 0 0 0
   248        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
   249        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
   250        [:Any:]          [0, 110000]      1 1 1 1 1 1
   251      */
   253     // Return the smallest i such that c < list[i].  Assume
   254     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
   255     if (c < list[lo])
   256         return lo;
   257     // High runner test.  c is often after the last range, so an
   258     // initial check for this condition pays off.
   259     if (lo >= hi || c >= list[hi-1])
   260         return hi;
   261     // invariant: c >= list[lo]
   262     // invariant: c < list[hi]
   263     for (;;) {
   264         int32_t i = (lo + hi) >> 1;
   265         if (i == lo) {
   266             break; // Found!
   267         } else if (c < list[i]) {
   268             hi = i;
   269         } else {
   270             lo = i;
   271         }
   272     }
   273     return hi;
   274 }
   276 UBool
   277 BMPSet::contains(UChar32 c) const {
   278     if((uint32_t)c<=0x7f) {
   279         return (UBool)asciiBytes[c];
   280     } else if((uint32_t)c<=0x7ff) {
   281         return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0);
   282     } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) {
   283         int lead=c>>12;
   284         uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   285         if(twoBits<=1) {
   286             // All 64 code points with the same bits 15..6
   287             // are either in the set or not.
   288             return (UBool)twoBits;
   289         } else {
   290             // Look up the code point in its 4k block of code points.
   291             return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]);
   292         }
   293     } else if((uint32_t)c<=0x10ffff) {
   294         // surrogate or supplementary code point
   295         return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
   296     } else {
   297         // Out-of-range code points get FALSE, consistent with long-standing
   298         // behavior of UnicodeSet::contains(c).
   299         return FALSE;
   300     }
   301 }
   303 /*
   304  * Check for sufficient length for trail unit for each surrogate pair.
   305  * Handle single surrogates as surrogate code points as usual in ICU.
   306  */
   307 const UChar *
   308 BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
   309     UChar c, c2;
   311     if(spanCondition) {
   312         // span
   313         do {
   314             c=*s;
   315             if(c<=0x7f) {
   316                 if(!asciiBytes[c]) {
   317                     break;
   318                 }
   319             } else if(c<=0x7ff) {
   320                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
   321                     break;
   322                 }
   323             } else if(c<0xd800 || c>=0xe000) {
   324                 int lead=c>>12;
   325                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   326                 if(twoBits<=1) {
   327                     // All 64 code points with the same bits 15..6
   328                     // are either in the set or not.
   329                     if(twoBits==0) {
   330                         break;
   331                     }
   332                 } else {
   333                     // Look up the code point in its 4k block of code points.
   334                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   335                         break;
   336                     }
   337                 }
   338             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
   339                 // surrogate code point
   340                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   341                     break;
   342                 }
   343             } else {
   344                 // surrogate pair
   345                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
   346                     break;
   347                 }
   348                 ++s;
   349             }
   350         } while(++s<limit);
   351     } else {
   352         // span not
   353         do {
   354             c=*s;
   355             if(c<=0x7f) {
   356                 if(asciiBytes[c]) {
   357                     break;
   358                 }
   359             } else if(c<=0x7ff) {
   360                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
   361                     break;
   362                 }
   363             } else if(c<0xd800 || c>=0xe000) {
   364                 int lead=c>>12;
   365                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   366                 if(twoBits<=1) {
   367                     // All 64 code points with the same bits 15..6
   368                     // are either in the set or not.
   369                     if(twoBits!=0) {
   370                         break;
   371                     }
   372                 } else {
   373                     // Look up the code point in its 4k block of code points.
   374                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   375                         break;
   376                     }
   377                 }
   378             } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) {
   379                 // surrogate code point
   380                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   381                     break;
   382                 }
   383             } else {
   384                 // surrogate pair
   385                 if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) {
   386                     break;
   387                 }
   388                 ++s;
   389             }
   390         } while(++s<limit);
   391     }
   392     return s;
   393 }
   395 /* Symmetrical with span(). */
   396 const UChar *
   397 BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const {
   398     UChar c, c2;
   400     if(spanCondition) {
   401         // span
   402         for(;;) {
   403             c=*(--limit);
   404             if(c<=0x7f) {
   405                 if(!asciiBytes[c]) {
   406                     break;
   407                 }
   408             } else if(c<=0x7ff) {
   409                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) {
   410                     break;
   411                 }
   412             } else if(c<0xd800 || c>=0xe000) {
   413                 int lead=c>>12;
   414                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   415                 if(twoBits<=1) {
   416                     // All 64 code points with the same bits 15..6
   417                     // are either in the set or not.
   418                     if(twoBits==0) {
   419                         break;
   420                     }
   421                 } else {
   422                     // Look up the code point in its 4k block of code points.
   423                     if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   424                         break;
   425                     }
   426                 }
   427             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
   428                 // surrogate code point
   429                 if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   430                     break;
   431                 }
   432             } else {
   433                 // surrogate pair
   434                 if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
   435                     break;
   436                 }
   437                 --limit;
   438             }
   439             if(s==limit) {
   440                 return s;
   441             }
   442         }
   443     } else {
   444         // span not
   445         for(;;) {
   446             c=*(--limit);
   447             if(c<=0x7f) {
   448                 if(asciiBytes[c]) {
   449                     break;
   450                 }
   451             } else if(c<=0x7ff) {
   452                 if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) {
   453                     break;
   454                 }
   455             } else if(c<0xd800 || c>=0xe000) {
   456                 int lead=c>>12;
   457                 uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   458                 if(twoBits<=1) {
   459                     // All 64 code points with the same bits 15..6
   460                     // are either in the set or not.
   461                     if(twoBits!=0) {
   462                         break;
   463                     }
   464                 } else {
   465                     // Look up the code point in its 4k block of code points.
   466                     if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) {
   467                         break;
   468                     }
   469                 }
   470             } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) {
   471                 // surrogate code point
   472                 if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) {
   473                     break;
   474                 }
   475             } else {
   476                 // surrogate pair
   477                 if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) {
   478                     break;
   479                 }
   480                 --limit;
   481             }
   482             if(s==limit) {
   483                 return s;
   484             }
   485         }
   486     }
   487     return limit+1;
   488 }
   490 /*
   491  * Precheck for sufficient trail bytes at end of string only once per span.
   492  * Check validity.
   493  */
   494 const uint8_t *
   495 BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   496     const uint8_t *limit=s+length;
   497     uint8_t b=*s;
   498     if((int8_t)b>=0) {
   499         // Initial all-ASCII span.
   500         if(spanCondition) {
   501             do {
   502                 if(!asciiBytes[b] || ++s==limit) {
   503                     return s;
   504                 }
   505                 b=*s;
   506             } while((int8_t)b>=0);
   507         } else {
   508             do {
   509                 if(asciiBytes[b] || ++s==limit) {
   510                     return s;
   511                 }
   512                 b=*s;
   513             } while((int8_t)b>=0);
   514         }
   515         length=(int32_t)(limit-s);
   516     }
   518     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   519         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   520     }
   522     const uint8_t *limit0=limit;
   524     /*
   525      * Make sure that the last 1/2/3/4-byte sequence before limit is complete
   526      * or runs into a lead byte.
   527      * In the span loop compare s with limit only once
   528      * per multi-byte character.
   529      *
   530      * Give a trailing illegal sequence the same value as the result of contains(FFFD),
   531      * including it if that is part of the span, otherwise set limit0 to before
   532      * the truncated sequence.
   533      */
   534     b=*(limit-1);
   535     if((int8_t)b<0) {
   536         // b>=0x80: lead or trail byte
   537         if(b<0xc0) {
   538             // single trail byte, check for preceding 3- or 4-byte lead byte
   539             if(length>=2 && (b=*(limit-2))>=0xe0) {
   540                 limit-=2;
   541                 if(asciiBytes[0x80]!=spanCondition) {
   542                     limit0=limit;
   543                 }
   544             } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) {
   545                 // 4-byte lead byte with only two trail bytes
   546                 limit-=3;
   547                 if(asciiBytes[0x80]!=spanCondition) {
   548                     limit0=limit;
   549                 }
   550             }
   551         } else {
   552             // lead byte with no trail bytes
   553             --limit;
   554             if(asciiBytes[0x80]!=spanCondition) {
   555                 limit0=limit;
   556             }
   557         }
   558     }
   560     uint8_t t1, t2, t3;
   562     while(s<limit) {
   563         b=*s;
   564         if(b<0xc0) {
   565             // ASCII; or trail bytes with the result of contains(FFFD).
   566             if(spanCondition) {
   567                 do {
   568                     if(!asciiBytes[b]) {
   569                         return s;
   570                     } else if(++s==limit) {
   571                         return limit0;
   572                     }
   573                     b=*s;
   574                 } while(b<0xc0);
   575             } else {
   576                 do {
   577                     if(asciiBytes[b]) {
   578                         return s;
   579                     } else if(++s==limit) {
   580                         return limit0;
   581                     }
   582                     b=*s;
   583                 } while(b<0xc0);
   584             }
   585         }
   586         ++s;  // Advance past the lead byte.
   587         if(b>=0xe0) {
   588             if(b<0xf0) {
   589                 if( /* handle U+0000..U+FFFF inline */
   590                     (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
   591                     (t2=(uint8_t)(s[1]-0x80)) <= 0x3f
   592                 ) {
   593                     b&=0xf;
   594                     uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001;
   595                     if(twoBits<=1) {
   596                         // All 64 code points with this lead byte and middle trail byte
   597                         // are either in the set or not.
   598                         if(twoBits!=(uint32_t)spanCondition) {
   599                             return s-1;
   600                         }
   601                     } else {
   602                         // Look up the code point in its 4k block of code points.
   603                         UChar32 c=(b<<12)|(t1<<6)|t2;
   604                         if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) {
   605                             return s-1;
   606                         }
   607                     }
   608                     s+=2;
   609                     continue;
   610                 }
   611             } else if( /* handle U+10000..U+10FFFF inline */
   612                 (t1=(uint8_t)(s[0]-0x80)) <= 0x3f &&
   613                 (t2=(uint8_t)(s[1]-0x80)) <= 0x3f &&
   614                 (t3=(uint8_t)(s[2]-0x80)) <= 0x3f
   615             ) {
   616                 // Give an illegal sequence the same value as the result of contains(FFFD).
   617                 UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3;
   618                 if( (   (0x10000<=c && c<=0x10ffff) ?
   619                             containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) :
   620                             asciiBytes[0x80]
   621                     ) != spanCondition
   622                 ) {
   623                     return s-1;
   624                 }
   625                 s+=3;
   626                 continue;
   627             }
   628         } else /* 0xc0<=b<0xe0 */ {
   629             if( /* handle U+0000..U+07FF inline */
   630                 (t1=(uint8_t)(*s-0x80)) <= 0x3f
   631             ) {
   632                 if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) {
   633                     return s-1;
   634                 }
   635                 ++s;
   636                 continue;
   637             }
   638         }
   640         // Give an illegal sequence the same value as the result of contains(FFFD).
   641         // Handle each byte of an illegal sequence separately to simplify the code;
   642         // no need to optimize error handling.
   643         if(asciiBytes[0x80]!=spanCondition) {
   644             return s-1;
   645         }
   646     }
   648     return limit0;
   649 }
   651 /*
   652  * While going backwards through UTF-8 optimize only for ASCII.
   653  * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not
   654  * possible to tell from the last byte in a multi-byte sequence how many
   655  * preceding bytes there should be. Therefore, going backwards through UTF-8
   656  * is much harder than going forward.
   657  */
   658 int32_t
   659 BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   660     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
   661         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
   662     }
   664     uint8_t b;
   666     do {
   667         b=s[--length];
   668         if((int8_t)b>=0) {
   669             // ASCII sub-span
   670             if(spanCondition) {
   671                 do {
   672                     if(!asciiBytes[b]) {
   673                         return length+1;
   674                     } else if(length==0) {
   675                         return 0;
   676                     }
   677                     b=s[--length];
   678                 } while((int8_t)b>=0);
   679             } else {
   680                 do {
   681                     if(asciiBytes[b]) {
   682                         return length+1;
   683                     } else if(length==0) {
   684                         return 0;
   685                     }
   686                     b=s[--length];
   687                 } while((int8_t)b>=0);
   688             }
   689         }
   691         int32_t prev=length;
   692         UChar32 c;
   693         // trail byte: collect a multi-byte character
   694         // (or  lead byte in last-trail position)
   695         c=utf8_prevCharSafeBody(s, 0, &length, b, -3);
   696         // c is a valid code point, not ASCII, not a surrogate
   697         if(c<=0x7ff) {
   698             if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) {
   699                 return prev+1;
   700             }
   701         } else if(c<=0xffff) {
   702             int lead=c>>12;
   703             uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001;
   704             if(twoBits<=1) {
   705                 // All 64 code points with the same bits 15..6
   706                 // are either in the set or not.
   707                 if(twoBits!=(uint32_t)spanCondition) {
   708                     return prev+1;
   709                 }
   710             } else {
   711                 // Look up the code point in its 4k block of code points.
   712                 if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) {
   713                     return prev+1;
   714                 }
   715             }
   716         } else {
   717             if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) {
   718                 return prev+1;
   719             }
   720         }
   721     } while(length>0);
   722     return 0;
   723 }
   725 U_NAMESPACE_END

mercurial