The Tor Browser: intl/icu/source/common/unisetspan.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/unisetspan.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/unisetspan.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Thu, 22 Jan 2015 13:21:57 +0100
branch: TOR_BUG_9701
changeset 15: b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

 /*
 ******************************************************************************
 *
 *   Copyright (C) 2007-2012, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 *
 ******************************************************************************
 *   file name:  unisetspan.cpp
 *   encoding:   US-ASCII
 *   tab size:   8 (not used)
 *   indentation:4
 *
 *   created on: 2007mar01
 *   created by: Markus W. Scherer
 */
 #include "unicode/utypes.h"
 #include "unicode/uniset.h"
 #include "unicode/ustring.h"
 #include "unicode/utf8.h"
 #include "unicode/utf16.h"
 #include "cmemory.h"
 #include "uvector.h"
 #include "unisetspan.h"
 U_NAMESPACE_BEGIN
 /*
  * List of offsets from the current position from where to try matching
  * a code point or a string.
  * Store offsets rather than indexes to simplify the code and use the same list
  * for both increments (in span()) and decrements (in spanBack()).
  *
  * Assumption: The maximum offset is limited, and the offsets that are stored
  * at any one time are relatively dense, that is, there are normally no gaps of
  * hundreds or thousands of offset values.
  *
  * The implementation uses a circular buffer of byte flags,
  * each indicating whether the corresponding offset is in the list.
  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
  * physically moving part of the list.
  *
  * Note: In principle, the caller should setMaxLength() to the maximum of the
  * max string length and U16_LENGTH/U8_LENGTH to account for
  * "long" single code points.
  * However, this implementation uses at least a staticList with more than
  * U8_LENGTH entries anyway.
  *
  * Note: If maxLength were guaranteed to be no more than 32 or 64,
  * the list could be stored as bit flags in a single integer.
  * Rather than handling a circular buffer with a start list index,
  * the integer would simply be shifted when lower offsets are removed.
  * UnicodeSet does not have a limit on the lengths of strings.
  */
 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
 public:
     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
     ~OffsetList() {
         if(list!=staticList) {
             uprv_free(list);
         }
     }
     // Call exactly once if the list is to be used.
     void setMaxLength(int32_t maxLength) {
         if(maxLength<=(int32_t)sizeof(staticList)) {
             capacity=(int32_t)sizeof(staticList);
         } else {
             UBool *l=(UBool *)uprv_malloc(maxLength);
             if(l!=NULL) {
                 list=l;
                 capacity=maxLength;
             }
         }
         uprv_memset(list, 0, capacity);
     }
     void clear() {
         uprv_memset(list, 0, capacity);
         start=length=0;
     }
     UBool isEmpty() const {
         return (UBool)(length==0);
     }
     // Reduce all stored offsets by delta, used when the current position
     // moves by delta.
     // There must not be any offsets lower than delta.
     // If there is an offset equal to delta, it is removed.
     // delta=[1..maxLength]
     void shift(int32_t delta) {
         int32_t i=start+delta;
         if(i>=capacity) {
             i-=capacity;
         }
         if(list[i]) {
             list[i]=FALSE;
             --length;
         }
         start=i;
     }
     // Add an offset. The list must not contain it yet.
     // offset=[1..maxLength]
     void addOffset(int32_t offset) {
         int32_t i=start+offset;
         if(i>=capacity) {
             i-=capacity;
         }
         list[i]=TRUE;
         ++length;
     }
     // offset=[1..maxLength]
     UBool containsOffset(int32_t offset) const {
         int32_t i=start+offset;
         if(i>=capacity) {
             i-=capacity;
         }
         return list[i];
     }
     // Find the lowest stored offset from a non-empty list, remove it,
     // and reduce all other offsets by this minimum.
     // Returns [1..maxLength].
     int32_t popMinimum() {
         // Look for the next offset in list[start+1..capacity-1].
         int32_t i=start, result;
         while(++i<capacity) {
             if(list[i]) {
                 list[i]=FALSE;
                 --length;
                 result=i-start;
                 start=i;
                 return result;
             }
         }
         // i==capacity
         // Wrap around and look for the next offset in list[0..start].
         // Since the list is not empty, there will be one.
         result=capacity-start;
         i=0;
         while(!list[i]) {
             ++i;
         }
         list[i]=FALSE;
         --length;
         start=i;
         return result+=i;
     }
 private:
     UBool *list;
     int32_t capacity;
     int32_t length;
     int32_t start;
     UBool staticList[16];
 };
 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
 static int32_t
 getUTF8Length(const UChar *s, int32_t length) {
     UErrorCode errorCode=U_ZERO_ERROR;
     int32_t length8=0;
     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
         return length8;
     } else {
         // The string contains an unpaired surrogate.
         // Ignore this string.
         return 0;
     }
 }
 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
 static int32_t
 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
     UErrorCode errorCode=U_ZERO_ERROR;
     int32_t length8=0;
     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
     if(U_SUCCESS(errorCode)) {
         return length8;
     } else {
         // The string contains an unpaired surrogate.
         // Ignore this string.
         return 0;
     }
 }
 static inline uint8_t
 makeSpanLengthByte(int32_t spanLength) {
     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
 }
 // Construct for all variants of span(), or only for any one variant.
 // Initialize as little as possible, for single use.
 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
                                            const UVector &setStrings,
                                            uint32_t which)
         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
           utf8Length(0),
           maxLength16(0), maxLength8(0),
           all((UBool)(which==ALL)) {
     spanSet.retainAll(set);
     if(which&NOT_CONTAINED) {
         // Default to the same sets.
         // addToSpanNotSet() will create a separate set if necessary.
         pSpanNotSet=&spanSet;
     }
     // Determine if the strings even need to be taken into account at all for span() etc.
     // If any string is relevant, then all strings need to be used for
     // span(longest match) but only the relevant ones for span(while contained).
     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
     int32_t stringsLength=strings.size();
     int32_t i, spanLength;
     UBool someRelevant=FALSE;
     for(i=0; i<stringsLength; ++i) {
         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
         const UChar *s16=string.getBuffer();
         int32_t length16=string.length();
         UBool thisRelevant;
         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
         if(spanLength<length16) {  // Relevant string.
             someRelevant=thisRelevant=TRUE;
         } else {
             thisRelevant=FALSE;
         }
         if((which&UTF16) && length16>maxLength16) {
             maxLength16=length16;
         }
         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
             int32_t length8=getUTF8Length(s16, length16);
             utf8Length+=length8;
             if(length8>maxLength8) {
                 maxLength8=length8;
             }
         }
     }
     if(!someRelevant) {
         maxLength16=maxLength8=0;
         return;
     }
     // Freeze after checking for the need to use strings at all because freezing
     // a set takes some time and memory which are wasted if there are no relevant strings.
     if(all) {
         spanSet.freeze();
     }
     uint8_t *spanBackLengths;
     uint8_t *spanUTF8Lengths;
     uint8_t *spanBackUTF8Lengths;
     // Allocate a block of meta data.
     int32_t allocSize;
     if(all) {
         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
     } else {
         allocSize=stringsLength;  // One set of span lengths.
         if(which&UTF8) {
             // UTF-8 lengths and UTF-8 strings.
             allocSize+=stringsLength*4+utf8Length;
         }
     }
     if(allocSize<=(int32_t)sizeof(staticLengths)) {
         utf8Lengths=staticLengths;
     } else {
         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
         if(utf8Lengths==NULL) {
             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
             return;  // Out of memory.
         }
     }
     if(all) {
         // Store span lengths for all span() variants.
         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
         spanBackLengths=spanLengths+stringsLength;
         spanUTF8Lengths=spanBackLengths+stringsLength;
         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
         utf8=spanBackUTF8Lengths+stringsLength;
     } else {
         // Store span lengths for only one span() variant.
         if(which&UTF8) {
             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
             utf8=spanLengths+stringsLength;
         } else {
             spanLengths=(uint8_t *)utf8Lengths;
         }
         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
     }
     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
     for(i=0; i<stringsLength; ++i) {
         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
         const UChar *s16=string.getBuffer();
         int32_t length16=string.length();
         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
         if(spanLength<length16) {  // Relevant string.
             if(which&UTF16) {
                 if(which&CONTAINED) {
                     if(which&FWD) {
                         spanLengths[i]=makeSpanLengthByte(spanLength);
                     }
                     if(which&BACK) {
                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
                     }
                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
                 }
             }
             if(which&UTF8) {
                 uint8_t *s8=utf8+utf8Count;
                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
                 utf8Count+=utf8Lengths[i]=length8;
                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
                 } else {  // Relevant for UTF-8.
                     if(which&CONTAINED) {
                         if(which&FWD) {
                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
                         }
                         if(which&BACK) {
                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
                         }
                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
                     }
                 }
             }
             if(which&NOT_CONTAINED) {
                 // Add string start and end code points to the spanNotSet so that
                 // a span(while not contained) stops before any string.
                 UChar32 c;
                 if(which&FWD) {
                     int32_t len=0;
                     U16_NEXT(s16, len, length16, c);
                     addToSpanNotSet(c);
                 }
                 if(which&BACK) {
                     int32_t len=length16;
                     U16_PREV(s16, 0, len, c);
                     addToSpanNotSet(c);
                 }
             }
         } else {  // Irrelevant string.
             if(which&UTF8) {
                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
                     uint8_t *s8=utf8+utf8Count;
                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
                     utf8Count+=utf8Lengths[i]=length8;
                 } else {
                     utf8Lengths[i]=0;
                 }
             }
             if(all) {
                 spanLengths[i]=spanBackLengths[i]=
                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
                         (uint8_t)ALL_CP_CONTAINED;
             } else {
                 // All spanXYZLengths pointers contain the same address.
                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
             }
         }
     }
     // Finish.
     if(all) {
         pSpanNotSet->freeze();
     }
 }
 // Copy constructor. Assumes which==ALL for a frozen set.
 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
                                            const UVector &newParentSetStrings)
         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
           utf8Length(otherStringSpan.utf8Length),
           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
           all(TRUE) {
     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
         pSpanNotSet=&spanSet;
     } else {
         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
     }
     // Allocate a block of meta data.
     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
     int32_t stringsLength=strings.size();
     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
     if(allocSize<=(int32_t)sizeof(staticLengths)) {
         utf8Lengths=staticLengths;
     } else {
         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
         if(utf8Lengths==NULL) {
             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
             return;  // Out of memory.
         }
     }
     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
     utf8=spanLengths+stringsLength*4;
     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
 }
 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
         delete pSpanNotSet;
     }
     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
         uprv_free(utf8Lengths);
     }
 }
 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
         if(spanSet.contains(c)) {
             return;  // Nothing to do.
         }
         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
         if(newSet==NULL) {
             return;  // Out of memory.
         } else {
             pSpanNotSet=newSet;
         }
     }
     pSpanNotSet->add(c);
 }
 // Compare strings without any argument checks. Requires length>0.
 static inline UBool
 matches16(const UChar *s, const UChar *t, int32_t length) {
     do {
         if(*s++!=*t++) {
             return FALSE;
         }
     } while(--length>0);
     return TRUE;
 }
 static inline UBool
 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
     do {
         if(*s++!=*t++) {
             return FALSE;
         }
     } while(--length>0);
     return TRUE;
 }
 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
 // at code point boundaries.
 // That is, each edge of a match must not be in the middle of a surrogate pair.
 static inline UBool
 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
     s+=start;
     limit-=start;
     return matches16(s, t, length) &&
            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
 }
 // Does the set contain the next code point?
 // If so, return its length; otherwise return its negative length.
 static inline int32_t
 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
     UChar c=*s, c2;
     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
     }
     return set.contains(c) ? 1 : -1;
 }
 static inline int32_t
 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
     UChar c=s[length-1], c2;
     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
     }
     return set.contains(c) ? 1 : -1;
 }
 static inline int32_t
 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
     UChar32 c=*s;
     if((int8_t)c>=0) {
         return set.contains(c) ? 1 : -1;
     }
     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
     int32_t i=0;
     U8_NEXT_OR_FFFD(s, i, length, c);
     return set.contains(c) ? i : -i;
 }
 static inline int32_t
 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
     UChar32 c=s[length-1];
     if((int8_t)c>=0) {
         return set.contains(c) ? 1 : -1;
     }
     int32_t i=length-1;
     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
     length-=i;
     return set.contains(c) ? length : -length;
 }
 /*
  * Note: In span() when spanLength==0 (after a string match, or at the beginning
  * after an empty code point span) and in spanNot() and spanNotUTF8(),
  * string matching could use a binary search
  * because all string matches are done from the same start index.
  *
  * For UTF-8, this would require a comparison function that returns UTF-16 order.
  *
  * This optimization should not be necessary for normal UnicodeSets because
  * most sets have no strings, and most sets with strings have
  * very few very short strings.
  * For cases with many strings, it might be better to use a different API
  * and implementation with a DFA (state machine).
  */
 /*
  * Algorithm for span(USET_SPAN_CONTAINED)
  *
  * Theoretical algorithm:
  * - Iterate through the string, and at each code point boundary:
  *   + If the code point there is in the set, then remember to continue after it.
  *   + If a set string matches at the current position, then remember to continue after it.
  *   + Either recursively span for each code point or string match,
  *     or recursively span for all but the shortest one and
  *     iteratively continue the span with the shortest local match.
  *   + Remember the longest recursive span (the farthest end point).
  *   + If there is no match at the current position, neither for the code point there
  *     nor for any set string, then stop and return the longest recursive span length.
  *
  * Optimized implementation:
  *
  * (We assume that most sets will have very few very short strings.
  * A span using a string-less set is extremely fast.)
  *
  * Create and cache a spanSet which contains all of the single code points
  * of the original set but none of its strings.
  *
  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
  * - Loop:
  *   + Try to match each set string at the end of the spanLength.
  *     ~ Set strings that start with set-contained code points must be matched
  *       with a partial overlap because the recursive algorithm would have tried
  *       to match them at every position.
  *     ~ Set strings that entirely consist of set-contained code points
  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
  *       recursive algorithm would continue after them anyway
  *       and find the longest recursive match from their end.
  *     ~ Rather than recursing, note each end point of a set string match.
  *   + If no set string matched after spanSet.span(), then return
  *     with where the spanSet.span() ended.
  *   + If at least one set string matched after spanSet.span(), then
  *     pop the shortest string match end point and continue
  *     the loop, trying to match all set strings from there.
  *   + If at least one more set string matched after a previous string match,
  *     then test if the code point after the previous string match is also
  *     contained in the set.
  *     Continue the loop with the shortest end point of either this code point
  *     or a matching set string.
  *   + If no more set string matched after a previous string match,
  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
  *     Stop if spanLength==0, otherwise continue the loop.
  *
  * By noting each end point of a set string match,
  * the function visits each string position at most once and finishes
  * in linear time.
  *
  * The recursive algorithm may visit the same string position many times
  * if multiple paths lead to it and finishes in exponential time.
  */
 /*
  * Algorithm for span(USET_SPAN_SIMPLE)
  *
  * Theoretical algorithm:
  * - Iterate through the string, and at each code point boundary:
  *   + If the code point there is in the set, then remember to continue after it.
  *   + If a set string matches at the current position, then remember to continue after it.
  *   + Continue from the farthest match position and ignore all others.
  *   + If there is no match at the current position,
  *     then stop and return the current position.
  *
  * Optimized implementation:
  *
  * (Same assumption and spanSet as above.)
  *
  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
  * - Loop:
  *   + Try to match each set string at the end of the spanLength.
  *     ~ Set strings that start with set-contained code points must be matched
  *       with a partial overlap because the standard algorithm would have tried
  *       to match them earlier.
  *     ~ Set strings that entirely consist of set-contained code points
  *       must be matched with a full overlap because the longest-match algorithm
  *       would hide set string matches that end earlier.
  *       Such set strings need not be matched earlier inside the code point span
  *       because the standard algorithm would then have continued after
  *       the set string match anyway.
  *     ~ Remember the longest set string match (farthest end point) from the earliest
  *       starting point.
  *   + If no set string matched after spanSet.span(), then return
  *     with where the spanSet.span() ended.
  *   + If at least one set string matched, then continue the loop after the
  *     longest match from the earliest position.
  *   + If no more set string matched after a previous string match,
  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
  *     Stop if spanLength==0, otherwise continue the loop.
  */
 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
         return spanNot(s, length);
     }
     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
     if(spanLength==length) {
         return length;
     }
     // Consider strings; they may overlap with the span.
     OffsetList offsets;
     if(spanCondition==USET_SPAN_CONTAINED) {
         // Use offset list to try all possibilities.
         offsets.setMaxLength(maxLength16);
     }
     int32_t pos=spanLength, rest=length-pos;
     int32_t i, stringsLength=strings.size();
     for(;;) {
         if(spanCondition==USET_SPAN_CONTAINED) {
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanLengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
                     continue;  // Irrelevant string.
                 }
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
                     overlap=length16;
                     // While contained: No point matching fully inside the code point span.
                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
                 for(;;) {
                     if(inc>rest) {
                         break;
                     }
                     // Try to match if the increment is not listed already.
                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
                         if(inc==rest) {
                             return length;  // Reached the end of the string.
                         }
                         offsets.addOffset(inc);
                     }
                     if(overlap==0) {
                         break;
                     }
                     --overlap;
                     ++inc;
                 }
             }
         } else /* USET_SPAN_SIMPLE */ {
             int32_t maxInc=0, maxOverlap=0;
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanLengths[i];
                 // For longest match, we do need to try to match even an all-contained string
                 // to find the match from the earliest start.
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
                     overlap=length16;
                     // Longest match: Need to match fully inside the code point span
                     // to find the match from the earliest start.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
                 for(;;) {
                     if(inc>rest || overlap<maxOverlap) {
                         break;
                     }
                     // Try to match if the string is longer or starts earlier.
                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
                         matches16CPB(s, pos-overlap, length, s16, length16)
                     ) {
                         maxInc=inc;  // Longest match from earliest start.
                         maxOverlap=overlap;
                         break;
                     }
                     --overlap;
                     ++inc;
                 }
             }
             if(maxInc!=0 || maxOverlap!=0) {
                 // Longest-match algorithm, and there was a string match.
                 // Simply continue after it.
                 pos+=maxInc;
                 rest-=maxInc;
                 if(rest==0) {
                     return length;  // Reached the end of the string.
                 }
                 spanLength=0;  // Match strings from after a string match.
                 continue;
             }
         }
         // Finished trying to match all strings at pos.
         if(spanLength!=0 || pos==0) {
             // The position is after an unlimited code point span (spanLength!=0),
             // not after a string match.
             // The only position where spanLength==0 after a span is pos==0.
             // Otherwise, an unlimited code point span is only tried again when no
             // strings match, and if such a non-initial span fails we stop.
             if(offsets.isEmpty()) {
                 return pos;  // No strings matched after a span.
             }
             // Match strings from after the next string match.
         } else {
             // The position is after a string match (or a single code point).
             if(offsets.isEmpty()) {
                 // No more strings matched after a previous string match.
                 // Try another code point span from after the last string match.
                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
                 if( spanLength==rest || // Reached the end of the string, or
                     spanLength==0       // neither strings nor span progressed.
                 ) {
                     return pos+spanLength;
                 }
                 pos+=spanLength;
                 rest-=spanLength;
                 continue;  // spanLength>0: Match strings from after a span.
             } else {
                 // Try to match only one code point from after a string match if some
                 // string matched beyond it, so that we try all possible positions
                 // and don't overshoot.
                 spanLength=spanOne(spanSet, s+pos, rest);
                 if(spanLength>0) {
                     if(spanLength==rest) {
                         return length;  // Reached the end of the string.
                     }
                     // Match strings after this code point.
                     // There cannot be any increments below it because UnicodeSet strings
                     // contain multiple code points.
                     pos+=spanLength;
                     rest-=spanLength;
                     offsets.shift(spanLength);
                     spanLength=0;
                     continue;  // Match strings from after a single code point.
                 }
                 // Match strings from after the next string match.
             }
         }
         int32_t minOffset=offsets.popMinimum();
         pos+=minOffset;
         rest-=minOffset;
         spanLength=0;  // Match strings from after a string match.
     }
 }
 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
         return spanNotBack(s, length);
     }
     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
     if(pos==0) {
         return 0;
     }
     int32_t spanLength=length-pos;
     // Consider strings; they may overlap with the span.
     OffsetList offsets;
     if(spanCondition==USET_SPAN_CONTAINED) {
         // Use offset list to try all possibilities.
         offsets.setMaxLength(maxLength16);
     }
     int32_t i, stringsLength=strings.size();
     uint8_t *spanBackLengths=spanLengths;
     if(all) {
         spanBackLengths+=stringsLength;
     }
     for(;;) {
         if(spanCondition==USET_SPAN_CONTAINED) {
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanBackLengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
                     continue;  // Irrelevant string.
                 }
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
                 // Try to match this string at pos-(length16-overlap)..pos-length16.
                 if(overlap>=LONG_SPAN) {
                     overlap=length16;
                     // While contained: No point matching fully inside the code point span.
                     int32_t len1=0;
                     U16_FWD_1(s16, len1, overlap);
                     overlap-=len1;  // Length of the string minus the first code point.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
                 for(;;) {
                     if(dec>pos) {
                         break;
                     }
                     // Try to match if the decrement is not listed already.
                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
                         if(dec==pos) {
                             return 0;  // Reached the start of the string.
                         }
                         offsets.addOffset(dec);
                     }
                     if(overlap==0) {
                         break;
                     }
                     --overlap;
                     ++dec;
                 }
             }
         } else /* USET_SPAN_SIMPLE */ {
             int32_t maxDec=0, maxOverlap=0;
             for(i=0; i<stringsLength; ++i) {
                 int32_t overlap=spanBackLengths[i];
                 // For longest match, we do need to try to match even an all-contained string
                 // to find the match from the latest end.
                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
                 const UChar *s16=string.getBuffer();
                 int32_t length16=string.length();
                 // Try to match this string at pos-(length16-overlap)..pos-length16.
                 if(overlap>=LONG_SPAN) {
                     overlap=length16;
                     // Longest match: Need to match fully inside the code point span
                     // to find the match from the latest end.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
                 for(;;) {
                     if(dec>pos || overlap<maxOverlap) {
                         break;
                     }
                     // Try to match if the string is longer or ends later.
                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
                         matches16CPB(s, pos-dec, length, s16, length16)
                     ) {
                         maxDec=dec;  // Longest match from latest end.
                         maxOverlap=overlap;
                         break;
                     }
                     --overlap;
                     ++dec;
                 }
             }
             if(maxDec!=0 || maxOverlap!=0) {
                 // Longest-match algorithm, and there was a string match.
                 // Simply continue before it.
                 pos-=maxDec;
                 if(pos==0) {
                     return 0;  // Reached the start of the string.
                 }
                 spanLength=0;  // Match strings from before a string match.
                 continue;
             }
         }
         // Finished trying to match all strings at pos.
         if(spanLength!=0 || pos==length) {
             // The position is before an unlimited code point span (spanLength!=0),
             // not before a string match.
             // The only position where spanLength==0 before a span is pos==length.
             // Otherwise, an unlimited code point span is only tried again when no
             // strings match, and if such a non-initial span fails we stop.
             if(offsets.isEmpty()) {
                 return pos;  // No strings matched before a span.
             }
             // Match strings from before the next string match.
         } else {
             // The position is before a string match (or a single code point).
             if(offsets.isEmpty()) {
                 // No more strings matched before a previous string match.
                 // Try another code point span from before the last string match.
                 int32_t oldPos=pos;
                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
                 spanLength=oldPos-pos;
                 if( pos==0 ||           // Reached the start of the string, or
                     spanLength==0       // neither strings nor span progressed.
                 ) {
                     return pos;
                 }
                 continue;  // spanLength>0: Match strings from before a span.
             } else {
                 // Try to match only one code point from before a string match if some
                 // string matched beyond it, so that we try all possible positions
                 // and don't overshoot.
                 spanLength=spanOneBack(spanSet, s, pos);
                 if(spanLength>0) {
                     if(spanLength==pos) {
                         return 0;  // Reached the start of the string.
                     }
                     // Match strings before this code point.
                     // There cannot be any decrements below it because UnicodeSet strings
                     // contain multiple code points.
                     pos-=spanLength;
                     offsets.shift(spanLength);
                     spanLength=0;
                     continue;  // Match strings from before a single code point.
                 }
                 // Match strings from before the next string match.
             }
         }
         pos-=offsets.popMinimum();
         spanLength=0;  // Match strings from before a string match.
     }
 }
 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
         return spanNotUTF8(s, length);
     }
     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
     if(spanLength==length) {
         return length;
     }
     // Consider strings; they may overlap with the span.
     OffsetList offsets;
     if(spanCondition==USET_SPAN_CONTAINED) {
         // Use offset list to try all possibilities.
         offsets.setMaxLength(maxLength8);
     }
     int32_t pos=spanLength, rest=length-pos;
     int32_t i, stringsLength=strings.size();
     uint8_t *spanUTF8Lengths=spanLengths;
     if(all) {
         spanUTF8Lengths+=2*stringsLength;
     }
     for(;;) {
         const uint8_t *s8=utf8;
         int32_t length8;
         if(spanCondition==USET_SPAN_CONTAINED) {
             for(i=0; i<stringsLength; ++i) {
                 length8=utf8Lengths[i];
                 if(length8==0) {
                     continue;  // String not representable in UTF-8.
                 }
                 int32_t overlap=spanUTF8Lengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
                     s8+=length8;
                     continue;  // Irrelevant string.
                 }
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
                     overlap=length8;
                     // While contained: No point matching fully inside the code point span.
                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
                 for(;;) {
                     if(inc>rest) {
                         break;
                     }
                     // Try to match if the increment is not listed already.
                     // Match at code point boundaries. (The UTF-8 strings were converted
                     // from UTF-16 and are guaranteed to be well-formed.)
                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
                         !offsets.containsOffset(inc) &&
                         matches8(s+pos-overlap, s8, length8)
                     ) {
                         if(inc==rest) {
                             return length;  // Reached the end of the string.
                         }
                         offsets.addOffset(inc);
                     }
                     if(overlap==0) {
                         break;
                     }
                     --overlap;
                     ++inc;
                 }
                 s8+=length8;
             }
         } else /* USET_SPAN_SIMPLE */ {
             int32_t maxInc=0, maxOverlap=0;
             for(i=0; i<stringsLength; ++i) {
                 length8=utf8Lengths[i];
                 if(length8==0) {
                     continue;  // String not representable in UTF-8.
                 }
                 int32_t overlap=spanUTF8Lengths[i];
                 // For longest match, we do need to try to match even an all-contained string
                 // to find the match from the earliest start.
                 // Try to match this string at pos-overlap..pos.
                 if(overlap>=LONG_SPAN) {
                     overlap=length8;
                     // Longest match: Need to match fully inside the code point span
                     // to find the match from the earliest start.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
                 for(;;) {
                     if(inc>rest || overlap<maxOverlap) {
                         break;
                     }
                     // Try to match if the string is longer or starts earlier.
                     // Match at code point boundaries. (The UTF-8 strings were converted
                     // from UTF-16 and are guaranteed to be well-formed.)
                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
                         matches8(s+pos-overlap, s8, length8)
                     ) {
                         maxInc=inc;  // Longest match from earliest start.
                         maxOverlap=overlap;
                         break;
                     }
                     --overlap;
                     ++inc;
                 }
                 s8+=length8;
             }
             if(maxInc!=0 || maxOverlap!=0) {
                 // Longest-match algorithm, and there was a string match.
                 // Simply continue after it.
                 pos+=maxInc;
                 rest-=maxInc;
                 if(rest==0) {
                     return length;  // Reached the end of the string.
                 }
                 spanLength=0;  // Match strings from after a string match.
                 continue;
             }
         }
         // Finished trying to match all strings at pos.
         if(spanLength!=0 || pos==0) {
             // The position is after an unlimited code point span (spanLength!=0),
             // not after a string match.
             // The only position where spanLength==0 after a span is pos==0.
             // Otherwise, an unlimited code point span is only tried again when no
             // strings match, and if such a non-initial span fails we stop.
             if(offsets.isEmpty()) {
                 return pos;  // No strings matched after a span.
             }
             // Match strings from after the next string match.
         } else {
             // The position is after a string match (or a single code point).
             if(offsets.isEmpty()) {
                 // No more strings matched after a previous string match.
                 // Try another code point span from after the last string match.
                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
                 if( spanLength==rest || // Reached the end of the string, or
                     spanLength==0       // neither strings nor span progressed.
                 ) {
                     return pos+spanLength;
                 }
                 pos+=spanLength;
                 rest-=spanLength;
                 continue;  // spanLength>0: Match strings from after a span.
             } else {
                 // Try to match only one code point from after a string match if some
                 // string matched beyond it, so that we try all possible positions
                 // and don't overshoot.
                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
                 if(spanLength>0) {
                     if(spanLength==rest) {
                         return length;  // Reached the end of the string.
                     }
                     // Match strings after this code point.
                     // There cannot be any increments below it because UnicodeSet strings
                     // contain multiple code points.
                     pos+=spanLength;
                     rest-=spanLength;
                     offsets.shift(spanLength);
                     spanLength=0;
                     continue;  // Match strings from after a single code point.
                 }
                 // Match strings from after the next string match.
             }
         }
         int32_t minOffset=offsets.popMinimum();
         pos+=minOffset;
         rest-=minOffset;
         spanLength=0;  // Match strings from after a string match.
     }
 }
 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
         return spanNotBackUTF8(s, length);
     }
     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
     if(pos==0) {
         return 0;
     }
     int32_t spanLength=length-pos;
     // Consider strings; they may overlap with the span.
     OffsetList offsets;
     if(spanCondition==USET_SPAN_CONTAINED) {
         // Use offset list to try all possibilities.
         offsets.setMaxLength(maxLength8);
     }
     int32_t i, stringsLength=strings.size();
     uint8_t *spanBackUTF8Lengths=spanLengths;
     if(all) {
         spanBackUTF8Lengths+=3*stringsLength;
     }
     for(;;) {
         const uint8_t *s8=utf8;
         int32_t length8;
         if(spanCondition==USET_SPAN_CONTAINED) {
             for(i=0; i<stringsLength; ++i) {
                 length8=utf8Lengths[i];
                 if(length8==0) {
                     continue;  // String not representable in UTF-8.
                 }
                 int32_t overlap=spanBackUTF8Lengths[i];
                 if(overlap==ALL_CP_CONTAINED) {
                     s8+=length8;
                     continue;  // Irrelevant string.
                 }
                 // Try to match this string at pos-(length8-overlap)..pos-length8.
                 if(overlap>=LONG_SPAN) {
                     overlap=length8;
                     // While contained: No point matching fully inside the code point span.
                     int32_t len1=0;
                     U8_FWD_1(s8, len1, overlap);
                     overlap-=len1;  // Length of the string minus the first code point.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
                 for(;;) {
                     if(dec>pos) {
                         break;
                     }
                     // Try to match if the decrement is not listed already.
                     // Match at code point boundaries. (The UTF-8 strings were converted
                     // from UTF-16 and are guaranteed to be well-formed.)
                     if( !U8_IS_TRAIL(s[pos-dec]) &&
                         !offsets.containsOffset(dec) &&
                         matches8(s+pos-dec, s8, length8)
                     ) {
                         if(dec==pos) {
                             return 0;  // Reached the start of the string.
                         }
                         offsets.addOffset(dec);
                     }
                     if(overlap==0) {
                         break;
                     }
                     --overlap;
                     ++dec;
                 }
                 s8+=length8;
             }
         } else /* USET_SPAN_SIMPLE */ {
             int32_t maxDec=0, maxOverlap=0;
             for(i=0; i<stringsLength; ++i) {
                 length8=utf8Lengths[i];
                 if(length8==0) {
                     continue;  // String not representable in UTF-8.
                 }
                 int32_t overlap=spanBackUTF8Lengths[i];
                 // For longest match, we do need to try to match even an all-contained string
                 // to find the match from the latest end.
                 // Try to match this string at pos-(length8-overlap)..pos-length8.
                 if(overlap>=LONG_SPAN) {
                     overlap=length8;
                     // Longest match: Need to match fully inside the code point span
                     // to find the match from the latest end.
                 }
                 if(overlap>spanLength) {
                     overlap=spanLength;
                 }
                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
                 for(;;) {
                     if(dec>pos || overlap<maxOverlap) {
                         break;
                     }
                     // Try to match if the string is longer or ends later.
                     // Match at code point boundaries. (The UTF-8 strings were converted
                     // from UTF-16 and are guaranteed to be well-formed.)
                     if( !U8_IS_TRAIL(s[pos-dec]) &&
                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
                         matches8(s+pos-dec, s8, length8)
                     ) {
                         maxDec=dec;  // Longest match from latest end.
                         maxOverlap=overlap;
                         break;
                     }
                     --overlap;
                     ++dec;
                 }
                 s8+=length8;
             }
             if(maxDec!=0 || maxOverlap!=0) {
                 // Longest-match algorithm, and there was a string match.
                 // Simply continue before it.
                 pos-=maxDec;
                 if(pos==0) {
                     return 0;  // Reached the start of the string.
                 }
                 spanLength=0;  // Match strings from before a string match.
                 continue;
             }
         }
         // Finished trying to match all strings at pos.
         if(spanLength!=0 || pos==length) {
             // The position is before an unlimited code point span (spanLength!=0),
             // not before a string match.
             // The only position where spanLength==0 before a span is pos==length.
             // Otherwise, an unlimited code point span is only tried again when no
             // strings match, and if such a non-initial span fails we stop.
             if(offsets.isEmpty()) {
                 return pos;  // No strings matched before a span.
             }
             // Match strings from before the next string match.
         } else {
             // The position is before a string match (or a single code point).
             if(offsets.isEmpty()) {
                 // No more strings matched before a previous string match.
                 // Try another code point span from before the last string match.
                 int32_t oldPos=pos;
                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
                 spanLength=oldPos-pos;
                 if( pos==0 ||           // Reached the start of the string, or
                     spanLength==0       // neither strings nor span progressed.
                 ) {
                     return pos;
                 }
                 continue;  // spanLength>0: Match strings from before a span.
             } else {
                 // Try to match only one code point from before a string match if some
                 // string matched beyond it, so that we try all possible positions
                 // and don't overshoot.
                 spanLength=spanOneBackUTF8(spanSet, s, pos);
                 if(spanLength>0) {
                     if(spanLength==pos) {
                         return 0;  // Reached the start of the string.
                     }
                     // Match strings before this code point.
                     // There cannot be any decrements below it because UnicodeSet strings
                     // contain multiple code points.
                     pos-=spanLength;
                     offsets.shift(spanLength);
                     spanLength=0;
                     continue;  // Match strings from before a single code point.
                 }
                 // Match strings from before the next string match.
             }
         }
         pos-=offsets.popMinimum();
         spanLength=0;  // Match strings from before a string match.
     }
 }
 /*
  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
  *
  * Theoretical algorithm:
  * - Iterate through the string, and at each code point boundary:
  *   + If the code point there is in the set, then return with the current position.
  *   + If a set string matches at the current position, then return with the current position.
  *
  * Optimized implementation:
  *
  * (Same assumption as for span() above.)
  *
  * Create and cache a spanNotSet which contains all of the single code points
  * of the original set but none of its strings.
  * For each set string add its initial code point to the spanNotSet.
  * (Also add its final code point for spanNotBack().)
  *
  * - Loop:
  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
  *   + If the current code point is in the original set, then
  *     return the current position.
  *   + If any set string matches at the current position, then
  *     return the current position.
  *   + If there is no match at the current position, neither for the code point there
  *     nor for any set string, then skip this code point and continue the loop.
  *     This happens for set-string-initial code points that were added to spanNotSet
  *     when there is not actually a match for such a set string.
  */
 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
     int32_t pos=0, rest=length;
     int32_t i, stringsLength=strings.size();
     do {
         // Span until we find a code point from the set,
         // or a code point that starts or ends some string.
         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
         if(i==rest) {
             return length;  // Reached the end of the string.
         }
         pos+=i;
         rest-=i;
         // Check whether the current code point is in the original set,
         // without the string starts and ends.
         int32_t cpLength=spanOne(spanSet, s+pos, rest);
         if(cpLength>0) {
             return pos;  // There is a set element at pos.
         }
         // Try to match the strings at pos.
         for(i=0; i<stringsLength; ++i) {
             if(spanLengths[i]==ALL_CP_CONTAINED) {
                 continue;  // Irrelevant string.
             }
             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
             const UChar *s16=string.getBuffer();
             int32_t length16=string.length();
             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
                 return pos;  // There is a set element at pos.
             }
         }
         // The span(while not contained) ended on a string start/end which is
         // not in the original set. Skip this code point and continue.
         // cpLength<0
         pos-=cpLength;
         rest+=cpLength;
     } while(rest!=0);
     return length;  // Reached the end of the string.
 }
 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
     int32_t pos=length;
     int32_t i, stringsLength=strings.size();
     do {
         // Span until we find a code point from the set,
         // or a code point that starts or ends some string.
         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
         if(pos==0) {
             return 0;  // Reached the start of the string.
         }
         // Check whether the current code point is in the original set,
         // without the string starts and ends.
         int32_t cpLength=spanOneBack(spanSet, s, pos);
         if(cpLength>0) {
             return pos;  // There is a set element at pos.
         }
         // Try to match the strings at pos.
         for(i=0; i<stringsLength; ++i) {
             // Use spanLengths rather than a spanBackLengths pointer because
             // it is easier and we only need to know whether the string is irrelevant
             // which is the same in either array.
             if(spanLengths[i]==ALL_CP_CONTAINED) {
                 continue;  // Irrelevant string.
             }
             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
             const UChar *s16=string.getBuffer();
             int32_t length16=string.length();
             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
                 return pos;  // There is a set element at pos.
             }
         }
         // The span(while not contained) ended on a string start/end which is
         // not in the original set. Skip this code point and continue.
         // cpLength<0
         pos+=cpLength;
     } while(pos!=0);
     return 0;  // Reached the start of the string.
 }
 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
     int32_t pos=0, rest=length;
     int32_t i, stringsLength=strings.size();
     uint8_t *spanUTF8Lengths=spanLengths;
     if(all) {
         spanUTF8Lengths+=2*stringsLength;
     }
     do {
         // Span until we find a code point from the set,
         // or a code point that starts or ends some string.
         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
         if(i==rest) {
             return length;  // Reached the end of the string.
         }
         pos+=i;
         rest-=i;
         // Check whether the current code point is in the original set,
         // without the string starts and ends.
         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
         if(cpLength>0) {
             return pos;  // There is a set element at pos.
         }
         // Try to match the strings at pos.
         const uint8_t *s8=utf8;
         int32_t length8;
         for(i=0; i<stringsLength; ++i) {
             length8=utf8Lengths[i];
             // ALL_CP_CONTAINED: Irrelevant string.
             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
                 return pos;  // There is a set element at pos.
             }
             s8+=length8;
         }
         // The span(while not contained) ended on a string start/end which is
         // not in the original set. Skip this code point and continue.
         // cpLength<0
         pos-=cpLength;
         rest+=cpLength;
     } while(rest!=0);
     return length;  // Reached the end of the string.
 }
 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
     int32_t pos=length;
     int32_t i, stringsLength=strings.size();
     uint8_t *spanBackUTF8Lengths=spanLengths;
     if(all) {
         spanBackUTF8Lengths+=3*stringsLength;
     }
     do {
         // Span until we find a code point from the set,
         // or a code point that starts or ends some string.
         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
         if(pos==0) {
             return 0;  // Reached the start of the string.
         }
         // Check whether the current code point is in the original set,
         // without the string starts and ends.
         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
         if(cpLength>0) {
             return pos;  // There is a set element at pos.
         }
         // Try to match the strings at pos.
         const uint8_t *s8=utf8;
         int32_t length8;
         for(i=0; i<stringsLength; ++i) {
             length8=utf8Lengths[i];
             // ALL_CP_CONTAINED: Irrelevant string.
             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
                 return pos;  // There is a set element at pos.
             }
             s8+=length8;
         }
         // The span(while not contained) ended on a string start/end which is
         // not in the original set. Skip this code point and continue.
         // cpLength<0
         pos+=cpLength;
     } while(pos!=0);
     return 0;  // Reached the start of the string.
 }
 U_NAMESPACE_END

The Tor Browser / annotate

intl/icu/source/common/unisetspan.cpp@b8a032363ba2 (annotated)

intl/icu/source/common/unisetspan.cpp