The Tor Browser: intl/icu/source/common/unisetspan.cpp@fc2d59ddac77

     1 /*

     2 ******************************************************************************

3 *

     4 *   Copyright (C) 2007-2012, International Business Machines

     5 *   Corporation and others.  All Rights Reserved.

6 *

     7 ******************************************************************************

     8 *   file name:  unisetspan.cpp

     9 *   encoding:   US-ASCII

    10 *   tab size:   8 (not used)

    11 *   indentation:4

    12 *

    13 *   created on: 2007mar01

    14 *   created by: Markus W. Scherer

    15 */

    17 #include "unicode/utypes.h"

    18 #include "unicode/uniset.h"

    19 #include "unicode/ustring.h"

    20 #include "unicode/utf8.h"

    21 #include "unicode/utf16.h"

    22 #include "cmemory.h"

    23 #include "uvector.h"

    24 #include "unisetspan.h"

    26 U_NAMESPACE_BEGIN

    28 /*

    29  * List of offsets from the current position from where to try matching

    30  * a code point or a string.

    31  * Store offsets rather than indexes to simplify the code and use the same list

    32  * for both increments (in span()) and decrements (in spanBack()).

    33  *

    34  * Assumption: The maximum offset is limited, and the offsets that are stored

    35  * at any one time are relatively dense, that is, there are normally no gaps of

    36  * hundreds or thousands of offset values.

    37  *

    38  * The implementation uses a circular buffer of byte flags,

    39  * each indicating whether the corresponding offset is in the list.

    40  * This avoids inserting into a sorted list of offsets (or absolute indexes) and

    41  * physically moving part of the list.

    42  *

    43  * Note: In principle, the caller should setMaxLength() to the maximum of the

    44  * max string length and U16_LENGTH/U8_LENGTH to account for

    45  * "long" single code points.

    46  * However, this implementation uses at least a staticList with more than

    47  * U8_LENGTH entries anyway.

    48  *

    49  * Note: If maxLength were guaranteed to be no more than 32 or 64,

    50  * the list could be stored as bit flags in a single integer.

    51  * Rather than handling a circular buffer with a start list index,

    52  * the integer would simply be shifted when lower offsets are removed.

    53  * UnicodeSet does not have a limit on the lengths of strings.

    54  */

    55 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.

    56 public:

    57     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}

    59     ~OffsetList() {

    60         if(list!=staticList) {

    61             uprv_free(list);

    62         }

    63     }

    65     // Call exactly once if the list is to be used.

    66     void setMaxLength(int32_t maxLength) {

    67         if(maxLength<=(int32_t)sizeof(staticList)) {

    68             capacity=(int32_t)sizeof(staticList);

    69         } else {

    70             UBool *l=(UBool *)uprv_malloc(maxLength);

    71             if(l!=NULL) {

    72                 list=l;

    73                 capacity=maxLength;

    74             }

    75         }

    76         uprv_memset(list, 0, capacity);

    77     }

    79     void clear() {

    80         uprv_memset(list, 0, capacity);

    81         start=length=0;

    82     }

    84     UBool isEmpty() const {

    85         return (UBool)(length==0);

    86     }

    88     // Reduce all stored offsets by delta, used when the current position

    89     // moves by delta.

    90     // There must not be any offsets lower than delta.

    91     // If there is an offset equal to delta, it is removed.

    92     // delta=[1..maxLength]

    93     void shift(int32_t delta) {

    94         int32_t i=start+delta;

    95         if(i>=capacity) {

    96             i-=capacity;

    97         }

    98         if(list[i]) {

    99             list[i]=FALSE;

   100             --length;

   101         }

   102         start=i;

   103     }

   105     // Add an offset. The list must not contain it yet.

   106     // offset=[1..maxLength]

   107     void addOffset(int32_t offset) {

   108         int32_t i=start+offset;

   109         if(i>=capacity) {

   110             i-=capacity;

   111         }

   112         list[i]=TRUE;

   113         ++length;

   114     }

   116     // offset=[1..maxLength]

   117     UBool containsOffset(int32_t offset) const {

   118         int32_t i=start+offset;

   119         if(i>=capacity) {

   120             i-=capacity;

   121         }

   122         return list[i];

   123     }

   125     // Find the lowest stored offset from a non-empty list, remove it,

   126     // and reduce all other offsets by this minimum.

   127     // Returns [1..maxLength].

   128     int32_t popMinimum() {

   129         // Look for the next offset in list[start+1..capacity-1].

   130         int32_t i=start, result;

   131         while(++i<capacity) {

   132             if(list[i]) {

   133                 list[i]=FALSE;

   134                 --length;

   135                 result=i-start;

   136                 start=i;

   137                 return result;

   138             }

   139         }

   140         // i==capacity

   142         // Wrap around and look for the next offset in list[0..start].

   143         // Since the list is not empty, there will be one.

   144         result=capacity-start;

   145         i=0;

   146         while(!list[i]) {

   147             ++i;

   148         }

   149         list[i]=FALSE;

   150         --length;

   151         start=i;

   152         return result+=i;

   153     }

   155 private:

   156     UBool *list;

   157     int32_t capacity;

   158     int32_t length;

   159     int32_t start;

   161     UBool staticList[16];

   162 };

   164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.

   165 static int32_t

   166 getUTF8Length(const UChar *s, int32_t length) {

   167     UErrorCode errorCode=U_ZERO_ERROR;

   168     int32_t length8=0;

   169     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);

   170     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {

   171         return length8;

   172     } else {

   173         // The string contains an unpaired surrogate.

   174         // Ignore this string.

   175         return 0;

   176     }

   177 }

   179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.

   180 static int32_t

   181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {

   182     UErrorCode errorCode=U_ZERO_ERROR;

   183     int32_t length8=0;

   184     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);

   185     if(U_SUCCESS(errorCode)) {

   186         return length8;

   187     } else {

   188         // The string contains an unpaired surrogate.

   189         // Ignore this string.

   190         return 0;

   191     }

   192 }

   194 static inline uint8_t

   195 makeSpanLengthByte(int32_t spanLength) {

   196     // 0xfe==UnicodeSetStringSpan::LONG_SPAN

   197     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;

   198 }

   200 // Construct for all variants of span(), or only for any one variant.

   201 // Initialize as little as possible, for single use.

   202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,

   203                                            const UVector &setStrings,

   204                                            uint32_t which)

   205         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),

   206           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

   207           utf8Length(0),

   208           maxLength16(0), maxLength8(0),

   209           all((UBool)(which==ALL)) {

   210     spanSet.retainAll(set);

   211     if(which&NOT_CONTAINED) {

   212         // Default to the same sets.

   213         // addToSpanNotSet() will create a separate set if necessary.

   214         pSpanNotSet=&spanSet;

   215     }

   217     // Determine if the strings even need to be taken into account at all for span() etc.

   218     // If any string is relevant, then all strings need to be used for

   219     // span(longest match) but only the relevant ones for span(while contained).

   220     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH

   221     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.

   222     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)

   223     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.

   224     int32_t stringsLength=strings.size();

   226     int32_t i, spanLength;

   227     UBool someRelevant=FALSE;

   228     for(i=0; i<stringsLength; ++i) {

   229         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   230         const UChar *s16=string.getBuffer();

   231         int32_t length16=string.length();

   232         UBool thisRelevant;

   233         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

   234         if(spanLength<length16) {  // Relevant string.

   235             someRelevant=thisRelevant=TRUE;

   236         } else {

   237             thisRelevant=FALSE;

   238         }

   239         if((which&UTF16) && length16>maxLength16) {

   240             maxLength16=length16;

   241         }

   242         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {

   243             int32_t length8=getUTF8Length(s16, length16);

   244             utf8Length+=length8;

   245             if(length8>maxLength8) {

   246                 maxLength8=length8;

   247             }

   248         }

   249     }

   250     if(!someRelevant) {

   251         maxLength16=maxLength8=0;

   252         return;

   253     }

   255     // Freeze after checking for the need to use strings at all because freezing

   256     // a set takes some time and memory which are wasted if there are no relevant strings.

   257     if(all) {

   258         spanSet.freeze();

   259     }

   261     uint8_t *spanBackLengths;

   262     uint8_t *spanUTF8Lengths;

   263     uint8_t *spanBackUTF8Lengths;

   265     // Allocate a block of meta data.

   266     int32_t allocSize;

   267     if(all) {

   268         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

   269         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

   270     } else {

   271         allocSize=stringsLength;  // One set of span lengths.

   272         if(which&UTF8) {

   273             // UTF-8 lengths and UTF-8 strings.

   274             allocSize+=stringsLength*4+utf8Length;

   275         }

   276     }

   277     if(allocSize<=(int32_t)sizeof(staticLengths)) {

   278         utf8Lengths=staticLengths;

   279     } else {

   280         utf8Lengths=(int32_t *)uprv_malloc(allocSize);

   281         if(utf8Lengths==NULL) {

   282             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.

   283             return;  // Out of memory.

   284         }

   285     }

   287     if(all) {

   288         // Store span lengths for all span() variants.

   289         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

   290         spanBackLengths=spanLengths+stringsLength;

   291         spanUTF8Lengths=spanBackLengths+stringsLength;

   292         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;

   293         utf8=spanBackUTF8Lengths+stringsLength;

   294     } else {

   295         // Store span lengths for only one span() variant.

   296         if(which&UTF8) {

   297             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

   298             utf8=spanLengths+stringsLength;

   299         } else {

   300             spanLengths=(uint8_t *)utf8Lengths;

   301         }

   302         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;

   303     }

   305     // Set the meta data and pSpanNotSet and write the UTF-8 strings.

   306     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.

   308     for(i=0; i<stringsLength; ++i) {

   309         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   310         const UChar *s16=string.getBuffer();

   311         int32_t length16=string.length();

   312         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);

   313         if(spanLength<length16) {  // Relevant string.

   314             if(which&UTF16) {

   315                 if(which&CONTAINED) {

   316                     if(which&FWD) {

   317                         spanLengths[i]=makeSpanLengthByte(spanLength);

   318                     }

   319                     if(which&BACK) {

   320                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);

   321                         spanBackLengths[i]=makeSpanLengthByte(spanLength);

   322                     }

   323                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

   324                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.

   325                 }

   326             }

   327             if(which&UTF8) {

   328                 uint8_t *s8=utf8+utf8Count;

   329                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);

   330                 utf8Count+=utf8Lengths[i]=length8;

   331                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.

   332                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;

   333                 } else {  // Relevant for UTF-8.

   334                     if(which&CONTAINED) {

   335                         if(which&FWD) {

   336                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);

   337                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);

   338                         }

   339                         if(which&BACK) {

   340                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);

   341                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);

   342                         }

   343                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {

   344                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.

   345                     }

   346                 }

   347             }

   348             if(which&NOT_CONTAINED) {

   349                 // Add string start and end code points to the spanNotSet so that

   350                 // a span(while not contained) stops before any string.

   351                 UChar32 c;

   352                 if(which&FWD) {

   353                     int32_t len=0;

   354                     U16_NEXT(s16, len, length16, c);

   355                     addToSpanNotSet(c);

   356                 }

   357                 if(which&BACK) {

   358                     int32_t len=length16;

   359                     U16_PREV(s16, 0, len, c);

   360                     addToSpanNotSet(c);

   361                 }

   362             }

   363         } else {  // Irrelevant string.

   364             if(which&UTF8) {

   365                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.

   366                     uint8_t *s8=utf8+utf8Count;

   367                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);

   368                     utf8Count+=utf8Lengths[i]=length8;

   369                 } else {

   370                     utf8Lengths[i]=0;

   371                 }

   372             }

   373             if(all) {

   374                 spanLengths[i]=spanBackLengths[i]=

   375                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=

   376                         (uint8_t)ALL_CP_CONTAINED;

   377             } else {

   378                 // All spanXYZLengths pointers contain the same address.

   379                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;

   380             }

   381         }

   382     }

   384     // Finish.

   385     if(all) {

   386         pSpanNotSet->freeze();

   387     }

   388 }

   390 // Copy constructor. Assumes which==ALL for a frozen set.

   391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,

   392                                            const UVector &newParentSetStrings)

   393         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),

   394           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),

   395           utf8Length(otherStringSpan.utf8Length),

   396           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),

   397           all(TRUE) {

   398     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {

   399         pSpanNotSet=&spanSet;

   400     } else {

   401         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();

   402     }

   404     // Allocate a block of meta data.

   405     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.

   406     int32_t stringsLength=strings.size();

   407     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;

   408     if(allocSize<=(int32_t)sizeof(staticLengths)) {

   409         utf8Lengths=staticLengths;

   410     } else {

   411         utf8Lengths=(int32_t *)uprv_malloc(allocSize);

   412         if(utf8Lengths==NULL) {

   413             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.

   414             return;  // Out of memory.

   415         }

   416     }

   418     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);

   419     utf8=spanLengths+stringsLength*4;

   420     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);

   421 }

   423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {

   424     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {

   425         delete pSpanNotSet;

   426     }

   427     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {

   428         uprv_free(utf8Lengths);

   429     }

   430 }

   432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {

   433     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {

   434         if(spanSet.contains(c)) {

   435             return;  // Nothing to do.

   436         }

   437         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();

   438         if(newSet==NULL) {

   439             return;  // Out of memory.

   440         } else {

   441             pSpanNotSet=newSet;

   442         }

   443     }

   444     pSpanNotSet->add(c);

   445 }

   447 // Compare strings without any argument checks. Requires length>0.

   448 static inline UBool

   449 matches16(const UChar *s, const UChar *t, int32_t length) {

   450     do {

   451         if(*s++!=*t++) {

   452             return FALSE;

   453         }

   454     } while(--length>0);

   455     return TRUE;

   456 }

   458 static inline UBool

   459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {

   460     do {

   461         if(*s++!=*t++) {

   462             return FALSE;

   463         }

   464     } while(--length>0);

   465     return TRUE;

   466 }

   468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)

   469 // at code point boundaries.

   470 // That is, each edge of a match must not be in the middle of a surrogate pair.

   471 static inline UBool

   472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {

   473     s+=start;

   474     limit-=start;

   475     return matches16(s, t, length) &&

   476            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&

   477            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));

   478 }

   480 // Does the set contain the next code point?

   481 // If so, return its length; otherwise return its negative length.

   482 static inline int32_t

   483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {

   484     UChar c=*s, c2;

   485     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {

   486         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;

   487     }

   488     return set.contains(c) ? 1 : -1;

   489 }

   491 static inline int32_t

   492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {

   493     UChar c=s[length-1], c2;

   494     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {

   495         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;

   496     }

   497     return set.contains(c) ? 1 : -1;

   498 }

   500 static inline int32_t

   501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

   502     UChar32 c=*s;

   503     if((int8_t)c>=0) {

   504         return set.contains(c) ? 1 : -1;

   505     }

   506     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().

   507     int32_t i=0;

   508     U8_NEXT_OR_FFFD(s, i, length, c);

   509     return set.contains(c) ? i : -i;

   510 }

   512 static inline int32_t

   513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {

   514     UChar32 c=s[length-1];

   515     if((int8_t)c>=0) {

   516         return set.contains(c) ? 1 : -1;

   517     }

   518     int32_t i=length-1;

   519     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);

   520     length-=i;

   521     return set.contains(c) ? length : -length;

   522 }

   524 /*

   525  * Note: In span() when spanLength==0 (after a string match, or at the beginning

   526  * after an empty code point span) and in spanNot() and spanNotUTF8(),

   527  * string matching could use a binary search

   528  * because all string matches are done from the same start index.

   529  *

   530  * For UTF-8, this would require a comparison function that returns UTF-16 order.

   531  *

   532  * This optimization should not be necessary for normal UnicodeSets because

   533  * most sets have no strings, and most sets with strings have

   534  * very few very short strings.

   535  * For cases with many strings, it might be better to use a different API

   536  * and implementation with a DFA (state machine).

   537  */

   539 /*

   540  * Algorithm for span(USET_SPAN_CONTAINED)

   541  *

   542  * Theoretical algorithm:

   543  * - Iterate through the string, and at each code point boundary:

   544  *   + If the code point there is in the set, then remember to continue after it.

   545  *   + If a set string matches at the current position, then remember to continue after it.

   546  *   + Either recursively span for each code point or string match,

   547  *     or recursively span for all but the shortest one and

   548  *     iteratively continue the span with the shortest local match.

   549  *   + Remember the longest recursive span (the farthest end point).

   550  *   + If there is no match at the current position, neither for the code point there

   551  *     nor for any set string, then stop and return the longest recursive span length.

   552  *

   553  * Optimized implementation:

   554  *

   555  * (We assume that most sets will have very few very short strings.

   556  * A span using a string-less set is extremely fast.)

   557  *

   558  * Create and cache a spanSet which contains all of the single code points

   559  * of the original set but none of its strings.

   560  *

   561  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

   562  * - Loop:

   563  *   + Try to match each set string at the end of the spanLength.

   564  *     ~ Set strings that start with set-contained code points must be matched

   565  *       with a partial overlap because the recursive algorithm would have tried

   566  *       to match them at every position.

   567  *     ~ Set strings that entirely consist of set-contained code points

   568  *       are irrelevant for span(USET_SPAN_CONTAINED) because the

   569  *       recursive algorithm would continue after them anyway

   570  *       and find the longest recursive match from their end.

   571  *     ~ Rather than recursing, note each end point of a set string match.

   572  *   + If no set string matched after spanSet.span(), then return

   573  *     with where the spanSet.span() ended.

   574  *   + If at least one set string matched after spanSet.span(), then

   575  *     pop the shortest string match end point and continue

   576  *     the loop, trying to match all set strings from there.

   577  *   + If at least one more set string matched after a previous string match,

   578  *     then test if the code point after the previous string match is also

   579  *     contained in the set.

   580  *     Continue the loop with the shortest end point of either this code point

   581  *     or a matching set string.

   582  *   + If no more set string matched after a previous string match,

   583  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

   584  *     Stop if spanLength==0, otherwise continue the loop.

   585  *

   586  * By noting each end point of a set string match,

   587  * the function visits each string position at most once and finishes

   588  * in linear time.

   589  *

   590  * The recursive algorithm may visit the same string position many times

   591  * if multiple paths lead to it and finishes in exponential time.

   592  */

   594 /*

   595  * Algorithm for span(USET_SPAN_SIMPLE)

   596  *

   597  * Theoretical algorithm:

   598  * - Iterate through the string, and at each code point boundary:

   599  *   + If the code point there is in the set, then remember to continue after it.

   600  *   + If a set string matches at the current position, then remember to continue after it.

   601  *   + Continue from the farthest match position and ignore all others.

   602  *   + If there is no match at the current position,

   603  *     then stop and return the current position.

   604  *

   605  * Optimized implementation:

   606  *

   607  * (Same assumption and spanSet as above.)

   608  *

   609  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).

   610  * - Loop:

   611  *   + Try to match each set string at the end of the spanLength.

   612  *     ~ Set strings that start with set-contained code points must be matched

   613  *       with a partial overlap because the standard algorithm would have tried

   614  *       to match them earlier.

   615  *     ~ Set strings that entirely consist of set-contained code points

   616  *       must be matched with a full overlap because the longest-match algorithm

   617  *       would hide set string matches that end earlier.

   618  *       Such set strings need not be matched earlier inside the code point span

   619  *       because the standard algorithm would then have continued after

   620  *       the set string match anyway.

   621  *     ~ Remember the longest set string match (farthest end point) from the earliest

   622  *       starting point.

   623  *   + If no set string matched after spanSet.span(), then return

   624  *     with where the spanSet.span() ended.

   625  *   + If at least one set string matched, then continue the loop after the

   626  *     longest match from the earliest position.

   627  *   + If no more set string matched after a previous string match,

   628  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).

   629  *     Stop if spanLength==0, otherwise continue the loop.

   630  */

   632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {

   633     if(spanCondition==USET_SPAN_NOT_CONTAINED) {

   634         return spanNot(s, length);

   635     }

   636     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);

   637     if(spanLength==length) {

   638         return length;

   639     }

   641     // Consider strings; they may overlap with the span.

   642     OffsetList offsets;

   643     if(spanCondition==USET_SPAN_CONTAINED) {

   644         // Use offset list to try all possibilities.

   645         offsets.setMaxLength(maxLength16);

   646     }

   647     int32_t pos=spanLength, rest=length-pos;

   648     int32_t i, stringsLength=strings.size();

   649     for(;;) {

   650         if(spanCondition==USET_SPAN_CONTAINED) {

   651             for(i=0; i<stringsLength; ++i) {

   652                 int32_t overlap=spanLengths[i];

   653                 if(overlap==ALL_CP_CONTAINED) {

   654                     continue;  // Irrelevant string.

   655                 }

   656                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   657                 const UChar *s16=string.getBuffer();

   658                 int32_t length16=string.length();

   660                 // Try to match this string at pos-overlap..pos.

   661                 if(overlap>=LONG_SPAN) {

   662                     overlap=length16;

   663                     // While contained: No point matching fully inside the code point span.

   664                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.

   665                 }

   666                 if(overlap>spanLength) {

   667                     overlap=spanLength;

   668                 }

   669                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.

   670                 for(;;) {

   671                     if(inc>rest) {

   672                         break;

   673                     }

   674                     // Try to match if the increment is not listed already.

   675                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {

   676                         if(inc==rest) {

   677                             return length;  // Reached the end of the string.

   678                         }

   679                         offsets.addOffset(inc);

   680                     }

   681                     if(overlap==0) {

   682                         break;

   683                     }

   684                     --overlap;

   685                     ++inc;

   686                 }

   687             }

   688         } else /* USET_SPAN_SIMPLE */ {

   689             int32_t maxInc=0, maxOverlap=0;

   690             for(i=0; i<stringsLength; ++i) {

   691                 int32_t overlap=spanLengths[i];

   692                 // For longest match, we do need to try to match even an all-contained string

   693                 // to find the match from the earliest start.

   695                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   696                 const UChar *s16=string.getBuffer();

   697                 int32_t length16=string.length();

   699                 // Try to match this string at pos-overlap..pos.

   700                 if(overlap>=LONG_SPAN) {

   701                     overlap=length16;

   702                     // Longest match: Need to match fully inside the code point span

   703                     // to find the match from the earliest start.

   704                 }

   705                 if(overlap>spanLength) {

   706                     overlap=spanLength;

   707                 }

   708                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.

   709                 for(;;) {

   710                     if(inc>rest || overlap<maxOverlap) {

   711                         break;

   712                     }

   713                     // Try to match if the string is longer or starts earlier.

   714                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&

   715                         matches16CPB(s, pos-overlap, length, s16, length16)

   716                     ) {

   717                         maxInc=inc;  // Longest match from earliest start.

   718                         maxOverlap=overlap;

   719                         break;

   720                     }

   721                     --overlap;

   722                     ++inc;

   723                 }

   724             }

   726             if(maxInc!=0 || maxOverlap!=0) {

   727                 // Longest-match algorithm, and there was a string match.

   728                 // Simply continue after it.

   729                 pos+=maxInc;

   730                 rest-=maxInc;

   731                 if(rest==0) {

   732                     return length;  // Reached the end of the string.

   733                 }

   734                 spanLength=0;  // Match strings from after a string match.

   735                 continue;

   736             }

   737         }

   738         // Finished trying to match all strings at pos.

   740         if(spanLength!=0 || pos==0) {

   741             // The position is after an unlimited code point span (spanLength!=0),

   742             // not after a string match.

   743             // The only position where spanLength==0 after a span is pos==0.

   744             // Otherwise, an unlimited code point span is only tried again when no

   745             // strings match, and if such a non-initial span fails we stop.

   746             if(offsets.isEmpty()) {

   747                 return pos;  // No strings matched after a span.

   748             }

   749             // Match strings from after the next string match.

   750         } else {

   751             // The position is after a string match (or a single code point).

   752             if(offsets.isEmpty()) {

   753                 // No more strings matched after a previous string match.

   754                 // Try another code point span from after the last string match.

   755                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);

   756                 if( spanLength==rest || // Reached the end of the string, or

   757                     spanLength==0       // neither strings nor span progressed.

   758                 ) {

   759                     return pos+spanLength;

   760                 }

   761                 pos+=spanLength;

   762                 rest-=spanLength;

   763                 continue;  // spanLength>0: Match strings from after a span.

   764             } else {

   765                 // Try to match only one code point from after a string match if some

   766                 // string matched beyond it, so that we try all possible positions

   767                 // and don't overshoot.

   768                 spanLength=spanOne(spanSet, s+pos, rest);

   769                 if(spanLength>0) {

   770                     if(spanLength==rest) {

   771                         return length;  // Reached the end of the string.

   772                     }

   773                     // Match strings after this code point.

   774                     // There cannot be any increments below it because UnicodeSet strings

   775                     // contain multiple code points.

   776                     pos+=spanLength;

   777                     rest-=spanLength;

   778                     offsets.shift(spanLength);

   779                     spanLength=0;

   780                     continue;  // Match strings from after a single code point.

   781                 }

   782                 // Match strings from after the next string match.

   783             }

   784         }

   785         int32_t minOffset=offsets.popMinimum();

   786         pos+=minOffset;

   787         rest-=minOffset;

   788         spanLength=0;  // Match strings from after a string match.

   789     }

   790 }

   792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {

   793     if(spanCondition==USET_SPAN_NOT_CONTAINED) {

   794         return spanNotBack(s, length);

   795     }

   796     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);

   797     if(pos==0) {

   798         return 0;

   799     }

   800     int32_t spanLength=length-pos;

   802     // Consider strings; they may overlap with the span.

   803     OffsetList offsets;

   804     if(spanCondition==USET_SPAN_CONTAINED) {

   805         // Use offset list to try all possibilities.

   806         offsets.setMaxLength(maxLength16);

   807     }

   808     int32_t i, stringsLength=strings.size();

   809     uint8_t *spanBackLengths=spanLengths;

   810     if(all) {

   811         spanBackLengths+=stringsLength;

   812     }

   813     for(;;) {

   814         if(spanCondition==USET_SPAN_CONTAINED) {

   815             for(i=0; i<stringsLength; ++i) {

   816                 int32_t overlap=spanBackLengths[i];

   817                 if(overlap==ALL_CP_CONTAINED) {

   818                     continue;  // Irrelevant string.

   819                 }

   820                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   821                 const UChar *s16=string.getBuffer();

   822                 int32_t length16=string.length();

   824                 // Try to match this string at pos-(length16-overlap)..pos-length16.

   825                 if(overlap>=LONG_SPAN) {

   826                     overlap=length16;

   827                     // While contained: No point matching fully inside the code point span.

   828                     int32_t len1=0;

   829                     U16_FWD_1(s16, len1, overlap);

   830                     overlap-=len1;  // Length of the string minus the first code point.

   831                 }

   832                 if(overlap>spanLength) {

   833                     overlap=spanLength;

   834                 }

   835                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.

   836                 for(;;) {

   837                     if(dec>pos) {

   838                         break;

   839                     }

   840                     // Try to match if the decrement is not listed already.

   841                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {

   842                         if(dec==pos) {

   843                             return 0;  // Reached the start of the string.

   844                         }

   845                         offsets.addOffset(dec);

   846                     }

   847                     if(overlap==0) {

   848                         break;

   849                     }

   850                     --overlap;

   851                     ++dec;

   852                 }

   853             }

   854         } else /* USET_SPAN_SIMPLE */ {

   855             int32_t maxDec=0, maxOverlap=0;

   856             for(i=0; i<stringsLength; ++i) {

   857                 int32_t overlap=spanBackLengths[i];

   858                 // For longest match, we do need to try to match even an all-contained string

   859                 // to find the match from the latest end.

   861                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

   862                 const UChar *s16=string.getBuffer();

   863                 int32_t length16=string.length();

   865                 // Try to match this string at pos-(length16-overlap)..pos-length16.

   866                 if(overlap>=LONG_SPAN) {

   867                     overlap=length16;

   868                     // Longest match: Need to match fully inside the code point span

   869                     // to find the match from the latest end.

   870                 }

   871                 if(overlap>spanLength) {

   872                     overlap=spanLength;

   873                 }

   874                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.

   875                 for(;;) {

   876                     if(dec>pos || overlap<maxOverlap) {

   877                         break;

   878                     }

   879                     // Try to match if the string is longer or ends later.

   880                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&

   881                         matches16CPB(s, pos-dec, length, s16, length16)

   882                     ) {

   883                         maxDec=dec;  // Longest match from latest end.

   884                         maxOverlap=overlap;

   885                         break;

   886                     }

   887                     --overlap;

   888                     ++dec;

   889                 }

   890             }

   892             if(maxDec!=0 || maxOverlap!=0) {

   893                 // Longest-match algorithm, and there was a string match.

   894                 // Simply continue before it.

   895                 pos-=maxDec;

   896                 if(pos==0) {

   897                     return 0;  // Reached the start of the string.

   898                 }

   899                 spanLength=0;  // Match strings from before a string match.

   900                 continue;

   901             }

   902         }

   903         // Finished trying to match all strings at pos.

   905         if(spanLength!=0 || pos==length) {

   906             // The position is before an unlimited code point span (spanLength!=0),

   907             // not before a string match.

   908             // The only position where spanLength==0 before a span is pos==length.

   909             // Otherwise, an unlimited code point span is only tried again when no

   910             // strings match, and if such a non-initial span fails we stop.

   911             if(offsets.isEmpty()) {

   912                 return pos;  // No strings matched before a span.

   913             }

   914             // Match strings from before the next string match.

   915         } else {

   916             // The position is before a string match (or a single code point).

   917             if(offsets.isEmpty()) {

   918                 // No more strings matched before a previous string match.

   919                 // Try another code point span from before the last string match.

   920                 int32_t oldPos=pos;

   921                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);

   922                 spanLength=oldPos-pos;

   923                 if( pos==0 ||           // Reached the start of the string, or

   924                     spanLength==0       // neither strings nor span progressed.

   925                 ) {

   926                     return pos;

   927                 }

   928                 continue;  // spanLength>0: Match strings from before a span.

   929             } else {

   930                 // Try to match only one code point from before a string match if some

   931                 // string matched beyond it, so that we try all possible positions

   932                 // and don't overshoot.

   933                 spanLength=spanOneBack(spanSet, s, pos);

   934                 if(spanLength>0) {

   935                     if(spanLength==pos) {

   936                         return 0;  // Reached the start of the string.

   937                     }

   938                     // Match strings before this code point.

   939                     // There cannot be any decrements below it because UnicodeSet strings

   940                     // contain multiple code points.

   941                     pos-=spanLength;

   942                     offsets.shift(spanLength);

   943                     spanLength=0;

   944                     continue;  // Match strings from before a single code point.

   945                 }

   946                 // Match strings from before the next string match.

   947             }

   948         }

   949         pos-=offsets.popMinimum();

   950         spanLength=0;  // Match strings from before a string match.

   951     }

   952 }

   954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {

   955     if(spanCondition==USET_SPAN_NOT_CONTAINED) {

   956         return spanNotUTF8(s, length);

   957     }

   958     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);

   959     if(spanLength==length) {

   960         return length;

   961     }

   963     // Consider strings; they may overlap with the span.

   964     OffsetList offsets;

   965     if(spanCondition==USET_SPAN_CONTAINED) {

   966         // Use offset list to try all possibilities.

   967         offsets.setMaxLength(maxLength8);

   968     }

   969     int32_t pos=spanLength, rest=length-pos;

   970     int32_t i, stringsLength=strings.size();

   971     uint8_t *spanUTF8Lengths=spanLengths;

   972     if(all) {

   973         spanUTF8Lengths+=2*stringsLength;

   974     }

   975     for(;;) {

   976         const uint8_t *s8=utf8;

   977         int32_t length8;

   978         if(spanCondition==USET_SPAN_CONTAINED) {

   979             for(i=0; i<stringsLength; ++i) {

   980                 length8=utf8Lengths[i];

   981                 if(length8==0) {

   982                     continue;  // String not representable in UTF-8.

   983                 }

   984                 int32_t overlap=spanUTF8Lengths[i];

   985                 if(overlap==ALL_CP_CONTAINED) {

   986                     s8+=length8;

   987                     continue;  // Irrelevant string.

   988                 }

   990                 // Try to match this string at pos-overlap..pos.

   991                 if(overlap>=LONG_SPAN) {

   992                     overlap=length8;

   993                     // While contained: No point matching fully inside the code point span.

   994                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.

   995                 }

   996                 if(overlap>spanLength) {

   997                     overlap=spanLength;

   998                 }

   999                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.

  1000                 for(;;) {

  1001                     if(inc>rest) {

  1002                         break;

  1003                     }

  1004                     // Try to match if the increment is not listed already.

  1005                     // Match at code point boundaries. (The UTF-8 strings were converted

  1006                     // from UTF-16 and are guaranteed to be well-formed.)

  1007                     if( !U8_IS_TRAIL(s[pos-overlap]) &&

  1008                         !offsets.containsOffset(inc) &&

  1009                         matches8(s+pos-overlap, s8, length8)

  1011                     ) {

  1012                         if(inc==rest) {

  1013                             return length;  // Reached the end of the string.

  1014                         }

  1015                         offsets.addOffset(inc);

  1016                     }

  1017                     if(overlap==0) {

  1018                         break;

  1019                     }

  1020                     --overlap;

  1021                     ++inc;

  1022                 }

  1023                 s8+=length8;

  1024             }

  1025         } else /* USET_SPAN_SIMPLE */ {

  1026             int32_t maxInc=0, maxOverlap=0;

  1027             for(i=0; i<stringsLength; ++i) {

  1028                 length8=utf8Lengths[i];

  1029                 if(length8==0) {

  1030                     continue;  // String not representable in UTF-8.

  1031                 }

  1032                 int32_t overlap=spanUTF8Lengths[i];

  1033                 // For longest match, we do need to try to match even an all-contained string

  1034                 // to find the match from the earliest start.

  1036                 // Try to match this string at pos-overlap..pos.

  1037                 if(overlap>=LONG_SPAN) {

  1038                     overlap=length8;

  1039                     // Longest match: Need to match fully inside the code point span

  1040                     // to find the match from the earliest start.

  1041                 }

  1042                 if(overlap>spanLength) {

  1043                     overlap=spanLength;

  1044                 }

  1045                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.

  1046                 for(;;) {

  1047                     if(inc>rest || overlap<maxOverlap) {

  1048                         break;

  1049                     }

  1050                     // Try to match if the string is longer or starts earlier.

  1051                     // Match at code point boundaries. (The UTF-8 strings were converted

  1052                     // from UTF-16 and are guaranteed to be well-formed.)

  1053                     if( !U8_IS_TRAIL(s[pos-overlap]) &&

  1054                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&

  1055                         matches8(s+pos-overlap, s8, length8)

  1057                     ) {

  1058                         maxInc=inc;  // Longest match from earliest start.

  1059                         maxOverlap=overlap;

  1060                         break;

  1061                     }

  1062                     --overlap;

  1063                     ++inc;

  1064                 }

  1065                 s8+=length8;

  1066             }

  1068             if(maxInc!=0 || maxOverlap!=0) {

  1069                 // Longest-match algorithm, and there was a string match.

  1070                 // Simply continue after it.

  1071                 pos+=maxInc;

  1072                 rest-=maxInc;

  1073                 if(rest==0) {

  1074                     return length;  // Reached the end of the string.

  1075                 }

  1076                 spanLength=0;  // Match strings from after a string match.

  1077                 continue;

  1078             }

  1079         }

  1080         // Finished trying to match all strings at pos.

  1082         if(spanLength!=0 || pos==0) {

  1083             // The position is after an unlimited code point span (spanLength!=0),

  1084             // not after a string match.

  1085             // The only position where spanLength==0 after a span is pos==0.

  1086             // Otherwise, an unlimited code point span is only tried again when no

  1087             // strings match, and if such a non-initial span fails we stop.

  1088             if(offsets.isEmpty()) {

  1089                 return pos;  // No strings matched after a span.

  1090             }

  1091             // Match strings from after the next string match.

  1092         } else {

  1093             // The position is after a string match (or a single code point).

  1094             if(offsets.isEmpty()) {

  1095                 // No more strings matched after a previous string match.

  1096                 // Try another code point span from after the last string match.

  1097                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);

  1098                 if( spanLength==rest || // Reached the end of the string, or

  1099                     spanLength==0       // neither strings nor span progressed.

  1100                 ) {

  1101                     return pos+spanLength;

  1102                 }

  1103                 pos+=spanLength;

  1104                 rest-=spanLength;

  1105                 continue;  // spanLength>0: Match strings from after a span.

  1106             } else {

  1107                 // Try to match only one code point from after a string match if some

  1108                 // string matched beyond it, so that we try all possible positions

  1109                 // and don't overshoot.

  1110                 spanLength=spanOneUTF8(spanSet, s+pos, rest);

  1111                 if(spanLength>0) {

  1112                     if(spanLength==rest) {

  1113                         return length;  // Reached the end of the string.

  1114                     }

  1115                     // Match strings after this code point.

  1116                     // There cannot be any increments below it because UnicodeSet strings

  1117                     // contain multiple code points.

  1118                     pos+=spanLength;

  1119                     rest-=spanLength;

  1120                     offsets.shift(spanLength);

  1121                     spanLength=0;

  1122                     continue;  // Match strings from after a single code point.

  1123                 }

  1124                 // Match strings from after the next string match.

  1125             }

  1126         }

  1127         int32_t minOffset=offsets.popMinimum();

  1128         pos+=minOffset;

  1129         rest-=minOffset;

  1130         spanLength=0;  // Match strings from after a string match.

  1131     }

  1132 }

  1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {

  1135     if(spanCondition==USET_SPAN_NOT_CONTAINED) {

  1136         return spanNotBackUTF8(s, length);

  1137     }

  1138     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);

  1139     if(pos==0) {

  1140         return 0;

  1141     }

  1142     int32_t spanLength=length-pos;

  1144     // Consider strings; they may overlap with the span.

  1145     OffsetList offsets;

  1146     if(spanCondition==USET_SPAN_CONTAINED) {

  1147         // Use offset list to try all possibilities.

  1148         offsets.setMaxLength(maxLength8);

  1149     }

  1150     int32_t i, stringsLength=strings.size();

  1151     uint8_t *spanBackUTF8Lengths=spanLengths;

  1152     if(all) {

  1153         spanBackUTF8Lengths+=3*stringsLength;

  1154     }

  1155     for(;;) {

  1156         const uint8_t *s8=utf8;

  1157         int32_t length8;

  1158         if(spanCondition==USET_SPAN_CONTAINED) {

  1159             for(i=0; i<stringsLength; ++i) {

  1160                 length8=utf8Lengths[i];

  1161                 if(length8==0) {

  1162                     continue;  // String not representable in UTF-8.

  1163                 }

  1164                 int32_t overlap=spanBackUTF8Lengths[i];

  1165                 if(overlap==ALL_CP_CONTAINED) {

  1166                     s8+=length8;

  1167                     continue;  // Irrelevant string.

  1168                 }

  1170                 // Try to match this string at pos-(length8-overlap)..pos-length8.

  1171                 if(overlap>=LONG_SPAN) {

  1172                     overlap=length8;

  1173                     // While contained: No point matching fully inside the code point span.

  1174                     int32_t len1=0;

  1175                     U8_FWD_1(s8, len1, overlap);

  1176                     overlap-=len1;  // Length of the string minus the first code point.

  1177                 }

  1178                 if(overlap>spanLength) {

  1179                     overlap=spanLength;

  1180                 }

  1181                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.

  1182                 for(;;) {

  1183                     if(dec>pos) {

  1184                         break;

  1185                     }

  1186                     // Try to match if the decrement is not listed already.

  1187                     // Match at code point boundaries. (The UTF-8 strings were converted

  1188                     // from UTF-16 and are guaranteed to be well-formed.)

  1189                     if( !U8_IS_TRAIL(s[pos-dec]) &&

  1190                         !offsets.containsOffset(dec) &&

  1191                         matches8(s+pos-dec, s8, length8)

  1192                     ) {

  1193                         if(dec==pos) {

  1194                             return 0;  // Reached the start of the string.

  1195                         }

  1196                         offsets.addOffset(dec);

  1197                     }

  1198                     if(overlap==0) {

  1199                         break;

  1200                     }

  1201                     --overlap;

  1202                     ++dec;

  1203                 }

  1204                 s8+=length8;

  1205             }

  1206         } else /* USET_SPAN_SIMPLE */ {

  1207             int32_t maxDec=0, maxOverlap=0;

  1208             for(i=0; i<stringsLength; ++i) {

  1209                 length8=utf8Lengths[i];

  1210                 if(length8==0) {

  1211                     continue;  // String not representable in UTF-8.

  1212                 }

  1213                 int32_t overlap=spanBackUTF8Lengths[i];

  1214                 // For longest match, we do need to try to match even an all-contained string

  1215                 // to find the match from the latest end.

  1217                 // Try to match this string at pos-(length8-overlap)..pos-length8.

  1218                 if(overlap>=LONG_SPAN) {

  1219                     overlap=length8;

  1220                     // Longest match: Need to match fully inside the code point span

  1221                     // to find the match from the latest end.

  1222                 }

  1223                 if(overlap>spanLength) {

  1224                     overlap=spanLength;

  1225                 }

  1226                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.

  1227                 for(;;) {

  1228                     if(dec>pos || overlap<maxOverlap) {

  1229                         break;

  1230                     }

  1231                     // Try to match if the string is longer or ends later.

  1232                     // Match at code point boundaries. (The UTF-8 strings were converted

  1233                     // from UTF-16 and are guaranteed to be well-formed.)

  1234                     if( !U8_IS_TRAIL(s[pos-dec]) &&

  1235                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&

  1236                         matches8(s+pos-dec, s8, length8)

  1237                     ) {

  1238                         maxDec=dec;  // Longest match from latest end.

  1239                         maxOverlap=overlap;

  1240                         break;

  1241                     }

  1242                     --overlap;

  1243                     ++dec;

  1244                 }

  1245                 s8+=length8;

  1246             }

  1248             if(maxDec!=0 || maxOverlap!=0) {

  1249                 // Longest-match algorithm, and there was a string match.

  1250                 // Simply continue before it.

  1251                 pos-=maxDec;

  1252                 if(pos==0) {

  1253                     return 0;  // Reached the start of the string.

  1254                 }

  1255                 spanLength=0;  // Match strings from before a string match.

  1256                 continue;

  1257             }

  1258         }

  1259         // Finished trying to match all strings at pos.

  1261         if(spanLength!=0 || pos==length) {

  1262             // The position is before an unlimited code point span (spanLength!=0),

  1263             // not before a string match.

  1264             // The only position where spanLength==0 before a span is pos==length.

  1265             // Otherwise, an unlimited code point span is only tried again when no

  1266             // strings match, and if such a non-initial span fails we stop.

  1267             if(offsets.isEmpty()) {

  1268                 return pos;  // No strings matched before a span.

  1269             }

  1270             // Match strings from before the next string match.

  1271         } else {

  1272             // The position is before a string match (or a single code point).

  1273             if(offsets.isEmpty()) {

  1274                 // No more strings matched before a previous string match.

  1275                 // Try another code point span from before the last string match.

  1276                 int32_t oldPos=pos;

  1277                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);

  1278                 spanLength=oldPos-pos;

  1279                 if( pos==0 ||           // Reached the start of the string, or

  1280                     spanLength==0       // neither strings nor span progressed.

  1281                 ) {

  1282                     return pos;

  1283                 }

  1284                 continue;  // spanLength>0: Match strings from before a span.

  1285             } else {

  1286                 // Try to match only one code point from before a string match if some

  1287                 // string matched beyond it, so that we try all possible positions

  1288                 // and don't overshoot.

  1289                 spanLength=spanOneBackUTF8(spanSet, s, pos);

  1290                 if(spanLength>0) {

  1291                     if(spanLength==pos) {

  1292                         return 0;  // Reached the start of the string.

  1293                     }

  1294                     // Match strings before this code point.

  1295                     // There cannot be any decrements below it because UnicodeSet strings

  1296                     // contain multiple code points.

  1297                     pos-=spanLength;

  1298                     offsets.shift(spanLength);

  1299                     spanLength=0;

  1300                     continue;  // Match strings from before a single code point.

  1301                 }

  1302                 // Match strings from before the next string match.

  1303             }

  1304         }

  1305         pos-=offsets.popMinimum();

  1306         spanLength=0;  // Match strings from before a string match.

  1307     }

  1308 }

  1310 /*

  1311  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)

  1312  *

  1313  * Theoretical algorithm:

  1314  * - Iterate through the string, and at each code point boundary:

  1315  *   + If the code point there is in the set, then return with the current position.

  1316  *   + If a set string matches at the current position, then return with the current position.

  1317  *

  1318  * Optimized implementation:

  1319  *

  1320  * (Same assumption as for span() above.)

  1321  *

  1322  * Create and cache a spanNotSet which contains all of the single code points

  1323  * of the original set but none of its strings.

  1324  * For each set string add its initial code point to the spanNotSet.

  1325  * (Also add its final code point for spanNotBack().)

  1326  *

  1327  * - Loop:

  1328  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).

  1329  *   + If the current code point is in the original set, then

  1330  *     return the current position.

  1331  *   + If any set string matches at the current position, then

  1332  *     return the current position.

  1333  *   + If there is no match at the current position, neither for the code point there

  1334  *     nor for any set string, then skip this code point and continue the loop.

  1335  *     This happens for set-string-initial code points that were added to spanNotSet

  1336  *     when there is not actually a match for such a set string.

  1337  */

  1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {

  1340     int32_t pos=0, rest=length;

  1341     int32_t i, stringsLength=strings.size();

  1342     do {

  1343         // Span until we find a code point from the set,

  1344         // or a code point that starts or ends some string.

  1345         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);

  1346         if(i==rest) {

  1347             return length;  // Reached the end of the string.

  1348         }

  1349         pos+=i;

  1350         rest-=i;

  1352         // Check whether the current code point is in the original set,

  1353         // without the string starts and ends.

  1354         int32_t cpLength=spanOne(spanSet, s+pos, rest);

  1355         if(cpLength>0) {

  1356             return pos;  // There is a set element at pos.

  1357         }

  1359         // Try to match the strings at pos.

  1360         for(i=0; i<stringsLength; ++i) {

  1361             if(spanLengths[i]==ALL_CP_CONTAINED) {

  1362                 continue;  // Irrelevant string.

  1363             }

  1364             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

  1365             const UChar *s16=string.getBuffer();

  1366             int32_t length16=string.length();

  1367             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {

  1368                 return pos;  // There is a set element at pos.

  1369             }

  1370         }

  1372         // The span(while not contained) ended on a string start/end which is

  1373         // not in the original set. Skip this code point and continue.

  1374         // cpLength<0

  1375         pos-=cpLength;

  1376         rest+=cpLength;

  1377     } while(rest!=0);

  1378     return length;  // Reached the end of the string.

  1379 }

  1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {

  1382     int32_t pos=length;

  1383     int32_t i, stringsLength=strings.size();

  1384     do {

  1385         // Span until we find a code point from the set,

  1386         // or a code point that starts or ends some string.

  1387         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);

  1388         if(pos==0) {

  1389             return 0;  // Reached the start of the string.

  1390         }

  1392         // Check whether the current code point is in the original set,

  1393         // without the string starts and ends.

  1394         int32_t cpLength=spanOneBack(spanSet, s, pos);

  1395         if(cpLength>0) {

  1396             return pos;  // There is a set element at pos.

  1397         }

  1399         // Try to match the strings at pos.

  1400         for(i=0; i<stringsLength; ++i) {

  1401             // Use spanLengths rather than a spanBackLengths pointer because

  1402             // it is easier and we only need to know whether the string is irrelevant

  1403             // which is the same in either array.

  1404             if(spanLengths[i]==ALL_CP_CONTAINED) {

  1405                 continue;  // Irrelevant string.

  1406             }

  1407             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);

  1408             const UChar *s16=string.getBuffer();

  1409             int32_t length16=string.length();

  1410             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {

  1411                 return pos;  // There is a set element at pos.

  1412             }

  1413         }

  1415         // The span(while not contained) ended on a string start/end which is

  1416         // not in the original set. Skip this code point and continue.

  1417         // cpLength<0

  1418         pos+=cpLength;

  1419     } while(pos!=0);

  1420     return 0;  // Reached the start of the string.

  1421 }

  1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {

  1424     int32_t pos=0, rest=length;

  1425     int32_t i, stringsLength=strings.size();

  1426     uint8_t *spanUTF8Lengths=spanLengths;

  1427     if(all) {

  1428         spanUTF8Lengths+=2*stringsLength;

  1429     }

  1430     do {

  1431         // Span until we find a code point from the set,

  1432         // or a code point that starts or ends some string.

  1433         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);

  1434         if(i==rest) {

  1435             return length;  // Reached the end of the string.

  1436         }

  1437         pos+=i;

  1438         rest-=i;

  1440         // Check whether the current code point is in the original set,

  1441         // without the string starts and ends.

  1442         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);

  1443         if(cpLength>0) {

  1444             return pos;  // There is a set element at pos.

  1445         }

  1447         // Try to match the strings at pos.

  1448         const uint8_t *s8=utf8;

  1449         int32_t length8;

  1450         for(i=0; i<stringsLength; ++i) {

  1451             length8=utf8Lengths[i];

  1452             // ALL_CP_CONTAINED: Irrelevant string.

  1453             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {

  1454                 return pos;  // There is a set element at pos.

  1455             }

  1456             s8+=length8;

  1457         }

  1459         // The span(while not contained) ended on a string start/end which is

  1460         // not in the original set. Skip this code point and continue.

  1461         // cpLength<0

  1462         pos-=cpLength;

  1463         rest+=cpLength;

  1464     } while(rest!=0);

  1465     return length;  // Reached the end of the string.

  1466 }

  1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {

  1469     int32_t pos=length;

  1470     int32_t i, stringsLength=strings.size();

  1471     uint8_t *spanBackUTF8Lengths=spanLengths;

  1472     if(all) {

  1473         spanBackUTF8Lengths+=3*stringsLength;

  1474     }

  1475     do {

  1476         // Span until we find a code point from the set,

  1477         // or a code point that starts or ends some string.

  1478         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);

  1479         if(pos==0) {

  1480             return 0;  // Reached the start of the string.

  1481         }

  1483         // Check whether the current code point is in the original set,

  1484         // without the string starts and ends.

  1485         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);

  1486         if(cpLength>0) {

  1487             return pos;  // There is a set element at pos.

  1488         }

  1490         // Try to match the strings at pos.

  1491         const uint8_t *s8=utf8;

  1492         int32_t length8;

  1493         for(i=0; i<stringsLength; ++i) {

  1494             length8=utf8Lengths[i];

  1495             // ALL_CP_CONTAINED: Irrelevant string.

  1496             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {

  1497                 return pos;  // There is a set element at pos.

  1498             }

  1499             s8+=length8;

  1500         }

  1502         // The span(while not contained) ended on a string start/end which is

  1503         // not in the original set. Skip this code point and continue.

  1504         // cpLength<0

  1505         pos+=cpLength;

  1506     } while(pos!=0);

  1507     return 0;  // Reached the start of the string.

  1508 }

  1510 U_NAMESPACE_END

The Tor Browser / file revision

intl/icu/source/common/unisetspan.cpp@fc2d59ddac77

intl/icu/source/common/unisetspan.cpp