intl/icu/source/common/unisetspan.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2007-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *   file name:  unisetspan.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2007mar01
    14 *   created by: Markus W. Scherer
    15 */
    17 #include "unicode/utypes.h"
    18 #include "unicode/uniset.h"
    19 #include "unicode/ustring.h"
    20 #include "unicode/utf8.h"
    21 #include "unicode/utf16.h"
    22 #include "cmemory.h"
    23 #include "uvector.h"
    24 #include "unisetspan.h"
    26 U_NAMESPACE_BEGIN
    28 /*
    29  * List of offsets from the current position from where to try matching
    30  * a code point or a string.
    31  * Store offsets rather than indexes to simplify the code and use the same list
    32  * for both increments (in span()) and decrements (in spanBack()).
    33  *
    34  * Assumption: The maximum offset is limited, and the offsets that are stored
    35  * at any one time are relatively dense, that is, there are normally no gaps of
    36  * hundreds or thousands of offset values.
    37  *
    38  * The implementation uses a circular buffer of byte flags,
    39  * each indicating whether the corresponding offset is in the list.
    40  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
    41  * physically moving part of the list.
    42  *
    43  * Note: In principle, the caller should setMaxLength() to the maximum of the
    44  * max string length and U16_LENGTH/U8_LENGTH to account for
    45  * "long" single code points.
    46  * However, this implementation uses at least a staticList with more than
    47  * U8_LENGTH entries anyway.
    48  *
    49  * Note: If maxLength were guaranteed to be no more than 32 or 64,
    50  * the list could be stored as bit flags in a single integer.
    51  * Rather than handling a circular buffer with a start list index,
    52  * the integer would simply be shifted when lower offsets are removed.
    53  * UnicodeSet does not have a limit on the lengths of strings.
    54  */
    55 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
    56 public:
    57     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
    59     ~OffsetList() {
    60         if(list!=staticList) {
    61             uprv_free(list);
    62         }
    63     }
    65     // Call exactly once if the list is to be used.
    66     void setMaxLength(int32_t maxLength) {
    67         if(maxLength<=(int32_t)sizeof(staticList)) {
    68             capacity=(int32_t)sizeof(staticList);
    69         } else {
    70             UBool *l=(UBool *)uprv_malloc(maxLength);
    71             if(l!=NULL) {
    72                 list=l;
    73                 capacity=maxLength;
    74             }
    75         }
    76         uprv_memset(list, 0, capacity);
    77     }
    79     void clear() {
    80         uprv_memset(list, 0, capacity);
    81         start=length=0;
    82     }
    84     UBool isEmpty() const {
    85         return (UBool)(length==0);
    86     }
    88     // Reduce all stored offsets by delta, used when the current position
    89     // moves by delta.
    90     // There must not be any offsets lower than delta.
    91     // If there is an offset equal to delta, it is removed.
    92     // delta=[1..maxLength]
    93     void shift(int32_t delta) {
    94         int32_t i=start+delta;
    95         if(i>=capacity) {
    96             i-=capacity;
    97         }
    98         if(list[i]) {
    99             list[i]=FALSE;
   100             --length;
   101         }
   102         start=i;
   103     }
   105     // Add an offset. The list must not contain it yet.
   106     // offset=[1..maxLength]
   107     void addOffset(int32_t offset) {
   108         int32_t i=start+offset;
   109         if(i>=capacity) {
   110             i-=capacity;
   111         }
   112         list[i]=TRUE;
   113         ++length;
   114     }
   116     // offset=[1..maxLength]
   117     UBool containsOffset(int32_t offset) const {
   118         int32_t i=start+offset;
   119         if(i>=capacity) {
   120             i-=capacity;
   121         }
   122         return list[i];
   123     }
   125     // Find the lowest stored offset from a non-empty list, remove it,
   126     // and reduce all other offsets by this minimum.
   127     // Returns [1..maxLength].
   128     int32_t popMinimum() {
   129         // Look for the next offset in list[start+1..capacity-1].
   130         int32_t i=start, result;
   131         while(++i<capacity) {
   132             if(list[i]) {
   133                 list[i]=FALSE;
   134                 --length;
   135                 result=i-start;
   136                 start=i;
   137                 return result;
   138             }
   139         }
   140         // i==capacity
   142         // Wrap around and look for the next offset in list[0..start].
   143         // Since the list is not empty, there will be one.
   144         result=capacity-start;
   145         i=0;
   146         while(!list[i]) {
   147             ++i;
   148         }
   149         list[i]=FALSE;
   150         --length;
   151         start=i;
   152         return result+=i;
   153     }
   155 private:
   156     UBool *list;
   157     int32_t capacity;
   158     int32_t length;
   159     int32_t start;
   161     UBool staticList[16];
   162 };
   164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
   165 static int32_t
   166 getUTF8Length(const UChar *s, int32_t length) {
   167     UErrorCode errorCode=U_ZERO_ERROR;
   168     int32_t length8=0;
   169     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
   170     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
   171         return length8;
   172     } else {
   173         // The string contains an unpaired surrogate.
   174         // Ignore this string.
   175         return 0;
   176     }
   177 }
   179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
   180 static int32_t
   181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
   182     UErrorCode errorCode=U_ZERO_ERROR;
   183     int32_t length8=0;
   184     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
   185     if(U_SUCCESS(errorCode)) {
   186         return length8;
   187     } else {
   188         // The string contains an unpaired surrogate.
   189         // Ignore this string.
   190         return 0;
   191     }
   192 }
   194 static inline uint8_t
   195 makeSpanLengthByte(int32_t spanLength) {
   196     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
   197     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
   198 }
   200 // Construct for all variants of span(), or only for any one variant.
   201 // Initialize as little as possible, for single use.
   202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
   203                                            const UVector &setStrings,
   204                                            uint32_t which)
   205         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
   206           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
   207           utf8Length(0),
   208           maxLength16(0), maxLength8(0),
   209           all((UBool)(which==ALL)) {
   210     spanSet.retainAll(set);
   211     if(which&NOT_CONTAINED) {
   212         // Default to the same sets.
   213         // addToSpanNotSet() will create a separate set if necessary.
   214         pSpanNotSet=&spanSet;
   215     }
   217     // Determine if the strings even need to be taken into account at all for span() etc.
   218     // If any string is relevant, then all strings need to be used for
   219     // span(longest match) but only the relevant ones for span(while contained).
   220     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
   221     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
   222     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
   223     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
   224     int32_t stringsLength=strings.size();
   226     int32_t i, spanLength;
   227     UBool someRelevant=FALSE;
   228     for(i=0; i<stringsLength; ++i) {
   229         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   230         const UChar *s16=string.getBuffer();
   231         int32_t length16=string.length();
   232         UBool thisRelevant;
   233         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
   234         if(spanLength<length16) {  // Relevant string.
   235             someRelevant=thisRelevant=TRUE;
   236         } else {
   237             thisRelevant=FALSE;
   238         }
   239         if((which&UTF16) && length16>maxLength16) {
   240             maxLength16=length16;
   241         }
   242         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
   243             int32_t length8=getUTF8Length(s16, length16);
   244             utf8Length+=length8;
   245             if(length8>maxLength8) {
   246                 maxLength8=length8;
   247             }
   248         }
   249     }
   250     if(!someRelevant) {
   251         maxLength16=maxLength8=0;
   252         return;
   253     }
   255     // Freeze after checking for the need to use strings at all because freezing
   256     // a set takes some time and memory which are wasted if there are no relevant strings.
   257     if(all) {
   258         spanSet.freeze();
   259     }
   261     uint8_t *spanBackLengths;
   262     uint8_t *spanUTF8Lengths;
   263     uint8_t *spanBackUTF8Lengths;
   265     // Allocate a block of meta data.
   266     int32_t allocSize;
   267     if(all) {
   268         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
   269         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
   270     } else {
   271         allocSize=stringsLength;  // One set of span lengths.
   272         if(which&UTF8) {
   273             // UTF-8 lengths and UTF-8 strings.
   274             allocSize+=stringsLength*4+utf8Length;
   275         }
   276     }
   277     if(allocSize<=(int32_t)sizeof(staticLengths)) {
   278         utf8Lengths=staticLengths;
   279     } else {
   280         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
   281         if(utf8Lengths==NULL) {
   282             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
   283             return;  // Out of memory.
   284         }
   285     }
   287     if(all) {
   288         // Store span lengths for all span() variants.
   289         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
   290         spanBackLengths=spanLengths+stringsLength;
   291         spanUTF8Lengths=spanBackLengths+stringsLength;
   292         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
   293         utf8=spanBackUTF8Lengths+stringsLength;
   294     } else {
   295         // Store span lengths for only one span() variant.
   296         if(which&UTF8) {
   297             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
   298             utf8=spanLengths+stringsLength;
   299         } else {
   300             spanLengths=(uint8_t *)utf8Lengths;
   301         }
   302         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
   303     }
   305     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
   306     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
   308     for(i=0; i<stringsLength; ++i) {
   309         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   310         const UChar *s16=string.getBuffer();
   311         int32_t length16=string.length();
   312         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
   313         if(spanLength<length16) {  // Relevant string.
   314             if(which&UTF16) {
   315                 if(which&CONTAINED) {
   316                     if(which&FWD) {
   317                         spanLengths[i]=makeSpanLengthByte(spanLength);
   318                     }
   319                     if(which&BACK) {
   320                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
   321                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
   322                     }
   323                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
   324                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
   325                 }
   326             }
   327             if(which&UTF8) {
   328                 uint8_t *s8=utf8+utf8Count;
   329                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
   330                 utf8Count+=utf8Lengths[i]=length8;
   331                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
   332                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
   333                 } else {  // Relevant for UTF-8.
   334                     if(which&CONTAINED) {
   335                         if(which&FWD) {
   336                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
   337                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
   338                         }
   339                         if(which&BACK) {
   340                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
   341                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
   342                         }
   343                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
   344                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
   345                     }
   346                 }
   347             }
   348             if(which&NOT_CONTAINED) {
   349                 // Add string start and end code points to the spanNotSet so that
   350                 // a span(while not contained) stops before any string.
   351                 UChar32 c;
   352                 if(which&FWD) {
   353                     int32_t len=0;
   354                     U16_NEXT(s16, len, length16, c);
   355                     addToSpanNotSet(c);
   356                 }
   357                 if(which&BACK) {
   358                     int32_t len=length16;
   359                     U16_PREV(s16, 0, len, c);
   360                     addToSpanNotSet(c);
   361                 }
   362             }
   363         } else {  // Irrelevant string.
   364             if(which&UTF8) {
   365                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
   366                     uint8_t *s8=utf8+utf8Count;
   367                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
   368                     utf8Count+=utf8Lengths[i]=length8;
   369                 } else {
   370                     utf8Lengths[i]=0;
   371                 }
   372             }
   373             if(all) {
   374                 spanLengths[i]=spanBackLengths[i]=
   375                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
   376                         (uint8_t)ALL_CP_CONTAINED;
   377             } else {
   378                 // All spanXYZLengths pointers contain the same address.
   379                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
   380             }
   381         }
   382     }
   384     // Finish.
   385     if(all) {
   386         pSpanNotSet->freeze();
   387     }
   388 }
   390 // Copy constructor. Assumes which==ALL for a frozen set.
   391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
   392                                            const UVector &newParentSetStrings)
   393         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
   394           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
   395           utf8Length(otherStringSpan.utf8Length),
   396           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
   397           all(TRUE) {
   398     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
   399         pSpanNotSet=&spanSet;
   400     } else {
   401         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
   402     }
   404     // Allocate a block of meta data.
   405     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
   406     int32_t stringsLength=strings.size();
   407     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
   408     if(allocSize<=(int32_t)sizeof(staticLengths)) {
   409         utf8Lengths=staticLengths;
   410     } else {
   411         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
   412         if(utf8Lengths==NULL) {
   413             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
   414             return;  // Out of memory.
   415         }
   416     }
   418     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
   419     utf8=spanLengths+stringsLength*4;
   420     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
   421 }
   423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
   424     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
   425         delete pSpanNotSet;
   426     }
   427     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
   428         uprv_free(utf8Lengths);
   429     }
   430 }
   432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
   433     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
   434         if(spanSet.contains(c)) {
   435             return;  // Nothing to do.
   436         }
   437         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
   438         if(newSet==NULL) {
   439             return;  // Out of memory.
   440         } else {
   441             pSpanNotSet=newSet;
   442         }
   443     }
   444     pSpanNotSet->add(c);
   445 }
   447 // Compare strings without any argument checks. Requires length>0.
   448 static inline UBool
   449 matches16(const UChar *s, const UChar *t, int32_t length) {
   450     do {
   451         if(*s++!=*t++) {
   452             return FALSE;
   453         }
   454     } while(--length>0);
   455     return TRUE;
   456 }
   458 static inline UBool
   459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
   460     do {
   461         if(*s++!=*t++) {
   462             return FALSE;
   463         }
   464     } while(--length>0);
   465     return TRUE;
   466 }
   468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
   469 // at code point boundaries.
   470 // That is, each edge of a match must not be in the middle of a surrogate pair.
   471 static inline UBool
   472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
   473     s+=start;
   474     limit-=start;
   475     return matches16(s, t, length) &&
   476            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
   477            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
   478 }
   480 // Does the set contain the next code point?
   481 // If so, return its length; otherwise return its negative length.
   482 static inline int32_t
   483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
   484     UChar c=*s, c2;
   485     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
   486         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
   487     }
   488     return set.contains(c) ? 1 : -1;
   489 }
   491 static inline int32_t
   492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
   493     UChar c=s[length-1], c2;
   494     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
   495         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
   496     }
   497     return set.contains(c) ? 1 : -1;
   498 }
   500 static inline int32_t
   501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
   502     UChar32 c=*s;
   503     if((int8_t)c>=0) {
   504         return set.contains(c) ? 1 : -1;
   505     }
   506     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
   507     int32_t i=0;
   508     U8_NEXT_OR_FFFD(s, i, length, c);
   509     return set.contains(c) ? i : -i;
   510 }
   512 static inline int32_t
   513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
   514     UChar32 c=s[length-1];
   515     if((int8_t)c>=0) {
   516         return set.contains(c) ? 1 : -1;
   517     }
   518     int32_t i=length-1;
   519     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
   520     length-=i;
   521     return set.contains(c) ? length : -length;
   522 }
   524 /*
   525  * Note: In span() when spanLength==0 (after a string match, or at the beginning
   526  * after an empty code point span) and in spanNot() and spanNotUTF8(),
   527  * string matching could use a binary search
   528  * because all string matches are done from the same start index.
   529  *
   530  * For UTF-8, this would require a comparison function that returns UTF-16 order.
   531  *
   532  * This optimization should not be necessary for normal UnicodeSets because
   533  * most sets have no strings, and most sets with strings have
   534  * very few very short strings.
   535  * For cases with many strings, it might be better to use a different API
   536  * and implementation with a DFA (state machine).
   537  */
   539 /*
   540  * Algorithm for span(USET_SPAN_CONTAINED)
   541  *
   542  * Theoretical algorithm:
   543  * - Iterate through the string, and at each code point boundary:
   544  *   + If the code point there is in the set, then remember to continue after it.
   545  *   + If a set string matches at the current position, then remember to continue after it.
   546  *   + Either recursively span for each code point or string match,
   547  *     or recursively span for all but the shortest one and
   548  *     iteratively continue the span with the shortest local match.
   549  *   + Remember the longest recursive span (the farthest end point).
   550  *   + If there is no match at the current position, neither for the code point there
   551  *     nor for any set string, then stop and return the longest recursive span length.
   552  *
   553  * Optimized implementation:
   554  *
   555  * (We assume that most sets will have very few very short strings.
   556  * A span using a string-less set is extremely fast.)
   557  *
   558  * Create and cache a spanSet which contains all of the single code points
   559  * of the original set but none of its strings.
   560  *
   561  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
   562  * - Loop:
   563  *   + Try to match each set string at the end of the spanLength.
   564  *     ~ Set strings that start with set-contained code points must be matched
   565  *       with a partial overlap because the recursive algorithm would have tried
   566  *       to match them at every position.
   567  *     ~ Set strings that entirely consist of set-contained code points
   568  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
   569  *       recursive algorithm would continue after them anyway
   570  *       and find the longest recursive match from their end.
   571  *     ~ Rather than recursing, note each end point of a set string match.
   572  *   + If no set string matched after spanSet.span(), then return
   573  *     with where the spanSet.span() ended.
   574  *   + If at least one set string matched after spanSet.span(), then
   575  *     pop the shortest string match end point and continue
   576  *     the loop, trying to match all set strings from there.
   577  *   + If at least one more set string matched after a previous string match,
   578  *     then test if the code point after the previous string match is also
   579  *     contained in the set.
   580  *     Continue the loop with the shortest end point of either this code point
   581  *     or a matching set string.
   582  *   + If no more set string matched after a previous string match,
   583  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
   584  *     Stop if spanLength==0, otherwise continue the loop.
   585  *
   586  * By noting each end point of a set string match,
   587  * the function visits each string position at most once and finishes
   588  * in linear time.
   589  *
   590  * The recursive algorithm may visit the same string position many times
   591  * if multiple paths lead to it and finishes in exponential time.
   592  */
   594 /*
   595  * Algorithm for span(USET_SPAN_SIMPLE)
   596  *
   597  * Theoretical algorithm:
   598  * - Iterate through the string, and at each code point boundary:
   599  *   + If the code point there is in the set, then remember to continue after it.
   600  *   + If a set string matches at the current position, then remember to continue after it.
   601  *   + Continue from the farthest match position and ignore all others.
   602  *   + If there is no match at the current position,
   603  *     then stop and return the current position.
   604  *
   605  * Optimized implementation:
   606  *
   607  * (Same assumption and spanSet as above.)
   608  *
   609  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
   610  * - Loop:
   611  *   + Try to match each set string at the end of the spanLength.
   612  *     ~ Set strings that start with set-contained code points must be matched
   613  *       with a partial overlap because the standard algorithm would have tried
   614  *       to match them earlier.
   615  *     ~ Set strings that entirely consist of set-contained code points
   616  *       must be matched with a full overlap because the longest-match algorithm
   617  *       would hide set string matches that end earlier.
   618  *       Such set strings need not be matched earlier inside the code point span
   619  *       because the standard algorithm would then have continued after
   620  *       the set string match anyway.
   621  *     ~ Remember the longest set string match (farthest end point) from the earliest
   622  *       starting point.
   623  *   + If no set string matched after spanSet.span(), then return
   624  *     with where the spanSet.span() ended.
   625  *   + If at least one set string matched, then continue the loop after the
   626  *     longest match from the earliest position.
   627  *   + If no more set string matched after a previous string match,
   628  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
   629  *     Stop if spanLength==0, otherwise continue the loop.
   630  */
   632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
   633     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   634         return spanNot(s, length);
   635     }
   636     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
   637     if(spanLength==length) {
   638         return length;
   639     }
   641     // Consider strings; they may overlap with the span.
   642     OffsetList offsets;
   643     if(spanCondition==USET_SPAN_CONTAINED) {
   644         // Use offset list to try all possibilities.
   645         offsets.setMaxLength(maxLength16);
   646     }
   647     int32_t pos=spanLength, rest=length-pos;
   648     int32_t i, stringsLength=strings.size();
   649     for(;;) {
   650         if(spanCondition==USET_SPAN_CONTAINED) {
   651             for(i=0; i<stringsLength; ++i) {
   652                 int32_t overlap=spanLengths[i];
   653                 if(overlap==ALL_CP_CONTAINED) {
   654                     continue;  // Irrelevant string.
   655                 }
   656                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   657                 const UChar *s16=string.getBuffer();
   658                 int32_t length16=string.length();
   660                 // Try to match this string at pos-overlap..pos.
   661                 if(overlap>=LONG_SPAN) {
   662                     overlap=length16;
   663                     // While contained: No point matching fully inside the code point span.
   664                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
   665                 }
   666                 if(overlap>spanLength) {
   667                     overlap=spanLength;
   668                 }
   669                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
   670                 for(;;) {
   671                     if(inc>rest) {
   672                         break;
   673                     }
   674                     // Try to match if the increment is not listed already.
   675                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
   676                         if(inc==rest) {
   677                             return length;  // Reached the end of the string.
   678                         }
   679                         offsets.addOffset(inc);
   680                     }
   681                     if(overlap==0) {
   682                         break;
   683                     }
   684                     --overlap;
   685                     ++inc;
   686                 }
   687             }
   688         } else /* USET_SPAN_SIMPLE */ {
   689             int32_t maxInc=0, maxOverlap=0;
   690             for(i=0; i<stringsLength; ++i) {
   691                 int32_t overlap=spanLengths[i];
   692                 // For longest match, we do need to try to match even an all-contained string
   693                 // to find the match from the earliest start.
   695                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   696                 const UChar *s16=string.getBuffer();
   697                 int32_t length16=string.length();
   699                 // Try to match this string at pos-overlap..pos.
   700                 if(overlap>=LONG_SPAN) {
   701                     overlap=length16;
   702                     // Longest match: Need to match fully inside the code point span
   703                     // to find the match from the earliest start.
   704                 }
   705                 if(overlap>spanLength) {
   706                     overlap=spanLength;
   707                 }
   708                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
   709                 for(;;) {
   710                     if(inc>rest || overlap<maxOverlap) {
   711                         break;
   712                     }
   713                     // Try to match if the string is longer or starts earlier.
   714                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
   715                         matches16CPB(s, pos-overlap, length, s16, length16)
   716                     ) {
   717                         maxInc=inc;  // Longest match from earliest start.
   718                         maxOverlap=overlap;
   719                         break;
   720                     }
   721                     --overlap;
   722                     ++inc;
   723                 }
   724             }
   726             if(maxInc!=0 || maxOverlap!=0) {
   727                 // Longest-match algorithm, and there was a string match.
   728                 // Simply continue after it.
   729                 pos+=maxInc;
   730                 rest-=maxInc;
   731                 if(rest==0) {
   732                     return length;  // Reached the end of the string.
   733                 }
   734                 spanLength=0;  // Match strings from after a string match.
   735                 continue;
   736             }
   737         }
   738         // Finished trying to match all strings at pos.
   740         if(spanLength!=0 || pos==0) {
   741             // The position is after an unlimited code point span (spanLength!=0),
   742             // not after a string match.
   743             // The only position where spanLength==0 after a span is pos==0.
   744             // Otherwise, an unlimited code point span is only tried again when no
   745             // strings match, and if such a non-initial span fails we stop.
   746             if(offsets.isEmpty()) {
   747                 return pos;  // No strings matched after a span.
   748             }
   749             // Match strings from after the next string match.
   750         } else {
   751             // The position is after a string match (or a single code point).
   752             if(offsets.isEmpty()) {
   753                 // No more strings matched after a previous string match.
   754                 // Try another code point span from after the last string match.
   755                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
   756                 if( spanLength==rest || // Reached the end of the string, or
   757                     spanLength==0       // neither strings nor span progressed.
   758                 ) {
   759                     return pos+spanLength;
   760                 }
   761                 pos+=spanLength;
   762                 rest-=spanLength;
   763                 continue;  // spanLength>0: Match strings from after a span.
   764             } else {
   765                 // Try to match only one code point from after a string match if some
   766                 // string matched beyond it, so that we try all possible positions
   767                 // and don't overshoot.
   768                 spanLength=spanOne(spanSet, s+pos, rest);
   769                 if(spanLength>0) {
   770                     if(spanLength==rest) {
   771                         return length;  // Reached the end of the string.
   772                     }
   773                     // Match strings after this code point.
   774                     // There cannot be any increments below it because UnicodeSet strings
   775                     // contain multiple code points.
   776                     pos+=spanLength;
   777                     rest-=spanLength;
   778                     offsets.shift(spanLength);
   779                     spanLength=0;
   780                     continue;  // Match strings from after a single code point.
   781                 }
   782                 // Match strings from after the next string match.
   783             }
   784         }
   785         int32_t minOffset=offsets.popMinimum();
   786         pos+=minOffset;
   787         rest-=minOffset;
   788         spanLength=0;  // Match strings from after a string match.
   789     }
   790 }
   792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
   793     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   794         return spanNotBack(s, length);
   795     }
   796     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
   797     if(pos==0) {
   798         return 0;
   799     }
   800     int32_t spanLength=length-pos;
   802     // Consider strings; they may overlap with the span.
   803     OffsetList offsets;
   804     if(spanCondition==USET_SPAN_CONTAINED) {
   805         // Use offset list to try all possibilities.
   806         offsets.setMaxLength(maxLength16);
   807     }
   808     int32_t i, stringsLength=strings.size();
   809     uint8_t *spanBackLengths=spanLengths;
   810     if(all) {
   811         spanBackLengths+=stringsLength;
   812     }
   813     for(;;) {
   814         if(spanCondition==USET_SPAN_CONTAINED) {
   815             for(i=0; i<stringsLength; ++i) {
   816                 int32_t overlap=spanBackLengths[i];
   817                 if(overlap==ALL_CP_CONTAINED) {
   818                     continue;  // Irrelevant string.
   819                 }
   820                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   821                 const UChar *s16=string.getBuffer();
   822                 int32_t length16=string.length();
   824                 // Try to match this string at pos-(length16-overlap)..pos-length16.
   825                 if(overlap>=LONG_SPAN) {
   826                     overlap=length16;
   827                     // While contained: No point matching fully inside the code point span.
   828                     int32_t len1=0;
   829                     U16_FWD_1(s16, len1, overlap);
   830                     overlap-=len1;  // Length of the string minus the first code point.
   831                 }
   832                 if(overlap>spanLength) {
   833                     overlap=spanLength;
   834                 }
   835                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
   836                 for(;;) {
   837                     if(dec>pos) {
   838                         break;
   839                     }
   840                     // Try to match if the decrement is not listed already.
   841                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
   842                         if(dec==pos) {
   843                             return 0;  // Reached the start of the string.
   844                         }
   845                         offsets.addOffset(dec);
   846                     }
   847                     if(overlap==0) {
   848                         break;
   849                     }
   850                     --overlap;
   851                     ++dec;
   852                 }
   853             }
   854         } else /* USET_SPAN_SIMPLE */ {
   855             int32_t maxDec=0, maxOverlap=0;
   856             for(i=0; i<stringsLength; ++i) {
   857                 int32_t overlap=spanBackLengths[i];
   858                 // For longest match, we do need to try to match even an all-contained string
   859                 // to find the match from the latest end.
   861                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
   862                 const UChar *s16=string.getBuffer();
   863                 int32_t length16=string.length();
   865                 // Try to match this string at pos-(length16-overlap)..pos-length16.
   866                 if(overlap>=LONG_SPAN) {
   867                     overlap=length16;
   868                     // Longest match: Need to match fully inside the code point span
   869                     // to find the match from the latest end.
   870                 }
   871                 if(overlap>spanLength) {
   872                     overlap=spanLength;
   873                 }
   874                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
   875                 for(;;) {
   876                     if(dec>pos || overlap<maxOverlap) {
   877                         break;
   878                     }
   879                     // Try to match if the string is longer or ends later.
   880                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
   881                         matches16CPB(s, pos-dec, length, s16, length16)
   882                     ) {
   883                         maxDec=dec;  // Longest match from latest end.
   884                         maxOverlap=overlap;
   885                         break;
   886                     }
   887                     --overlap;
   888                     ++dec;
   889                 }
   890             }
   892             if(maxDec!=0 || maxOverlap!=0) {
   893                 // Longest-match algorithm, and there was a string match.
   894                 // Simply continue before it.
   895                 pos-=maxDec;
   896                 if(pos==0) {
   897                     return 0;  // Reached the start of the string.
   898                 }
   899                 spanLength=0;  // Match strings from before a string match.
   900                 continue;
   901             }
   902         }
   903         // Finished trying to match all strings at pos.
   905         if(spanLength!=0 || pos==length) {
   906             // The position is before an unlimited code point span (spanLength!=0),
   907             // not before a string match.
   908             // The only position where spanLength==0 before a span is pos==length.
   909             // Otherwise, an unlimited code point span is only tried again when no
   910             // strings match, and if such a non-initial span fails we stop.
   911             if(offsets.isEmpty()) {
   912                 return pos;  // No strings matched before a span.
   913             }
   914             // Match strings from before the next string match.
   915         } else {
   916             // The position is before a string match (or a single code point).
   917             if(offsets.isEmpty()) {
   918                 // No more strings matched before a previous string match.
   919                 // Try another code point span from before the last string match.
   920                 int32_t oldPos=pos;
   921                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
   922                 spanLength=oldPos-pos;
   923                 if( pos==0 ||           // Reached the start of the string, or
   924                     spanLength==0       // neither strings nor span progressed.
   925                 ) {
   926                     return pos;
   927                 }
   928                 continue;  // spanLength>0: Match strings from before a span.
   929             } else {
   930                 // Try to match only one code point from before a string match if some
   931                 // string matched beyond it, so that we try all possible positions
   932                 // and don't overshoot.
   933                 spanLength=spanOneBack(spanSet, s, pos);
   934                 if(spanLength>0) {
   935                     if(spanLength==pos) {
   936                         return 0;  // Reached the start of the string.
   937                     }
   938                     // Match strings before this code point.
   939                     // There cannot be any decrements below it because UnicodeSet strings
   940                     // contain multiple code points.
   941                     pos-=spanLength;
   942                     offsets.shift(spanLength);
   943                     spanLength=0;
   944                     continue;  // Match strings from before a single code point.
   945                 }
   946                 // Match strings from before the next string match.
   947             }
   948         }
   949         pos-=offsets.popMinimum();
   950         spanLength=0;  // Match strings from before a string match.
   951     }
   952 }
   954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
   955     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
   956         return spanNotUTF8(s, length);
   957     }
   958     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
   959     if(spanLength==length) {
   960         return length;
   961     }
   963     // Consider strings; they may overlap with the span.
   964     OffsetList offsets;
   965     if(spanCondition==USET_SPAN_CONTAINED) {
   966         // Use offset list to try all possibilities.
   967         offsets.setMaxLength(maxLength8);
   968     }
   969     int32_t pos=spanLength, rest=length-pos;
   970     int32_t i, stringsLength=strings.size();
   971     uint8_t *spanUTF8Lengths=spanLengths;
   972     if(all) {
   973         spanUTF8Lengths+=2*stringsLength;
   974     }
   975     for(;;) {
   976         const uint8_t *s8=utf8;
   977         int32_t length8;
   978         if(spanCondition==USET_SPAN_CONTAINED) {
   979             for(i=0; i<stringsLength; ++i) {
   980                 length8=utf8Lengths[i];
   981                 if(length8==0) {
   982                     continue;  // String not representable in UTF-8.
   983                 }
   984                 int32_t overlap=spanUTF8Lengths[i];
   985                 if(overlap==ALL_CP_CONTAINED) {
   986                     s8+=length8;
   987                     continue;  // Irrelevant string.
   988                 }
   990                 // Try to match this string at pos-overlap..pos.
   991                 if(overlap>=LONG_SPAN) {
   992                     overlap=length8;
   993                     // While contained: No point matching fully inside the code point span.
   994                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
   995                 }
   996                 if(overlap>spanLength) {
   997                     overlap=spanLength;
   998                 }
   999                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
  1000                 for(;;) {
  1001                     if(inc>rest) {
  1002                         break;
  1004                     // Try to match if the increment is not listed already.
  1005                     // Match at code point boundaries. (The UTF-8 strings were converted
  1006                     // from UTF-16 and are guaranteed to be well-formed.)
  1007                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
  1008                         !offsets.containsOffset(inc) &&
  1009                         matches8(s+pos-overlap, s8, length8)
  1011                     ) {
  1012                         if(inc==rest) {
  1013                             return length;  // Reached the end of the string.
  1015                         offsets.addOffset(inc);
  1017                     if(overlap==0) {
  1018                         break;
  1020                     --overlap;
  1021                     ++inc;
  1023                 s8+=length8;
  1025         } else /* USET_SPAN_SIMPLE */ {
  1026             int32_t maxInc=0, maxOverlap=0;
  1027             for(i=0; i<stringsLength; ++i) {
  1028                 length8=utf8Lengths[i];
  1029                 if(length8==0) {
  1030                     continue;  // String not representable in UTF-8.
  1032                 int32_t overlap=spanUTF8Lengths[i];
  1033                 // For longest match, we do need to try to match even an all-contained string
  1034                 // to find the match from the earliest start.
  1036                 // Try to match this string at pos-overlap..pos.
  1037                 if(overlap>=LONG_SPAN) {
  1038                     overlap=length8;
  1039                     // Longest match: Need to match fully inside the code point span
  1040                     // to find the match from the earliest start.
  1042                 if(overlap>spanLength) {
  1043                     overlap=spanLength;
  1045                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
  1046                 for(;;) {
  1047                     if(inc>rest || overlap<maxOverlap) {
  1048                         break;
  1050                     // Try to match if the string is longer or starts earlier.
  1051                     // Match at code point boundaries. (The UTF-8 strings were converted
  1052                     // from UTF-16 and are guaranteed to be well-formed.)
  1053                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
  1054                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
  1055                         matches8(s+pos-overlap, s8, length8)
  1057                     ) {
  1058                         maxInc=inc;  // Longest match from earliest start.
  1059                         maxOverlap=overlap;
  1060                         break;
  1062                     --overlap;
  1063                     ++inc;
  1065                 s8+=length8;
  1068             if(maxInc!=0 || maxOverlap!=0) {
  1069                 // Longest-match algorithm, and there was a string match.
  1070                 // Simply continue after it.
  1071                 pos+=maxInc;
  1072                 rest-=maxInc;
  1073                 if(rest==0) {
  1074                     return length;  // Reached the end of the string.
  1076                 spanLength=0;  // Match strings from after a string match.
  1077                 continue;
  1080         // Finished trying to match all strings at pos.
  1082         if(spanLength!=0 || pos==0) {
  1083             // The position is after an unlimited code point span (spanLength!=0),
  1084             // not after a string match.
  1085             // The only position where spanLength==0 after a span is pos==0.
  1086             // Otherwise, an unlimited code point span is only tried again when no
  1087             // strings match, and if such a non-initial span fails we stop.
  1088             if(offsets.isEmpty()) {
  1089                 return pos;  // No strings matched after a span.
  1091             // Match strings from after the next string match.
  1092         } else {
  1093             // The position is after a string match (or a single code point).
  1094             if(offsets.isEmpty()) {
  1095                 // No more strings matched after a previous string match.
  1096                 // Try another code point span from after the last string match.
  1097                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
  1098                 if( spanLength==rest || // Reached the end of the string, or
  1099                     spanLength==0       // neither strings nor span progressed.
  1100                 ) {
  1101                     return pos+spanLength;
  1103                 pos+=spanLength;
  1104                 rest-=spanLength;
  1105                 continue;  // spanLength>0: Match strings from after a span.
  1106             } else {
  1107                 // Try to match only one code point from after a string match if some
  1108                 // string matched beyond it, so that we try all possible positions
  1109                 // and don't overshoot.
  1110                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
  1111                 if(spanLength>0) {
  1112                     if(spanLength==rest) {
  1113                         return length;  // Reached the end of the string.
  1115                     // Match strings after this code point.
  1116                     // There cannot be any increments below it because UnicodeSet strings
  1117                     // contain multiple code points.
  1118                     pos+=spanLength;
  1119                     rest-=spanLength;
  1120                     offsets.shift(spanLength);
  1121                     spanLength=0;
  1122                     continue;  // Match strings from after a single code point.
  1124                 // Match strings from after the next string match.
  1127         int32_t minOffset=offsets.popMinimum();
  1128         pos+=minOffset;
  1129         rest-=minOffset;
  1130         spanLength=0;  // Match strings from after a string match.
  1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
  1135     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
  1136         return spanNotBackUTF8(s, length);
  1138     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
  1139     if(pos==0) {
  1140         return 0;
  1142     int32_t spanLength=length-pos;
  1144     // Consider strings; they may overlap with the span.
  1145     OffsetList offsets;
  1146     if(spanCondition==USET_SPAN_CONTAINED) {
  1147         // Use offset list to try all possibilities.
  1148         offsets.setMaxLength(maxLength8);
  1150     int32_t i, stringsLength=strings.size();
  1151     uint8_t *spanBackUTF8Lengths=spanLengths;
  1152     if(all) {
  1153         spanBackUTF8Lengths+=3*stringsLength;
  1155     for(;;) {
  1156         const uint8_t *s8=utf8;
  1157         int32_t length8;
  1158         if(spanCondition==USET_SPAN_CONTAINED) {
  1159             for(i=0; i<stringsLength; ++i) {
  1160                 length8=utf8Lengths[i];
  1161                 if(length8==0) {
  1162                     continue;  // String not representable in UTF-8.
  1164                 int32_t overlap=spanBackUTF8Lengths[i];
  1165                 if(overlap==ALL_CP_CONTAINED) {
  1166                     s8+=length8;
  1167                     continue;  // Irrelevant string.
  1170                 // Try to match this string at pos-(length8-overlap)..pos-length8.
  1171                 if(overlap>=LONG_SPAN) {
  1172                     overlap=length8;
  1173                     // While contained: No point matching fully inside the code point span.
  1174                     int32_t len1=0;
  1175                     U8_FWD_1(s8, len1, overlap);
  1176                     overlap-=len1;  // Length of the string minus the first code point.
  1178                 if(overlap>spanLength) {
  1179                     overlap=spanLength;
  1181                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
  1182                 for(;;) {
  1183                     if(dec>pos) {
  1184                         break;
  1186                     // Try to match if the decrement is not listed already.
  1187                     // Match at code point boundaries. (The UTF-8 strings were converted
  1188                     // from UTF-16 and are guaranteed to be well-formed.)
  1189                     if( !U8_IS_TRAIL(s[pos-dec]) &&
  1190                         !offsets.containsOffset(dec) &&
  1191                         matches8(s+pos-dec, s8, length8)
  1192                     ) {
  1193                         if(dec==pos) {
  1194                             return 0;  // Reached the start of the string.
  1196                         offsets.addOffset(dec);
  1198                     if(overlap==0) {
  1199                         break;
  1201                     --overlap;
  1202                     ++dec;
  1204                 s8+=length8;
  1206         } else /* USET_SPAN_SIMPLE */ {
  1207             int32_t maxDec=0, maxOverlap=0;
  1208             for(i=0; i<stringsLength; ++i) {
  1209                 length8=utf8Lengths[i];
  1210                 if(length8==0) {
  1211                     continue;  // String not representable in UTF-8.
  1213                 int32_t overlap=spanBackUTF8Lengths[i];
  1214                 // For longest match, we do need to try to match even an all-contained string
  1215                 // to find the match from the latest end.
  1217                 // Try to match this string at pos-(length8-overlap)..pos-length8.
  1218                 if(overlap>=LONG_SPAN) {
  1219                     overlap=length8;
  1220                     // Longest match: Need to match fully inside the code point span
  1221                     // to find the match from the latest end.
  1223                 if(overlap>spanLength) {
  1224                     overlap=spanLength;
  1226                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
  1227                 for(;;) {
  1228                     if(dec>pos || overlap<maxOverlap) {
  1229                         break;
  1231                     // Try to match if the string is longer or ends later.
  1232                     // Match at code point boundaries. (The UTF-8 strings were converted
  1233                     // from UTF-16 and are guaranteed to be well-formed.)
  1234                     if( !U8_IS_TRAIL(s[pos-dec]) &&
  1235                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
  1236                         matches8(s+pos-dec, s8, length8)
  1237                     ) {
  1238                         maxDec=dec;  // Longest match from latest end.
  1239                         maxOverlap=overlap;
  1240                         break;
  1242                     --overlap;
  1243                     ++dec;
  1245                 s8+=length8;
  1248             if(maxDec!=0 || maxOverlap!=0) {
  1249                 // Longest-match algorithm, and there was a string match.
  1250                 // Simply continue before it.
  1251                 pos-=maxDec;
  1252                 if(pos==0) {
  1253                     return 0;  // Reached the start of the string.
  1255                 spanLength=0;  // Match strings from before a string match.
  1256                 continue;
  1259         // Finished trying to match all strings at pos.
  1261         if(spanLength!=0 || pos==length) {
  1262             // The position is before an unlimited code point span (spanLength!=0),
  1263             // not before a string match.
  1264             // The only position where spanLength==0 before a span is pos==length.
  1265             // Otherwise, an unlimited code point span is only tried again when no
  1266             // strings match, and if such a non-initial span fails we stop.
  1267             if(offsets.isEmpty()) {
  1268                 return pos;  // No strings matched before a span.
  1270             // Match strings from before the next string match.
  1271         } else {
  1272             // The position is before a string match (or a single code point).
  1273             if(offsets.isEmpty()) {
  1274                 // No more strings matched before a previous string match.
  1275                 // Try another code point span from before the last string match.
  1276                 int32_t oldPos=pos;
  1277                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
  1278                 spanLength=oldPos-pos;
  1279                 if( pos==0 ||           // Reached the start of the string, or
  1280                     spanLength==0       // neither strings nor span progressed.
  1281                 ) {
  1282                     return pos;
  1284                 continue;  // spanLength>0: Match strings from before a span.
  1285             } else {
  1286                 // Try to match only one code point from before a string match if some
  1287                 // string matched beyond it, so that we try all possible positions
  1288                 // and don't overshoot.
  1289                 spanLength=spanOneBackUTF8(spanSet, s, pos);
  1290                 if(spanLength>0) {
  1291                     if(spanLength==pos) {
  1292                         return 0;  // Reached the start of the string.
  1294                     // Match strings before this code point.
  1295                     // There cannot be any decrements below it because UnicodeSet strings
  1296                     // contain multiple code points.
  1297                     pos-=spanLength;
  1298                     offsets.shift(spanLength);
  1299                     spanLength=0;
  1300                     continue;  // Match strings from before a single code point.
  1302                 // Match strings from before the next string match.
  1305         pos-=offsets.popMinimum();
  1306         spanLength=0;  // Match strings from before a string match.
  1310 /*
  1311  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
  1313  * Theoretical algorithm:
  1314  * - Iterate through the string, and at each code point boundary:
  1315  *   + If the code point there is in the set, then return with the current position.
  1316  *   + If a set string matches at the current position, then return with the current position.
  1318  * Optimized implementation:
  1320  * (Same assumption as for span() above.)
  1322  * Create and cache a spanNotSet which contains all of the single code points
  1323  * of the original set but none of its strings.
  1324  * For each set string add its initial code point to the spanNotSet.
  1325  * (Also add its final code point for spanNotBack().)
  1327  * - Loop:
  1328  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
  1329  *   + If the current code point is in the original set, then
  1330  *     return the current position.
  1331  *   + If any set string matches at the current position, then
  1332  *     return the current position.
  1333  *   + If there is no match at the current position, neither for the code point there
  1334  *     nor for any set string, then skip this code point and continue the loop.
  1335  *     This happens for set-string-initial code points that were added to spanNotSet
  1336  *     when there is not actually a match for such a set string.
  1337  */
  1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
  1340     int32_t pos=0, rest=length;
  1341     int32_t i, stringsLength=strings.size();
  1342     do {
  1343         // Span until we find a code point from the set,
  1344         // or a code point that starts or ends some string.
  1345         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
  1346         if(i==rest) {
  1347             return length;  // Reached the end of the string.
  1349         pos+=i;
  1350         rest-=i;
  1352         // Check whether the current code point is in the original set,
  1353         // without the string starts and ends.
  1354         int32_t cpLength=spanOne(spanSet, s+pos, rest);
  1355         if(cpLength>0) {
  1356             return pos;  // There is a set element at pos.
  1359         // Try to match the strings at pos.
  1360         for(i=0; i<stringsLength; ++i) {
  1361             if(spanLengths[i]==ALL_CP_CONTAINED) {
  1362                 continue;  // Irrelevant string.
  1364             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
  1365             const UChar *s16=string.getBuffer();
  1366             int32_t length16=string.length();
  1367             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
  1368                 return pos;  // There is a set element at pos.
  1372         // The span(while not contained) ended on a string start/end which is
  1373         // not in the original set. Skip this code point and continue.
  1374         // cpLength<0
  1375         pos-=cpLength;
  1376         rest+=cpLength;
  1377     } while(rest!=0);
  1378     return length;  // Reached the end of the string.
  1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
  1382     int32_t pos=length;
  1383     int32_t i, stringsLength=strings.size();
  1384     do {
  1385         // Span until we find a code point from the set,
  1386         // or a code point that starts or ends some string.
  1387         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
  1388         if(pos==0) {
  1389             return 0;  // Reached the start of the string.
  1392         // Check whether the current code point is in the original set,
  1393         // without the string starts and ends.
  1394         int32_t cpLength=spanOneBack(spanSet, s, pos);
  1395         if(cpLength>0) {
  1396             return pos;  // There is a set element at pos.
  1399         // Try to match the strings at pos.
  1400         for(i=0; i<stringsLength; ++i) {
  1401             // Use spanLengths rather than a spanBackLengths pointer because
  1402             // it is easier and we only need to know whether the string is irrelevant
  1403             // which is the same in either array.
  1404             if(spanLengths[i]==ALL_CP_CONTAINED) {
  1405                 continue;  // Irrelevant string.
  1407             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
  1408             const UChar *s16=string.getBuffer();
  1409             int32_t length16=string.length();
  1410             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
  1411                 return pos;  // There is a set element at pos.
  1415         // The span(while not contained) ended on a string start/end which is
  1416         // not in the original set. Skip this code point and continue.
  1417         // cpLength<0
  1418         pos+=cpLength;
  1419     } while(pos!=0);
  1420     return 0;  // Reached the start of the string.
  1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
  1424     int32_t pos=0, rest=length;
  1425     int32_t i, stringsLength=strings.size();
  1426     uint8_t *spanUTF8Lengths=spanLengths;
  1427     if(all) {
  1428         spanUTF8Lengths+=2*stringsLength;
  1430     do {
  1431         // Span until we find a code point from the set,
  1432         // or a code point that starts or ends some string.
  1433         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
  1434         if(i==rest) {
  1435             return length;  // Reached the end of the string.
  1437         pos+=i;
  1438         rest-=i;
  1440         // Check whether the current code point is in the original set,
  1441         // without the string starts and ends.
  1442         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
  1443         if(cpLength>0) {
  1444             return pos;  // There is a set element at pos.
  1447         // Try to match the strings at pos.
  1448         const uint8_t *s8=utf8;
  1449         int32_t length8;
  1450         for(i=0; i<stringsLength; ++i) {
  1451             length8=utf8Lengths[i];
  1452             // ALL_CP_CONTAINED: Irrelevant string.
  1453             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
  1454                 return pos;  // There is a set element at pos.
  1456             s8+=length8;
  1459         // The span(while not contained) ended on a string start/end which is
  1460         // not in the original set. Skip this code point and continue.
  1461         // cpLength<0
  1462         pos-=cpLength;
  1463         rest+=cpLength;
  1464     } while(rest!=0);
  1465     return length;  // Reached the end of the string.
  1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
  1469     int32_t pos=length;
  1470     int32_t i, stringsLength=strings.size();
  1471     uint8_t *spanBackUTF8Lengths=spanLengths;
  1472     if(all) {
  1473         spanBackUTF8Lengths+=3*stringsLength;
  1475     do {
  1476         // Span until we find a code point from the set,
  1477         // or a code point that starts or ends some string.
  1478         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
  1479         if(pos==0) {
  1480             return 0;  // Reached the start of the string.
  1483         // Check whether the current code point is in the original set,
  1484         // without the string starts and ends.
  1485         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
  1486         if(cpLength>0) {
  1487             return pos;  // There is a set element at pos.
  1490         // Try to match the strings at pos.
  1491         const uint8_t *s8=utf8;
  1492         int32_t length8;
  1493         for(i=0; i<stringsLength; ++i) {
  1494             length8=utf8Lengths[i];
  1495             // ALL_CP_CONTAINED: Irrelevant string.
  1496             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
  1497                 return pos;  // There is a set element at pos.
  1499             s8+=length8;
  1502         // The span(while not contained) ended on a string start/end which is
  1503         // not in the original set. Skip this code point and continue.
  1504         // cpLength<0
  1505         pos+=cpLength;
  1506     } while(pos!=0);
  1507     return 0;  // Reached the start of the string.
  1510 U_NAMESPACE_END

mercurial