1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unisetspan.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1510 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2007-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* file name: unisetspan.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2007mar01 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#include "unicode/uniset.h" 1.22 +#include "unicode/ustring.h" 1.23 +#include "unicode/utf8.h" 1.24 +#include "unicode/utf16.h" 1.25 +#include "cmemory.h" 1.26 +#include "uvector.h" 1.27 +#include "unisetspan.h" 1.28 + 1.29 +U_NAMESPACE_BEGIN 1.30 + 1.31 +/* 1.32 + * List of offsets from the current position from where to try matching 1.33 + * a code point or a string. 1.34 + * Store offsets rather than indexes to simplify the code and use the same list 1.35 + * for both increments (in span()) and decrements (in spanBack()). 1.36 + * 1.37 + * Assumption: The maximum offset is limited, and the offsets that are stored 1.38 + * at any one time are relatively dense, that is, there are normally no gaps of 1.39 + * hundreds or thousands of offset values. 1.40 + * 1.41 + * The implementation uses a circular buffer of byte flags, 1.42 + * each indicating whether the corresponding offset is in the list. 1.43 + * This avoids inserting into a sorted list of offsets (or absolute indexes) and 1.44 + * physically moving part of the list. 1.45 + * 1.46 + * Note: In principle, the caller should setMaxLength() to the maximum of the 1.47 + * max string length and U16_LENGTH/U8_LENGTH to account for 1.48 + * "long" single code points. 1.49 + * However, this implementation uses at least a staticList with more than 1.50 + * U8_LENGTH entries anyway. 1.51 + * 1.52 + * Note: If maxLength were guaranteed to be no more than 32 or 64, 1.53 + * the list could be stored as bit flags in a single integer. 1.54 + * Rather than handling a circular buffer with a start list index, 1.55 + * the integer would simply be shifted when lower offsets are removed. 1.56 + * UnicodeSet does not have a limit on the lengths of strings. 1.57 + */ 1.58 +class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. 1.59 +public: 1.60 + OffsetList() : list(staticList), capacity(0), length(0), start(0) {} 1.61 + 1.62 + ~OffsetList() { 1.63 + if(list!=staticList) { 1.64 + uprv_free(list); 1.65 + } 1.66 + } 1.67 + 1.68 + // Call exactly once if the list is to be used. 1.69 + void setMaxLength(int32_t maxLength) { 1.70 + if(maxLength<=(int32_t)sizeof(staticList)) { 1.71 + capacity=(int32_t)sizeof(staticList); 1.72 + } else { 1.73 + UBool *l=(UBool *)uprv_malloc(maxLength); 1.74 + if(l!=NULL) { 1.75 + list=l; 1.76 + capacity=maxLength; 1.77 + } 1.78 + } 1.79 + uprv_memset(list, 0, capacity); 1.80 + } 1.81 + 1.82 + void clear() { 1.83 + uprv_memset(list, 0, capacity); 1.84 + start=length=0; 1.85 + } 1.86 + 1.87 + UBool isEmpty() const { 1.88 + return (UBool)(length==0); 1.89 + } 1.90 + 1.91 + // Reduce all stored offsets by delta, used when the current position 1.92 + // moves by delta. 1.93 + // There must not be any offsets lower than delta. 1.94 + // If there is an offset equal to delta, it is removed. 1.95 + // delta=[1..maxLength] 1.96 + void shift(int32_t delta) { 1.97 + int32_t i=start+delta; 1.98 + if(i>=capacity) { 1.99 + i-=capacity; 1.100 + } 1.101 + if(list[i]) { 1.102 + list[i]=FALSE; 1.103 + --length; 1.104 + } 1.105 + start=i; 1.106 + } 1.107 + 1.108 + // Add an offset. The list must not contain it yet. 1.109 + // offset=[1..maxLength] 1.110 + void addOffset(int32_t offset) { 1.111 + int32_t i=start+offset; 1.112 + if(i>=capacity) { 1.113 + i-=capacity; 1.114 + } 1.115 + list[i]=TRUE; 1.116 + ++length; 1.117 + } 1.118 + 1.119 + // offset=[1..maxLength] 1.120 + UBool containsOffset(int32_t offset) const { 1.121 + int32_t i=start+offset; 1.122 + if(i>=capacity) { 1.123 + i-=capacity; 1.124 + } 1.125 + return list[i]; 1.126 + } 1.127 + 1.128 + // Find the lowest stored offset from a non-empty list, remove it, 1.129 + // and reduce all other offsets by this minimum. 1.130 + // Returns [1..maxLength]. 1.131 + int32_t popMinimum() { 1.132 + // Look for the next offset in list[start+1..capacity-1]. 1.133 + int32_t i=start, result; 1.134 + while(++i<capacity) { 1.135 + if(list[i]) { 1.136 + list[i]=FALSE; 1.137 + --length; 1.138 + result=i-start; 1.139 + start=i; 1.140 + return result; 1.141 + } 1.142 + } 1.143 + // i==capacity 1.144 + 1.145 + // Wrap around and look for the next offset in list[0..start]. 1.146 + // Since the list is not empty, there will be one. 1.147 + result=capacity-start; 1.148 + i=0; 1.149 + while(!list[i]) { 1.150 + ++i; 1.151 + } 1.152 + list[i]=FALSE; 1.153 + --length; 1.154 + start=i; 1.155 + return result+=i; 1.156 + } 1.157 + 1.158 +private: 1.159 + UBool *list; 1.160 + int32_t capacity; 1.161 + int32_t length; 1.162 + int32_t start; 1.163 + 1.164 + UBool staticList[16]; 1.165 +}; 1.166 + 1.167 +// Get the number of UTF-8 bytes for a UTF-16 (sub)string. 1.168 +static int32_t 1.169 +getUTF8Length(const UChar *s, int32_t length) { 1.170 + UErrorCode errorCode=U_ZERO_ERROR; 1.171 + int32_t length8=0; 1.172 + u_strToUTF8(NULL, 0, &length8, s, length, &errorCode); 1.173 + if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) { 1.174 + return length8; 1.175 + } else { 1.176 + // The string contains an unpaired surrogate. 1.177 + // Ignore this string. 1.178 + return 0; 1.179 + } 1.180 +} 1.181 + 1.182 +// Append the UTF-8 version of the string to t and return the appended UTF-8 length. 1.183 +static int32_t 1.184 +appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) { 1.185 + UErrorCode errorCode=U_ZERO_ERROR; 1.186 + int32_t length8=0; 1.187 + u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode); 1.188 + if(U_SUCCESS(errorCode)) { 1.189 + return length8; 1.190 + } else { 1.191 + // The string contains an unpaired surrogate. 1.192 + // Ignore this string. 1.193 + return 0; 1.194 + } 1.195 +} 1.196 + 1.197 +static inline uint8_t 1.198 +makeSpanLengthByte(int32_t spanLength) { 1.199 + // 0xfe==UnicodeSetStringSpan::LONG_SPAN 1.200 + return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe; 1.201 +} 1.202 + 1.203 +// Construct for all variants of span(), or only for any one variant. 1.204 +// Initialize as little as possible, for single use. 1.205 +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set, 1.206 + const UVector &setStrings, 1.207 + uint32_t which) 1.208 + : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings), 1.209 + utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 1.210 + utf8Length(0), 1.211 + maxLength16(0), maxLength8(0), 1.212 + all((UBool)(which==ALL)) { 1.213 + spanSet.retainAll(set); 1.214 + if(which&NOT_CONTAINED) { 1.215 + // Default to the same sets. 1.216 + // addToSpanNotSet() will create a separate set if necessary. 1.217 + pSpanNotSet=&spanSet; 1.218 + } 1.219 + 1.220 + // Determine if the strings even need to be taken into account at all for span() etc. 1.221 + // If any string is relevant, then all strings need to be used for 1.222 + // span(longest match) but only the relevant ones for span(while contained). 1.223 + // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 1.224 + // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 1.225 + // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 1.226 + // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 1.227 + int32_t stringsLength=strings.size(); 1.228 + 1.229 + int32_t i, spanLength; 1.230 + UBool someRelevant=FALSE; 1.231 + for(i=0; i<stringsLength; ++i) { 1.232 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.233 + const UChar *s16=string.getBuffer(); 1.234 + int32_t length16=string.length(); 1.235 + UBool thisRelevant; 1.236 + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 1.237 + if(spanLength<length16) { // Relevant string. 1.238 + someRelevant=thisRelevant=TRUE; 1.239 + } else { 1.240 + thisRelevant=FALSE; 1.241 + } 1.242 + if((which&UTF16) && length16>maxLength16) { 1.243 + maxLength16=length16; 1.244 + } 1.245 + if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { 1.246 + int32_t length8=getUTF8Length(s16, length16); 1.247 + utf8Length+=length8; 1.248 + if(length8>maxLength8) { 1.249 + maxLength8=length8; 1.250 + } 1.251 + } 1.252 + } 1.253 + if(!someRelevant) { 1.254 + maxLength16=maxLength8=0; 1.255 + return; 1.256 + } 1.257 + 1.258 + // Freeze after checking for the need to use strings at all because freezing 1.259 + // a set takes some time and memory which are wasted if there are no relevant strings. 1.260 + if(all) { 1.261 + spanSet.freeze(); 1.262 + } 1.263 + 1.264 + uint8_t *spanBackLengths; 1.265 + uint8_t *spanUTF8Lengths; 1.266 + uint8_t *spanBackUTF8Lengths; 1.267 + 1.268 + // Allocate a block of meta data. 1.269 + int32_t allocSize; 1.270 + if(all) { 1.271 + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 1.272 + allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 1.273 + } else { 1.274 + allocSize=stringsLength; // One set of span lengths. 1.275 + if(which&UTF8) { 1.276 + // UTF-8 lengths and UTF-8 strings. 1.277 + allocSize+=stringsLength*4+utf8Length; 1.278 + } 1.279 + } 1.280 + if(allocSize<=(int32_t)sizeof(staticLengths)) { 1.281 + utf8Lengths=staticLengths; 1.282 + } else { 1.283 + utf8Lengths=(int32_t *)uprv_malloc(allocSize); 1.284 + if(utf8Lengths==NULL) { 1.285 + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 1.286 + return; // Out of memory. 1.287 + } 1.288 + } 1.289 + 1.290 + if(all) { 1.291 + // Store span lengths for all span() variants. 1.292 + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 1.293 + spanBackLengths=spanLengths+stringsLength; 1.294 + spanUTF8Lengths=spanBackLengths+stringsLength; 1.295 + spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; 1.296 + utf8=spanBackUTF8Lengths+stringsLength; 1.297 + } else { 1.298 + // Store span lengths for only one span() variant. 1.299 + if(which&UTF8) { 1.300 + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 1.301 + utf8=spanLengths+stringsLength; 1.302 + } else { 1.303 + spanLengths=(uint8_t *)utf8Lengths; 1.304 + } 1.305 + spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; 1.306 + } 1.307 + 1.308 + // Set the meta data and pSpanNotSet and write the UTF-8 strings. 1.309 + int32_t utf8Count=0; // Count UTF-8 bytes written so far. 1.310 + 1.311 + for(i=0; i<stringsLength; ++i) { 1.312 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.313 + const UChar *s16=string.getBuffer(); 1.314 + int32_t length16=string.length(); 1.315 + spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED); 1.316 + if(spanLength<length16) { // Relevant string. 1.317 + if(which&UTF16) { 1.318 + if(which&CONTAINED) { 1.319 + if(which&FWD) { 1.320 + spanLengths[i]=makeSpanLengthByte(spanLength); 1.321 + } 1.322 + if(which&BACK) { 1.323 + spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED); 1.324 + spanBackLengths[i]=makeSpanLengthByte(spanLength); 1.325 + } 1.326 + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 1.327 + spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag. 1.328 + } 1.329 + } 1.330 + if(which&UTF8) { 1.331 + uint8_t *s8=utf8+utf8Count; 1.332 + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 1.333 + utf8Count+=utf8Lengths[i]=length8; 1.334 + if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8. 1.335 + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED; 1.336 + } else { // Relevant for UTF-8. 1.337 + if(which&CONTAINED) { 1.338 + if(which&FWD) { 1.339 + spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 1.340 + spanUTF8Lengths[i]=makeSpanLengthByte(spanLength); 1.341 + } 1.342 + if(which&BACK) { 1.343 + spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED); 1.344 + spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength); 1.345 + } 1.346 + } else /* not CONTAINED, not all, but NOT_CONTAINED */ { 1.347 + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag. 1.348 + } 1.349 + } 1.350 + } 1.351 + if(which&NOT_CONTAINED) { 1.352 + // Add string start and end code points to the spanNotSet so that 1.353 + // a span(while not contained) stops before any string. 1.354 + UChar32 c; 1.355 + if(which&FWD) { 1.356 + int32_t len=0; 1.357 + U16_NEXT(s16, len, length16, c); 1.358 + addToSpanNotSet(c); 1.359 + } 1.360 + if(which&BACK) { 1.361 + int32_t len=length16; 1.362 + U16_PREV(s16, 0, len, c); 1.363 + addToSpanNotSet(c); 1.364 + } 1.365 + } 1.366 + } else { // Irrelevant string. 1.367 + if(which&UTF8) { 1.368 + if(which&CONTAINED) { // Only necessary for LONGEST_MATCH. 1.369 + uint8_t *s8=utf8+utf8Count; 1.370 + int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count); 1.371 + utf8Count+=utf8Lengths[i]=length8; 1.372 + } else { 1.373 + utf8Lengths[i]=0; 1.374 + } 1.375 + } 1.376 + if(all) { 1.377 + spanLengths[i]=spanBackLengths[i]= 1.378 + spanUTF8Lengths[i]=spanBackUTF8Lengths[i]= 1.379 + (uint8_t)ALL_CP_CONTAINED; 1.380 + } else { 1.381 + // All spanXYZLengths pointers contain the same address. 1.382 + spanLengths[i]=(uint8_t)ALL_CP_CONTAINED; 1.383 + } 1.384 + } 1.385 + } 1.386 + 1.387 + // Finish. 1.388 + if(all) { 1.389 + pSpanNotSet->freeze(); 1.390 + } 1.391 +} 1.392 + 1.393 +// Copy constructor. Assumes which==ALL for a frozen set. 1.394 +UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, 1.395 + const UVector &newParentSetStrings) 1.396 + : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), 1.397 + utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), 1.398 + utf8Length(otherStringSpan.utf8Length), 1.399 + maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), 1.400 + all(TRUE) { 1.401 + if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { 1.402 + pSpanNotSet=&spanSet; 1.403 + } else { 1.404 + pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); 1.405 + } 1.406 + 1.407 + // Allocate a block of meta data. 1.408 + // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. 1.409 + int32_t stringsLength=strings.size(); 1.410 + int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; 1.411 + if(allocSize<=(int32_t)sizeof(staticLengths)) { 1.412 + utf8Lengths=staticLengths; 1.413 + } else { 1.414 + utf8Lengths=(int32_t *)uprv_malloc(allocSize); 1.415 + if(utf8Lengths==NULL) { 1.416 + maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. 1.417 + return; // Out of memory. 1.418 + } 1.419 + } 1.420 + 1.421 + spanLengths=(uint8_t *)(utf8Lengths+stringsLength); 1.422 + utf8=spanLengths+stringsLength*4; 1.423 + uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); 1.424 +} 1.425 + 1.426 +UnicodeSetStringSpan::~UnicodeSetStringSpan() { 1.427 + if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { 1.428 + delete pSpanNotSet; 1.429 + } 1.430 + if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { 1.431 + uprv_free(utf8Lengths); 1.432 + } 1.433 +} 1.434 + 1.435 +void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { 1.436 + if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { 1.437 + if(spanSet.contains(c)) { 1.438 + return; // Nothing to do. 1.439 + } 1.440 + UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); 1.441 + if(newSet==NULL) { 1.442 + return; // Out of memory. 1.443 + } else { 1.444 + pSpanNotSet=newSet; 1.445 + } 1.446 + } 1.447 + pSpanNotSet->add(c); 1.448 +} 1.449 + 1.450 +// Compare strings without any argument checks. Requires length>0. 1.451 +static inline UBool 1.452 +matches16(const UChar *s, const UChar *t, int32_t length) { 1.453 + do { 1.454 + if(*s++!=*t++) { 1.455 + return FALSE; 1.456 + } 1.457 + } while(--length>0); 1.458 + return TRUE; 1.459 +} 1.460 + 1.461 +static inline UBool 1.462 +matches8(const uint8_t *s, const uint8_t *t, int32_t length) { 1.463 + do { 1.464 + if(*s++!=*t++) { 1.465 + return FALSE; 1.466 + } 1.467 + } while(--length>0); 1.468 + return TRUE; 1.469 +} 1.470 + 1.471 +// Compare 16-bit Unicode strings (which may be malformed UTF-16) 1.472 +// at code point boundaries. 1.473 +// That is, each edge of a match must not be in the middle of a surrogate pair. 1.474 +static inline UBool 1.475 +matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { 1.476 + s+=start; 1.477 + limit-=start; 1.478 + return matches16(s, t, length) && 1.479 + !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) && 1.480 + !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length])); 1.481 +} 1.482 + 1.483 +// Does the set contain the next code point? 1.484 +// If so, return its length; otherwise return its negative length. 1.485 +static inline int32_t 1.486 +spanOne(const UnicodeSet &set, const UChar *s, int32_t length) { 1.487 + UChar c=*s, c2; 1.488 + if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { 1.489 + return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; 1.490 + } 1.491 + return set.contains(c) ? 1 : -1; 1.492 +} 1.493 + 1.494 +static inline int32_t 1.495 +spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { 1.496 + UChar c=s[length-1], c2; 1.497 + if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { 1.498 + return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; 1.499 + } 1.500 + return set.contains(c) ? 1 : -1; 1.501 +} 1.502 + 1.503 +static inline int32_t 1.504 +spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 1.505 + UChar32 c=*s; 1.506 + if((int8_t)c>=0) { 1.507 + return set.contains(c) ? 1 : -1; 1.508 + } 1.509 + // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). 1.510 + int32_t i=0; 1.511 + U8_NEXT_OR_FFFD(s, i, length, c); 1.512 + return set.contains(c) ? i : -i; 1.513 +} 1.514 + 1.515 +static inline int32_t 1.516 +spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { 1.517 + UChar32 c=s[length-1]; 1.518 + if((int8_t)c>=0) { 1.519 + return set.contains(c) ? 1 : -1; 1.520 + } 1.521 + int32_t i=length-1; 1.522 + c=utf8_prevCharSafeBody(s, 0, &i, c, -3); 1.523 + length-=i; 1.524 + return set.contains(c) ? length : -length; 1.525 +} 1.526 + 1.527 +/* 1.528 + * Note: In span() when spanLength==0 (after a string match, or at the beginning 1.529 + * after an empty code point span) and in spanNot() and spanNotUTF8(), 1.530 + * string matching could use a binary search 1.531 + * because all string matches are done from the same start index. 1.532 + * 1.533 + * For UTF-8, this would require a comparison function that returns UTF-16 order. 1.534 + * 1.535 + * This optimization should not be necessary for normal UnicodeSets because 1.536 + * most sets have no strings, and most sets with strings have 1.537 + * very few very short strings. 1.538 + * For cases with many strings, it might be better to use a different API 1.539 + * and implementation with a DFA (state machine). 1.540 + */ 1.541 + 1.542 +/* 1.543 + * Algorithm for span(USET_SPAN_CONTAINED) 1.544 + * 1.545 + * Theoretical algorithm: 1.546 + * - Iterate through the string, and at each code point boundary: 1.547 + * + If the code point there is in the set, then remember to continue after it. 1.548 + * + If a set string matches at the current position, then remember to continue after it. 1.549 + * + Either recursively span for each code point or string match, 1.550 + * or recursively span for all but the shortest one and 1.551 + * iteratively continue the span with the shortest local match. 1.552 + * + Remember the longest recursive span (the farthest end point). 1.553 + * + If there is no match at the current position, neither for the code point there 1.554 + * nor for any set string, then stop and return the longest recursive span length. 1.555 + * 1.556 + * Optimized implementation: 1.557 + * 1.558 + * (We assume that most sets will have very few very short strings. 1.559 + * A span using a string-less set is extremely fast.) 1.560 + * 1.561 + * Create and cache a spanSet which contains all of the single code points 1.562 + * of the original set but none of its strings. 1.563 + * 1.564 + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 1.565 + * - Loop: 1.566 + * + Try to match each set string at the end of the spanLength. 1.567 + * ~ Set strings that start with set-contained code points must be matched 1.568 + * with a partial overlap because the recursive algorithm would have tried 1.569 + * to match them at every position. 1.570 + * ~ Set strings that entirely consist of set-contained code points 1.571 + * are irrelevant for span(USET_SPAN_CONTAINED) because the 1.572 + * recursive algorithm would continue after them anyway 1.573 + * and find the longest recursive match from their end. 1.574 + * ~ Rather than recursing, note each end point of a set string match. 1.575 + * + If no set string matched after spanSet.span(), then return 1.576 + * with where the spanSet.span() ended. 1.577 + * + If at least one set string matched after spanSet.span(), then 1.578 + * pop the shortest string match end point and continue 1.579 + * the loop, trying to match all set strings from there. 1.580 + * + If at least one more set string matched after a previous string match, 1.581 + * then test if the code point after the previous string match is also 1.582 + * contained in the set. 1.583 + * Continue the loop with the shortest end point of either this code point 1.584 + * or a matching set string. 1.585 + * + If no more set string matched after a previous string match, 1.586 + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 1.587 + * Stop if spanLength==0, otherwise continue the loop. 1.588 + * 1.589 + * By noting each end point of a set string match, 1.590 + * the function visits each string position at most once and finishes 1.591 + * in linear time. 1.592 + * 1.593 + * The recursive algorithm may visit the same string position many times 1.594 + * if multiple paths lead to it and finishes in exponential time. 1.595 + */ 1.596 + 1.597 +/* 1.598 + * Algorithm for span(USET_SPAN_SIMPLE) 1.599 + * 1.600 + * Theoretical algorithm: 1.601 + * - Iterate through the string, and at each code point boundary: 1.602 + * + If the code point there is in the set, then remember to continue after it. 1.603 + * + If a set string matches at the current position, then remember to continue after it. 1.604 + * + Continue from the farthest match position and ignore all others. 1.605 + * + If there is no match at the current position, 1.606 + * then stop and return the current position. 1.607 + * 1.608 + * Optimized implementation: 1.609 + * 1.610 + * (Same assumption and spanSet as above.) 1.611 + * 1.612 + * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). 1.613 + * - Loop: 1.614 + * + Try to match each set string at the end of the spanLength. 1.615 + * ~ Set strings that start with set-contained code points must be matched 1.616 + * with a partial overlap because the standard algorithm would have tried 1.617 + * to match them earlier. 1.618 + * ~ Set strings that entirely consist of set-contained code points 1.619 + * must be matched with a full overlap because the longest-match algorithm 1.620 + * would hide set string matches that end earlier. 1.621 + * Such set strings need not be matched earlier inside the code point span 1.622 + * because the standard algorithm would then have continued after 1.623 + * the set string match anyway. 1.624 + * ~ Remember the longest set string match (farthest end point) from the earliest 1.625 + * starting point. 1.626 + * + If no set string matched after spanSet.span(), then return 1.627 + * with where the spanSet.span() ended. 1.628 + * + If at least one set string matched, then continue the loop after the 1.629 + * longest match from the earliest position. 1.630 + * + If no more set string matched after a previous string match, 1.631 + * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). 1.632 + * Stop if spanLength==0, otherwise continue the loop. 1.633 + */ 1.634 + 1.635 +int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 1.636 + if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1.637 + return spanNot(s, length); 1.638 + } 1.639 + int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); 1.640 + if(spanLength==length) { 1.641 + return length; 1.642 + } 1.643 + 1.644 + // Consider strings; they may overlap with the span. 1.645 + OffsetList offsets; 1.646 + if(spanCondition==USET_SPAN_CONTAINED) { 1.647 + // Use offset list to try all possibilities. 1.648 + offsets.setMaxLength(maxLength16); 1.649 + } 1.650 + int32_t pos=spanLength, rest=length-pos; 1.651 + int32_t i, stringsLength=strings.size(); 1.652 + for(;;) { 1.653 + if(spanCondition==USET_SPAN_CONTAINED) { 1.654 + for(i=0; i<stringsLength; ++i) { 1.655 + int32_t overlap=spanLengths[i]; 1.656 + if(overlap==ALL_CP_CONTAINED) { 1.657 + continue; // Irrelevant string. 1.658 + } 1.659 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.660 + const UChar *s16=string.getBuffer(); 1.661 + int32_t length16=string.length(); 1.662 + 1.663 + // Try to match this string at pos-overlap..pos. 1.664 + if(overlap>=LONG_SPAN) { 1.665 + overlap=length16; 1.666 + // While contained: No point matching fully inside the code point span. 1.667 + U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. 1.668 + } 1.669 + if(overlap>spanLength) { 1.670 + overlap=spanLength; 1.671 + } 1.672 + int32_t inc=length16-overlap; // Keep overlap+inc==length16. 1.673 + for(;;) { 1.674 + if(inc>rest) { 1.675 + break; 1.676 + } 1.677 + // Try to match if the increment is not listed already. 1.678 + if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { 1.679 + if(inc==rest) { 1.680 + return length; // Reached the end of the string. 1.681 + } 1.682 + offsets.addOffset(inc); 1.683 + } 1.684 + if(overlap==0) { 1.685 + break; 1.686 + } 1.687 + --overlap; 1.688 + ++inc; 1.689 + } 1.690 + } 1.691 + } else /* USET_SPAN_SIMPLE */ { 1.692 + int32_t maxInc=0, maxOverlap=0; 1.693 + for(i=0; i<stringsLength; ++i) { 1.694 + int32_t overlap=spanLengths[i]; 1.695 + // For longest match, we do need to try to match even an all-contained string 1.696 + // to find the match from the earliest start. 1.697 + 1.698 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.699 + const UChar *s16=string.getBuffer(); 1.700 + int32_t length16=string.length(); 1.701 + 1.702 + // Try to match this string at pos-overlap..pos. 1.703 + if(overlap>=LONG_SPAN) { 1.704 + overlap=length16; 1.705 + // Longest match: Need to match fully inside the code point span 1.706 + // to find the match from the earliest start. 1.707 + } 1.708 + if(overlap>spanLength) { 1.709 + overlap=spanLength; 1.710 + } 1.711 + int32_t inc=length16-overlap; // Keep overlap+inc==length16. 1.712 + for(;;) { 1.713 + if(inc>rest || overlap<maxOverlap) { 1.714 + break; 1.715 + } 1.716 + // Try to match if the string is longer or starts earlier. 1.717 + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1.718 + matches16CPB(s, pos-overlap, length, s16, length16) 1.719 + ) { 1.720 + maxInc=inc; // Longest match from earliest start. 1.721 + maxOverlap=overlap; 1.722 + break; 1.723 + } 1.724 + --overlap; 1.725 + ++inc; 1.726 + } 1.727 + } 1.728 + 1.729 + if(maxInc!=0 || maxOverlap!=0) { 1.730 + // Longest-match algorithm, and there was a string match. 1.731 + // Simply continue after it. 1.732 + pos+=maxInc; 1.733 + rest-=maxInc; 1.734 + if(rest==0) { 1.735 + return length; // Reached the end of the string. 1.736 + } 1.737 + spanLength=0; // Match strings from after a string match. 1.738 + continue; 1.739 + } 1.740 + } 1.741 + // Finished trying to match all strings at pos. 1.742 + 1.743 + if(spanLength!=0 || pos==0) { 1.744 + // The position is after an unlimited code point span (spanLength!=0), 1.745 + // not after a string match. 1.746 + // The only position where spanLength==0 after a span is pos==0. 1.747 + // Otherwise, an unlimited code point span is only tried again when no 1.748 + // strings match, and if such a non-initial span fails we stop. 1.749 + if(offsets.isEmpty()) { 1.750 + return pos; // No strings matched after a span. 1.751 + } 1.752 + // Match strings from after the next string match. 1.753 + } else { 1.754 + // The position is after a string match (or a single code point). 1.755 + if(offsets.isEmpty()) { 1.756 + // No more strings matched after a previous string match. 1.757 + // Try another code point span from after the last string match. 1.758 + spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); 1.759 + if( spanLength==rest || // Reached the end of the string, or 1.760 + spanLength==0 // neither strings nor span progressed. 1.761 + ) { 1.762 + return pos+spanLength; 1.763 + } 1.764 + pos+=spanLength; 1.765 + rest-=spanLength; 1.766 + continue; // spanLength>0: Match strings from after a span. 1.767 + } else { 1.768 + // Try to match only one code point from after a string match if some 1.769 + // string matched beyond it, so that we try all possible positions 1.770 + // and don't overshoot. 1.771 + spanLength=spanOne(spanSet, s+pos, rest); 1.772 + if(spanLength>0) { 1.773 + if(spanLength==rest) { 1.774 + return length; // Reached the end of the string. 1.775 + } 1.776 + // Match strings after this code point. 1.777 + // There cannot be any increments below it because UnicodeSet strings 1.778 + // contain multiple code points. 1.779 + pos+=spanLength; 1.780 + rest-=spanLength; 1.781 + offsets.shift(spanLength); 1.782 + spanLength=0; 1.783 + continue; // Match strings from after a single code point. 1.784 + } 1.785 + // Match strings from after the next string match. 1.786 + } 1.787 + } 1.788 + int32_t minOffset=offsets.popMinimum(); 1.789 + pos+=minOffset; 1.790 + rest-=minOffset; 1.791 + spanLength=0; // Match strings from after a string match. 1.792 + } 1.793 +} 1.794 + 1.795 +int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { 1.796 + if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1.797 + return spanNotBack(s, length); 1.798 + } 1.799 + int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); 1.800 + if(pos==0) { 1.801 + return 0; 1.802 + } 1.803 + int32_t spanLength=length-pos; 1.804 + 1.805 + // Consider strings; they may overlap with the span. 1.806 + OffsetList offsets; 1.807 + if(spanCondition==USET_SPAN_CONTAINED) { 1.808 + // Use offset list to try all possibilities. 1.809 + offsets.setMaxLength(maxLength16); 1.810 + } 1.811 + int32_t i, stringsLength=strings.size(); 1.812 + uint8_t *spanBackLengths=spanLengths; 1.813 + if(all) { 1.814 + spanBackLengths+=stringsLength; 1.815 + } 1.816 + for(;;) { 1.817 + if(spanCondition==USET_SPAN_CONTAINED) { 1.818 + for(i=0; i<stringsLength; ++i) { 1.819 + int32_t overlap=spanBackLengths[i]; 1.820 + if(overlap==ALL_CP_CONTAINED) { 1.821 + continue; // Irrelevant string. 1.822 + } 1.823 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.824 + const UChar *s16=string.getBuffer(); 1.825 + int32_t length16=string.length(); 1.826 + 1.827 + // Try to match this string at pos-(length16-overlap)..pos-length16. 1.828 + if(overlap>=LONG_SPAN) { 1.829 + overlap=length16; 1.830 + // While contained: No point matching fully inside the code point span. 1.831 + int32_t len1=0; 1.832 + U16_FWD_1(s16, len1, overlap); 1.833 + overlap-=len1; // Length of the string minus the first code point. 1.834 + } 1.835 + if(overlap>spanLength) { 1.836 + overlap=spanLength; 1.837 + } 1.838 + int32_t dec=length16-overlap; // Keep dec+overlap==length16. 1.839 + for(;;) { 1.840 + if(dec>pos) { 1.841 + break; 1.842 + } 1.843 + // Try to match if the decrement is not listed already. 1.844 + if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { 1.845 + if(dec==pos) { 1.846 + return 0; // Reached the start of the string. 1.847 + } 1.848 + offsets.addOffset(dec); 1.849 + } 1.850 + if(overlap==0) { 1.851 + break; 1.852 + } 1.853 + --overlap; 1.854 + ++dec; 1.855 + } 1.856 + } 1.857 + } else /* USET_SPAN_SIMPLE */ { 1.858 + int32_t maxDec=0, maxOverlap=0; 1.859 + for(i=0; i<stringsLength; ++i) { 1.860 + int32_t overlap=spanBackLengths[i]; 1.861 + // For longest match, we do need to try to match even an all-contained string 1.862 + // to find the match from the latest end. 1.863 + 1.864 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.865 + const UChar *s16=string.getBuffer(); 1.866 + int32_t length16=string.length(); 1.867 + 1.868 + // Try to match this string at pos-(length16-overlap)..pos-length16. 1.869 + if(overlap>=LONG_SPAN) { 1.870 + overlap=length16; 1.871 + // Longest match: Need to match fully inside the code point span 1.872 + // to find the match from the latest end. 1.873 + } 1.874 + if(overlap>spanLength) { 1.875 + overlap=spanLength; 1.876 + } 1.877 + int32_t dec=length16-overlap; // Keep dec+overlap==length16. 1.878 + for(;;) { 1.879 + if(dec>pos || overlap<maxOverlap) { 1.880 + break; 1.881 + } 1.882 + // Try to match if the string is longer or ends later. 1.883 + if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1.884 + matches16CPB(s, pos-dec, length, s16, length16) 1.885 + ) { 1.886 + maxDec=dec; // Longest match from latest end. 1.887 + maxOverlap=overlap; 1.888 + break; 1.889 + } 1.890 + --overlap; 1.891 + ++dec; 1.892 + } 1.893 + } 1.894 + 1.895 + if(maxDec!=0 || maxOverlap!=0) { 1.896 + // Longest-match algorithm, and there was a string match. 1.897 + // Simply continue before it. 1.898 + pos-=maxDec; 1.899 + if(pos==0) { 1.900 + return 0; // Reached the start of the string. 1.901 + } 1.902 + spanLength=0; // Match strings from before a string match. 1.903 + continue; 1.904 + } 1.905 + } 1.906 + // Finished trying to match all strings at pos. 1.907 + 1.908 + if(spanLength!=0 || pos==length) { 1.909 + // The position is before an unlimited code point span (spanLength!=0), 1.910 + // not before a string match. 1.911 + // The only position where spanLength==0 before a span is pos==length. 1.912 + // Otherwise, an unlimited code point span is only tried again when no 1.913 + // strings match, and if such a non-initial span fails we stop. 1.914 + if(offsets.isEmpty()) { 1.915 + return pos; // No strings matched before a span. 1.916 + } 1.917 + // Match strings from before the next string match. 1.918 + } else { 1.919 + // The position is before a string match (or a single code point). 1.920 + if(offsets.isEmpty()) { 1.921 + // No more strings matched before a previous string match. 1.922 + // Try another code point span from before the last string match. 1.923 + int32_t oldPos=pos; 1.924 + pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); 1.925 + spanLength=oldPos-pos; 1.926 + if( pos==0 || // Reached the start of the string, or 1.927 + spanLength==0 // neither strings nor span progressed. 1.928 + ) { 1.929 + return pos; 1.930 + } 1.931 + continue; // spanLength>0: Match strings from before a span. 1.932 + } else { 1.933 + // Try to match only one code point from before a string match if some 1.934 + // string matched beyond it, so that we try all possible positions 1.935 + // and don't overshoot. 1.936 + spanLength=spanOneBack(spanSet, s, pos); 1.937 + if(spanLength>0) { 1.938 + if(spanLength==pos) { 1.939 + return 0; // Reached the start of the string. 1.940 + } 1.941 + // Match strings before this code point. 1.942 + // There cannot be any decrements below it because UnicodeSet strings 1.943 + // contain multiple code points. 1.944 + pos-=spanLength; 1.945 + offsets.shift(spanLength); 1.946 + spanLength=0; 1.947 + continue; // Match strings from before a single code point. 1.948 + } 1.949 + // Match strings from before the next string match. 1.950 + } 1.951 + } 1.952 + pos-=offsets.popMinimum(); 1.953 + spanLength=0; // Match strings from before a string match. 1.954 + } 1.955 +} 1.956 + 1.957 +int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1.958 + if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1.959 + return spanNotUTF8(s, length); 1.960 + } 1.961 + int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1.962 + if(spanLength==length) { 1.963 + return length; 1.964 + } 1.965 + 1.966 + // Consider strings; they may overlap with the span. 1.967 + OffsetList offsets; 1.968 + if(spanCondition==USET_SPAN_CONTAINED) { 1.969 + // Use offset list to try all possibilities. 1.970 + offsets.setMaxLength(maxLength8); 1.971 + } 1.972 + int32_t pos=spanLength, rest=length-pos; 1.973 + int32_t i, stringsLength=strings.size(); 1.974 + uint8_t *spanUTF8Lengths=spanLengths; 1.975 + if(all) { 1.976 + spanUTF8Lengths+=2*stringsLength; 1.977 + } 1.978 + for(;;) { 1.979 + const uint8_t *s8=utf8; 1.980 + int32_t length8; 1.981 + if(spanCondition==USET_SPAN_CONTAINED) { 1.982 + for(i=0; i<stringsLength; ++i) { 1.983 + length8=utf8Lengths[i]; 1.984 + if(length8==0) { 1.985 + continue; // String not representable in UTF-8. 1.986 + } 1.987 + int32_t overlap=spanUTF8Lengths[i]; 1.988 + if(overlap==ALL_CP_CONTAINED) { 1.989 + s8+=length8; 1.990 + continue; // Irrelevant string. 1.991 + } 1.992 + 1.993 + // Try to match this string at pos-overlap..pos. 1.994 + if(overlap>=LONG_SPAN) { 1.995 + overlap=length8; 1.996 + // While contained: No point matching fully inside the code point span. 1.997 + U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. 1.998 + } 1.999 + if(overlap>spanLength) { 1.1000 + overlap=spanLength; 1.1001 + } 1.1002 + int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1.1003 + for(;;) { 1.1004 + if(inc>rest) { 1.1005 + break; 1.1006 + } 1.1007 + // Try to match if the increment is not listed already. 1.1008 + // Match at code point boundaries. (The UTF-8 strings were converted 1.1009 + // from UTF-16 and are guaranteed to be well-formed.) 1.1010 + if( !U8_IS_TRAIL(s[pos-overlap]) && 1.1011 + !offsets.containsOffset(inc) && 1.1012 + matches8(s+pos-overlap, s8, length8) 1.1013 + 1.1014 + ) { 1.1015 + if(inc==rest) { 1.1016 + return length; // Reached the end of the string. 1.1017 + } 1.1018 + offsets.addOffset(inc); 1.1019 + } 1.1020 + if(overlap==0) { 1.1021 + break; 1.1022 + } 1.1023 + --overlap; 1.1024 + ++inc; 1.1025 + } 1.1026 + s8+=length8; 1.1027 + } 1.1028 + } else /* USET_SPAN_SIMPLE */ { 1.1029 + int32_t maxInc=0, maxOverlap=0; 1.1030 + for(i=0; i<stringsLength; ++i) { 1.1031 + length8=utf8Lengths[i]; 1.1032 + if(length8==0) { 1.1033 + continue; // String not representable in UTF-8. 1.1034 + } 1.1035 + int32_t overlap=spanUTF8Lengths[i]; 1.1036 + // For longest match, we do need to try to match even an all-contained string 1.1037 + // to find the match from the earliest start. 1.1038 + 1.1039 + // Try to match this string at pos-overlap..pos. 1.1040 + if(overlap>=LONG_SPAN) { 1.1041 + overlap=length8; 1.1042 + // Longest match: Need to match fully inside the code point span 1.1043 + // to find the match from the earliest start. 1.1044 + } 1.1045 + if(overlap>spanLength) { 1.1046 + overlap=spanLength; 1.1047 + } 1.1048 + int32_t inc=length8-overlap; // Keep overlap+inc==length8. 1.1049 + for(;;) { 1.1050 + if(inc>rest || overlap<maxOverlap) { 1.1051 + break; 1.1052 + } 1.1053 + // Try to match if the string is longer or starts earlier. 1.1054 + // Match at code point boundaries. (The UTF-8 strings were converted 1.1055 + // from UTF-16 and are guaranteed to be well-formed.) 1.1056 + if( !U8_IS_TRAIL(s[pos-overlap]) && 1.1057 + (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && 1.1058 + matches8(s+pos-overlap, s8, length8) 1.1059 + 1.1060 + ) { 1.1061 + maxInc=inc; // Longest match from earliest start. 1.1062 + maxOverlap=overlap; 1.1063 + break; 1.1064 + } 1.1065 + --overlap; 1.1066 + ++inc; 1.1067 + } 1.1068 + s8+=length8; 1.1069 + } 1.1070 + 1.1071 + if(maxInc!=0 || maxOverlap!=0) { 1.1072 + // Longest-match algorithm, and there was a string match. 1.1073 + // Simply continue after it. 1.1074 + pos+=maxInc; 1.1075 + rest-=maxInc; 1.1076 + if(rest==0) { 1.1077 + return length; // Reached the end of the string. 1.1078 + } 1.1079 + spanLength=0; // Match strings from after a string match. 1.1080 + continue; 1.1081 + } 1.1082 + } 1.1083 + // Finished trying to match all strings at pos. 1.1084 + 1.1085 + if(spanLength!=0 || pos==0) { 1.1086 + // The position is after an unlimited code point span (spanLength!=0), 1.1087 + // not after a string match. 1.1088 + // The only position where spanLength==0 after a span is pos==0. 1.1089 + // Otherwise, an unlimited code point span is only tried again when no 1.1090 + // strings match, and if such a non-initial span fails we stop. 1.1091 + if(offsets.isEmpty()) { 1.1092 + return pos; // No strings matched after a span. 1.1093 + } 1.1094 + // Match strings from after the next string match. 1.1095 + } else { 1.1096 + // The position is after a string match (or a single code point). 1.1097 + if(offsets.isEmpty()) { 1.1098 + // No more strings matched after a previous string match. 1.1099 + // Try another code point span from after the last string match. 1.1100 + spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); 1.1101 + if( spanLength==rest || // Reached the end of the string, or 1.1102 + spanLength==0 // neither strings nor span progressed. 1.1103 + ) { 1.1104 + return pos+spanLength; 1.1105 + } 1.1106 + pos+=spanLength; 1.1107 + rest-=spanLength; 1.1108 + continue; // spanLength>0: Match strings from after a span. 1.1109 + } else { 1.1110 + // Try to match only one code point from after a string match if some 1.1111 + // string matched beyond it, so that we try all possible positions 1.1112 + // and don't overshoot. 1.1113 + spanLength=spanOneUTF8(spanSet, s+pos, rest); 1.1114 + if(spanLength>0) { 1.1115 + if(spanLength==rest) { 1.1116 + return length; // Reached the end of the string. 1.1117 + } 1.1118 + // Match strings after this code point. 1.1119 + // There cannot be any increments below it because UnicodeSet strings 1.1120 + // contain multiple code points. 1.1121 + pos+=spanLength; 1.1122 + rest-=spanLength; 1.1123 + offsets.shift(spanLength); 1.1124 + spanLength=0; 1.1125 + continue; // Match strings from after a single code point. 1.1126 + } 1.1127 + // Match strings from after the next string match. 1.1128 + } 1.1129 + } 1.1130 + int32_t minOffset=offsets.popMinimum(); 1.1131 + pos+=minOffset; 1.1132 + rest-=minOffset; 1.1133 + spanLength=0; // Match strings from after a string match. 1.1134 + } 1.1135 +} 1.1136 + 1.1137 +int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { 1.1138 + if(spanCondition==USET_SPAN_NOT_CONTAINED) { 1.1139 + return spanNotBackUTF8(s, length); 1.1140 + } 1.1141 + int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); 1.1142 + if(pos==0) { 1.1143 + return 0; 1.1144 + } 1.1145 + int32_t spanLength=length-pos; 1.1146 + 1.1147 + // Consider strings; they may overlap with the span. 1.1148 + OffsetList offsets; 1.1149 + if(spanCondition==USET_SPAN_CONTAINED) { 1.1150 + // Use offset list to try all possibilities. 1.1151 + offsets.setMaxLength(maxLength8); 1.1152 + } 1.1153 + int32_t i, stringsLength=strings.size(); 1.1154 + uint8_t *spanBackUTF8Lengths=spanLengths; 1.1155 + if(all) { 1.1156 + spanBackUTF8Lengths+=3*stringsLength; 1.1157 + } 1.1158 + for(;;) { 1.1159 + const uint8_t *s8=utf8; 1.1160 + int32_t length8; 1.1161 + if(spanCondition==USET_SPAN_CONTAINED) { 1.1162 + for(i=0; i<stringsLength; ++i) { 1.1163 + length8=utf8Lengths[i]; 1.1164 + if(length8==0) { 1.1165 + continue; // String not representable in UTF-8. 1.1166 + } 1.1167 + int32_t overlap=spanBackUTF8Lengths[i]; 1.1168 + if(overlap==ALL_CP_CONTAINED) { 1.1169 + s8+=length8; 1.1170 + continue; // Irrelevant string. 1.1171 + } 1.1172 + 1.1173 + // Try to match this string at pos-(length8-overlap)..pos-length8. 1.1174 + if(overlap>=LONG_SPAN) { 1.1175 + overlap=length8; 1.1176 + // While contained: No point matching fully inside the code point span. 1.1177 + int32_t len1=0; 1.1178 + U8_FWD_1(s8, len1, overlap); 1.1179 + overlap-=len1; // Length of the string minus the first code point. 1.1180 + } 1.1181 + if(overlap>spanLength) { 1.1182 + overlap=spanLength; 1.1183 + } 1.1184 + int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1.1185 + for(;;) { 1.1186 + if(dec>pos) { 1.1187 + break; 1.1188 + } 1.1189 + // Try to match if the decrement is not listed already. 1.1190 + // Match at code point boundaries. (The UTF-8 strings were converted 1.1191 + // from UTF-16 and are guaranteed to be well-formed.) 1.1192 + if( !U8_IS_TRAIL(s[pos-dec]) && 1.1193 + !offsets.containsOffset(dec) && 1.1194 + matches8(s+pos-dec, s8, length8) 1.1195 + ) { 1.1196 + if(dec==pos) { 1.1197 + return 0; // Reached the start of the string. 1.1198 + } 1.1199 + offsets.addOffset(dec); 1.1200 + } 1.1201 + if(overlap==0) { 1.1202 + break; 1.1203 + } 1.1204 + --overlap; 1.1205 + ++dec; 1.1206 + } 1.1207 + s8+=length8; 1.1208 + } 1.1209 + } else /* USET_SPAN_SIMPLE */ { 1.1210 + int32_t maxDec=0, maxOverlap=0; 1.1211 + for(i=0; i<stringsLength; ++i) { 1.1212 + length8=utf8Lengths[i]; 1.1213 + if(length8==0) { 1.1214 + continue; // String not representable in UTF-8. 1.1215 + } 1.1216 + int32_t overlap=spanBackUTF8Lengths[i]; 1.1217 + // For longest match, we do need to try to match even an all-contained string 1.1218 + // to find the match from the latest end. 1.1219 + 1.1220 + // Try to match this string at pos-(length8-overlap)..pos-length8. 1.1221 + if(overlap>=LONG_SPAN) { 1.1222 + overlap=length8; 1.1223 + // Longest match: Need to match fully inside the code point span 1.1224 + // to find the match from the latest end. 1.1225 + } 1.1226 + if(overlap>spanLength) { 1.1227 + overlap=spanLength; 1.1228 + } 1.1229 + int32_t dec=length8-overlap; // Keep dec+overlap==length8. 1.1230 + for(;;) { 1.1231 + if(dec>pos || overlap<maxOverlap) { 1.1232 + break; 1.1233 + } 1.1234 + // Try to match if the string is longer or ends later. 1.1235 + // Match at code point boundaries. (The UTF-8 strings were converted 1.1236 + // from UTF-16 and are guaranteed to be well-formed.) 1.1237 + if( !U8_IS_TRAIL(s[pos-dec]) && 1.1238 + (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && 1.1239 + matches8(s+pos-dec, s8, length8) 1.1240 + ) { 1.1241 + maxDec=dec; // Longest match from latest end. 1.1242 + maxOverlap=overlap; 1.1243 + break; 1.1244 + } 1.1245 + --overlap; 1.1246 + ++dec; 1.1247 + } 1.1248 + s8+=length8; 1.1249 + } 1.1250 + 1.1251 + if(maxDec!=0 || maxOverlap!=0) { 1.1252 + // Longest-match algorithm, and there was a string match. 1.1253 + // Simply continue before it. 1.1254 + pos-=maxDec; 1.1255 + if(pos==0) { 1.1256 + return 0; // Reached the start of the string. 1.1257 + } 1.1258 + spanLength=0; // Match strings from before a string match. 1.1259 + continue; 1.1260 + } 1.1261 + } 1.1262 + // Finished trying to match all strings at pos. 1.1263 + 1.1264 + if(spanLength!=0 || pos==length) { 1.1265 + // The position is before an unlimited code point span (spanLength!=0), 1.1266 + // not before a string match. 1.1267 + // The only position where spanLength==0 before a span is pos==length. 1.1268 + // Otherwise, an unlimited code point span is only tried again when no 1.1269 + // strings match, and if such a non-initial span fails we stop. 1.1270 + if(offsets.isEmpty()) { 1.1271 + return pos; // No strings matched before a span. 1.1272 + } 1.1273 + // Match strings from before the next string match. 1.1274 + } else { 1.1275 + // The position is before a string match (or a single code point). 1.1276 + if(offsets.isEmpty()) { 1.1277 + // No more strings matched before a previous string match. 1.1278 + // Try another code point span from before the last string match. 1.1279 + int32_t oldPos=pos; 1.1280 + pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); 1.1281 + spanLength=oldPos-pos; 1.1282 + if( pos==0 || // Reached the start of the string, or 1.1283 + spanLength==0 // neither strings nor span progressed. 1.1284 + ) { 1.1285 + return pos; 1.1286 + } 1.1287 + continue; // spanLength>0: Match strings from before a span. 1.1288 + } else { 1.1289 + // Try to match only one code point from before a string match if some 1.1290 + // string matched beyond it, so that we try all possible positions 1.1291 + // and don't overshoot. 1.1292 + spanLength=spanOneBackUTF8(spanSet, s, pos); 1.1293 + if(spanLength>0) { 1.1294 + if(spanLength==pos) { 1.1295 + return 0; // Reached the start of the string. 1.1296 + } 1.1297 + // Match strings before this code point. 1.1298 + // There cannot be any decrements below it because UnicodeSet strings 1.1299 + // contain multiple code points. 1.1300 + pos-=spanLength; 1.1301 + offsets.shift(spanLength); 1.1302 + spanLength=0; 1.1303 + continue; // Match strings from before a single code point. 1.1304 + } 1.1305 + // Match strings from before the next string match. 1.1306 + } 1.1307 + } 1.1308 + pos-=offsets.popMinimum(); 1.1309 + spanLength=0; // Match strings from before a string match. 1.1310 + } 1.1311 +} 1.1312 + 1.1313 +/* 1.1314 + * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) 1.1315 + * 1.1316 + * Theoretical algorithm: 1.1317 + * - Iterate through the string, and at each code point boundary: 1.1318 + * + If the code point there is in the set, then return with the current position. 1.1319 + * + If a set string matches at the current position, then return with the current position. 1.1320 + * 1.1321 + * Optimized implementation: 1.1322 + * 1.1323 + * (Same assumption as for span() above.) 1.1324 + * 1.1325 + * Create and cache a spanNotSet which contains all of the single code points 1.1326 + * of the original set but none of its strings. 1.1327 + * For each set string add its initial code point to the spanNotSet. 1.1328 + * (Also add its final code point for spanNotBack().) 1.1329 + * 1.1330 + * - Loop: 1.1331 + * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). 1.1332 + * + If the current code point is in the original set, then 1.1333 + * return the current position. 1.1334 + * + If any set string matches at the current position, then 1.1335 + * return the current position. 1.1336 + * + If there is no match at the current position, neither for the code point there 1.1337 + * nor for any set string, then skip this code point and continue the loop. 1.1338 + * This happens for set-string-initial code points that were added to spanNotSet 1.1339 + * when there is not actually a match for such a set string. 1.1340 + */ 1.1341 + 1.1342 +int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { 1.1343 + int32_t pos=0, rest=length; 1.1344 + int32_t i, stringsLength=strings.size(); 1.1345 + do { 1.1346 + // Span until we find a code point from the set, 1.1347 + // or a code point that starts or ends some string. 1.1348 + i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); 1.1349 + if(i==rest) { 1.1350 + return length; // Reached the end of the string. 1.1351 + } 1.1352 + pos+=i; 1.1353 + rest-=i; 1.1354 + 1.1355 + // Check whether the current code point is in the original set, 1.1356 + // without the string starts and ends. 1.1357 + int32_t cpLength=spanOne(spanSet, s+pos, rest); 1.1358 + if(cpLength>0) { 1.1359 + return pos; // There is a set element at pos. 1.1360 + } 1.1361 + 1.1362 + // Try to match the strings at pos. 1.1363 + for(i=0; i<stringsLength; ++i) { 1.1364 + if(spanLengths[i]==ALL_CP_CONTAINED) { 1.1365 + continue; // Irrelevant string. 1.1366 + } 1.1367 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.1368 + const UChar *s16=string.getBuffer(); 1.1369 + int32_t length16=string.length(); 1.1370 + if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) { 1.1371 + return pos; // There is a set element at pos. 1.1372 + } 1.1373 + } 1.1374 + 1.1375 + // The span(while not contained) ended on a string start/end which is 1.1376 + // not in the original set. Skip this code point and continue. 1.1377 + // cpLength<0 1.1378 + pos-=cpLength; 1.1379 + rest+=cpLength; 1.1380 + } while(rest!=0); 1.1381 + return length; // Reached the end of the string. 1.1382 +} 1.1383 + 1.1384 +int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const { 1.1385 + int32_t pos=length; 1.1386 + int32_t i, stringsLength=strings.size(); 1.1387 + do { 1.1388 + // Span until we find a code point from the set, 1.1389 + // or a code point that starts or ends some string. 1.1390 + pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED); 1.1391 + if(pos==0) { 1.1392 + return 0; // Reached the start of the string. 1.1393 + } 1.1394 + 1.1395 + // Check whether the current code point is in the original set, 1.1396 + // without the string starts and ends. 1.1397 + int32_t cpLength=spanOneBack(spanSet, s, pos); 1.1398 + if(cpLength>0) { 1.1399 + return pos; // There is a set element at pos. 1.1400 + } 1.1401 + 1.1402 + // Try to match the strings at pos. 1.1403 + for(i=0; i<stringsLength; ++i) { 1.1404 + // Use spanLengths rather than a spanBackLengths pointer because 1.1405 + // it is easier and we only need to know whether the string is irrelevant 1.1406 + // which is the same in either array. 1.1407 + if(spanLengths[i]==ALL_CP_CONTAINED) { 1.1408 + continue; // Irrelevant string. 1.1409 + } 1.1410 + const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i); 1.1411 + const UChar *s16=string.getBuffer(); 1.1412 + int32_t length16=string.length(); 1.1413 + if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) { 1.1414 + return pos; // There is a set element at pos. 1.1415 + } 1.1416 + } 1.1417 + 1.1418 + // The span(while not contained) ended on a string start/end which is 1.1419 + // not in the original set. Skip this code point and continue. 1.1420 + // cpLength<0 1.1421 + pos+=cpLength; 1.1422 + } while(pos!=0); 1.1423 + return 0; // Reached the start of the string. 1.1424 +} 1.1425 + 1.1426 +int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const { 1.1427 + int32_t pos=0, rest=length; 1.1428 + int32_t i, stringsLength=strings.size(); 1.1429 + uint8_t *spanUTF8Lengths=spanLengths; 1.1430 + if(all) { 1.1431 + spanUTF8Lengths+=2*stringsLength; 1.1432 + } 1.1433 + do { 1.1434 + // Span until we find a code point from the set, 1.1435 + // or a code point that starts or ends some string. 1.1436 + i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); 1.1437 + if(i==rest) { 1.1438 + return length; // Reached the end of the string. 1.1439 + } 1.1440 + pos+=i; 1.1441 + rest-=i; 1.1442 + 1.1443 + // Check whether the current code point is in the original set, 1.1444 + // without the string starts and ends. 1.1445 + int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); 1.1446 + if(cpLength>0) { 1.1447 + return pos; // There is a set element at pos. 1.1448 + } 1.1449 + 1.1450 + // Try to match the strings at pos. 1.1451 + const uint8_t *s8=utf8; 1.1452 + int32_t length8; 1.1453 + for(i=0; i<stringsLength; ++i) { 1.1454 + length8=utf8Lengths[i]; 1.1455 + // ALL_CP_CONTAINED: Irrelevant string. 1.1456 + if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) { 1.1457 + return pos; // There is a set element at pos. 1.1458 + } 1.1459 + s8+=length8; 1.1460 + } 1.1461 + 1.1462 + // The span(while not contained) ended on a string start/end which is 1.1463 + // not in the original set. Skip this code point and continue. 1.1464 + // cpLength<0 1.1465 + pos-=cpLength; 1.1466 + rest+=cpLength; 1.1467 + } while(rest!=0); 1.1468 + return length; // Reached the end of the string. 1.1469 +} 1.1470 + 1.1471 +int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const { 1.1472 + int32_t pos=length; 1.1473 + int32_t i, stringsLength=strings.size(); 1.1474 + uint8_t *spanBackUTF8Lengths=spanLengths; 1.1475 + if(all) { 1.1476 + spanBackUTF8Lengths+=3*stringsLength; 1.1477 + } 1.1478 + do { 1.1479 + // Span until we find a code point from the set, 1.1480 + // or a code point that starts or ends some string. 1.1481 + pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); 1.1482 + if(pos==0) { 1.1483 + return 0; // Reached the start of the string. 1.1484 + } 1.1485 + 1.1486 + // Check whether the current code point is in the original set, 1.1487 + // without the string starts and ends. 1.1488 + int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); 1.1489 + if(cpLength>0) { 1.1490 + return pos; // There is a set element at pos. 1.1491 + } 1.1492 + 1.1493 + // Try to match the strings at pos. 1.1494 + const uint8_t *s8=utf8; 1.1495 + int32_t length8; 1.1496 + for(i=0; i<stringsLength; ++i) { 1.1497 + length8=utf8Lengths[i]; 1.1498 + // ALL_CP_CONTAINED: Irrelevant string. 1.1499 + if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) { 1.1500 + return pos; // There is a set element at pos. 1.1501 + } 1.1502 + s8+=length8; 1.1503 + } 1.1504 + 1.1505 + // The span(while not contained) ended on a string start/end which is 1.1506 + // not in the original set. Skip this code point and continue. 1.1507 + // cpLength<0 1.1508 + pos+=cpLength; 1.1509 + } while(pos!=0); 1.1510 + return 0; // Reached the start of the string. 1.1511 +} 1.1512 + 1.1513 +U_NAMESPACE_END