intl/icu/source/common/unisetspan.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2007-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 * file name: unisetspan.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2007mar01
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18 #include "unicode/uniset.h"
michael@0 19 #include "unicode/ustring.h"
michael@0 20 #include "unicode/utf8.h"
michael@0 21 #include "unicode/utf16.h"
michael@0 22 #include "cmemory.h"
michael@0 23 #include "uvector.h"
michael@0 24 #include "unisetspan.h"
michael@0 25
michael@0 26 U_NAMESPACE_BEGIN
michael@0 27
michael@0 28 /*
michael@0 29 * List of offsets from the current position from where to try matching
michael@0 30 * a code point or a string.
michael@0 31 * Store offsets rather than indexes to simplify the code and use the same list
michael@0 32 * for both increments (in span()) and decrements (in spanBack()).
michael@0 33 *
michael@0 34 * Assumption: The maximum offset is limited, and the offsets that are stored
michael@0 35 * at any one time are relatively dense, that is, there are normally no gaps of
michael@0 36 * hundreds or thousands of offset values.
michael@0 37 *
michael@0 38 * The implementation uses a circular buffer of byte flags,
michael@0 39 * each indicating whether the corresponding offset is in the list.
michael@0 40 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
michael@0 41 * physically moving part of the list.
michael@0 42 *
michael@0 43 * Note: In principle, the caller should setMaxLength() to the maximum of the
michael@0 44 * max string length and U16_LENGTH/U8_LENGTH to account for
michael@0 45 * "long" single code points.
michael@0 46 * However, this implementation uses at least a staticList with more than
michael@0 47 * U8_LENGTH entries anyway.
michael@0 48 *
michael@0 49 * Note: If maxLength were guaranteed to be no more than 32 or 64,
michael@0 50 * the list could be stored as bit flags in a single integer.
michael@0 51 * Rather than handling a circular buffer with a start list index,
michael@0 52 * the integer would simply be shifted when lower offsets are removed.
michael@0 53 * UnicodeSet does not have a limit on the lengths of strings.
michael@0 54 */
michael@0 55 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
michael@0 56 public:
michael@0 57 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
michael@0 58
michael@0 59 ~OffsetList() {
michael@0 60 if(list!=staticList) {
michael@0 61 uprv_free(list);
michael@0 62 }
michael@0 63 }
michael@0 64
michael@0 65 // Call exactly once if the list is to be used.
michael@0 66 void setMaxLength(int32_t maxLength) {
michael@0 67 if(maxLength<=(int32_t)sizeof(staticList)) {
michael@0 68 capacity=(int32_t)sizeof(staticList);
michael@0 69 } else {
michael@0 70 UBool *l=(UBool *)uprv_malloc(maxLength);
michael@0 71 if(l!=NULL) {
michael@0 72 list=l;
michael@0 73 capacity=maxLength;
michael@0 74 }
michael@0 75 }
michael@0 76 uprv_memset(list, 0, capacity);
michael@0 77 }
michael@0 78
michael@0 79 void clear() {
michael@0 80 uprv_memset(list, 0, capacity);
michael@0 81 start=length=0;
michael@0 82 }
michael@0 83
michael@0 84 UBool isEmpty() const {
michael@0 85 return (UBool)(length==0);
michael@0 86 }
michael@0 87
michael@0 88 // Reduce all stored offsets by delta, used when the current position
michael@0 89 // moves by delta.
michael@0 90 // There must not be any offsets lower than delta.
michael@0 91 // If there is an offset equal to delta, it is removed.
michael@0 92 // delta=[1..maxLength]
michael@0 93 void shift(int32_t delta) {
michael@0 94 int32_t i=start+delta;
michael@0 95 if(i>=capacity) {
michael@0 96 i-=capacity;
michael@0 97 }
michael@0 98 if(list[i]) {
michael@0 99 list[i]=FALSE;
michael@0 100 --length;
michael@0 101 }
michael@0 102 start=i;
michael@0 103 }
michael@0 104
michael@0 105 // Add an offset. The list must not contain it yet.
michael@0 106 // offset=[1..maxLength]
michael@0 107 void addOffset(int32_t offset) {
michael@0 108 int32_t i=start+offset;
michael@0 109 if(i>=capacity) {
michael@0 110 i-=capacity;
michael@0 111 }
michael@0 112 list[i]=TRUE;
michael@0 113 ++length;
michael@0 114 }
michael@0 115
michael@0 116 // offset=[1..maxLength]
michael@0 117 UBool containsOffset(int32_t offset) const {
michael@0 118 int32_t i=start+offset;
michael@0 119 if(i>=capacity) {
michael@0 120 i-=capacity;
michael@0 121 }
michael@0 122 return list[i];
michael@0 123 }
michael@0 124
michael@0 125 // Find the lowest stored offset from a non-empty list, remove it,
michael@0 126 // and reduce all other offsets by this minimum.
michael@0 127 // Returns [1..maxLength].
michael@0 128 int32_t popMinimum() {
michael@0 129 // Look for the next offset in list[start+1..capacity-1].
michael@0 130 int32_t i=start, result;
michael@0 131 while(++i<capacity) {
michael@0 132 if(list[i]) {
michael@0 133 list[i]=FALSE;
michael@0 134 --length;
michael@0 135 result=i-start;
michael@0 136 start=i;
michael@0 137 return result;
michael@0 138 }
michael@0 139 }
michael@0 140 // i==capacity
michael@0 141
michael@0 142 // Wrap around and look for the next offset in list[0..start].
michael@0 143 // Since the list is not empty, there will be one.
michael@0 144 result=capacity-start;
michael@0 145 i=0;
michael@0 146 while(!list[i]) {
michael@0 147 ++i;
michael@0 148 }
michael@0 149 list[i]=FALSE;
michael@0 150 --length;
michael@0 151 start=i;
michael@0 152 return result+=i;
michael@0 153 }
michael@0 154
michael@0 155 private:
michael@0 156 UBool *list;
michael@0 157 int32_t capacity;
michael@0 158 int32_t length;
michael@0 159 int32_t start;
michael@0 160
michael@0 161 UBool staticList[16];
michael@0 162 };
michael@0 163
michael@0 164 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
michael@0 165 static int32_t
michael@0 166 getUTF8Length(const UChar *s, int32_t length) {
michael@0 167 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 168 int32_t length8=0;
michael@0 169 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
michael@0 170 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
michael@0 171 return length8;
michael@0 172 } else {
michael@0 173 // The string contains an unpaired surrogate.
michael@0 174 // Ignore this string.
michael@0 175 return 0;
michael@0 176 }
michael@0 177 }
michael@0 178
michael@0 179 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
michael@0 180 static int32_t
michael@0 181 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
michael@0 182 UErrorCode errorCode=U_ZERO_ERROR;
michael@0 183 int32_t length8=0;
michael@0 184 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
michael@0 185 if(U_SUCCESS(errorCode)) {
michael@0 186 return length8;
michael@0 187 } else {
michael@0 188 // The string contains an unpaired surrogate.
michael@0 189 // Ignore this string.
michael@0 190 return 0;
michael@0 191 }
michael@0 192 }
michael@0 193
michael@0 194 static inline uint8_t
michael@0 195 makeSpanLengthByte(int32_t spanLength) {
michael@0 196 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
michael@0 197 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
michael@0 198 }
michael@0 199
michael@0 200 // Construct for all variants of span(), or only for any one variant.
michael@0 201 // Initialize as little as possible, for single use.
michael@0 202 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
michael@0 203 const UVector &setStrings,
michael@0 204 uint32_t which)
michael@0 205 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
michael@0 206 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
michael@0 207 utf8Length(0),
michael@0 208 maxLength16(0), maxLength8(0),
michael@0 209 all((UBool)(which==ALL)) {
michael@0 210 spanSet.retainAll(set);
michael@0 211 if(which&NOT_CONTAINED) {
michael@0 212 // Default to the same sets.
michael@0 213 // addToSpanNotSet() will create a separate set if necessary.
michael@0 214 pSpanNotSet=&spanSet;
michael@0 215 }
michael@0 216
michael@0 217 // Determine if the strings even need to be taken into account at all for span() etc.
michael@0 218 // If any string is relevant, then all strings need to be used for
michael@0 219 // span(longest match) but only the relevant ones for span(while contained).
michael@0 220 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
michael@0 221 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
michael@0 222 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
michael@0 223 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
michael@0 224 int32_t stringsLength=strings.size();
michael@0 225
michael@0 226 int32_t i, spanLength;
michael@0 227 UBool someRelevant=FALSE;
michael@0 228 for(i=0; i<stringsLength; ++i) {
michael@0 229 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 230 const UChar *s16=string.getBuffer();
michael@0 231 int32_t length16=string.length();
michael@0 232 UBool thisRelevant;
michael@0 233 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
michael@0 234 if(spanLength<length16) { // Relevant string.
michael@0 235 someRelevant=thisRelevant=TRUE;
michael@0 236 } else {
michael@0 237 thisRelevant=FALSE;
michael@0 238 }
michael@0 239 if((which&UTF16) && length16>maxLength16) {
michael@0 240 maxLength16=length16;
michael@0 241 }
michael@0 242 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
michael@0 243 int32_t length8=getUTF8Length(s16, length16);
michael@0 244 utf8Length+=length8;
michael@0 245 if(length8>maxLength8) {
michael@0 246 maxLength8=length8;
michael@0 247 }
michael@0 248 }
michael@0 249 }
michael@0 250 if(!someRelevant) {
michael@0 251 maxLength16=maxLength8=0;
michael@0 252 return;
michael@0 253 }
michael@0 254
michael@0 255 // Freeze after checking for the need to use strings at all because freezing
michael@0 256 // a set takes some time and memory which are wasted if there are no relevant strings.
michael@0 257 if(all) {
michael@0 258 spanSet.freeze();
michael@0 259 }
michael@0 260
michael@0 261 uint8_t *spanBackLengths;
michael@0 262 uint8_t *spanUTF8Lengths;
michael@0 263 uint8_t *spanBackUTF8Lengths;
michael@0 264
michael@0 265 // Allocate a block of meta data.
michael@0 266 int32_t allocSize;
michael@0 267 if(all) {
michael@0 268 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
michael@0 269 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
michael@0 270 } else {
michael@0 271 allocSize=stringsLength; // One set of span lengths.
michael@0 272 if(which&UTF8) {
michael@0 273 // UTF-8 lengths and UTF-8 strings.
michael@0 274 allocSize+=stringsLength*4+utf8Length;
michael@0 275 }
michael@0 276 }
michael@0 277 if(allocSize<=(int32_t)sizeof(staticLengths)) {
michael@0 278 utf8Lengths=staticLengths;
michael@0 279 } else {
michael@0 280 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
michael@0 281 if(utf8Lengths==NULL) {
michael@0 282 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
michael@0 283 return; // Out of memory.
michael@0 284 }
michael@0 285 }
michael@0 286
michael@0 287 if(all) {
michael@0 288 // Store span lengths for all span() variants.
michael@0 289 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
michael@0 290 spanBackLengths=spanLengths+stringsLength;
michael@0 291 spanUTF8Lengths=spanBackLengths+stringsLength;
michael@0 292 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
michael@0 293 utf8=spanBackUTF8Lengths+stringsLength;
michael@0 294 } else {
michael@0 295 // Store span lengths for only one span() variant.
michael@0 296 if(which&UTF8) {
michael@0 297 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
michael@0 298 utf8=spanLengths+stringsLength;
michael@0 299 } else {
michael@0 300 spanLengths=(uint8_t *)utf8Lengths;
michael@0 301 }
michael@0 302 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
michael@0 303 }
michael@0 304
michael@0 305 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
michael@0 306 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
michael@0 307
michael@0 308 for(i=0; i<stringsLength; ++i) {
michael@0 309 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 310 const UChar *s16=string.getBuffer();
michael@0 311 int32_t length16=string.length();
michael@0 312 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
michael@0 313 if(spanLength<length16) { // Relevant string.
michael@0 314 if(which&UTF16) {
michael@0 315 if(which&CONTAINED) {
michael@0 316 if(which&FWD) {
michael@0 317 spanLengths[i]=makeSpanLengthByte(spanLength);
michael@0 318 }
michael@0 319 if(which&BACK) {
michael@0 320 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
michael@0 321 spanBackLengths[i]=makeSpanLengthByte(spanLength);
michael@0 322 }
michael@0 323 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
michael@0 324 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
michael@0 325 }
michael@0 326 }
michael@0 327 if(which&UTF8) {
michael@0 328 uint8_t *s8=utf8+utf8Count;
michael@0 329 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
michael@0 330 utf8Count+=utf8Lengths[i]=length8;
michael@0 331 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
michael@0 332 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
michael@0 333 } else { // Relevant for UTF-8.
michael@0 334 if(which&CONTAINED) {
michael@0 335 if(which&FWD) {
michael@0 336 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
michael@0 337 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
michael@0 338 }
michael@0 339 if(which&BACK) {
michael@0 340 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
michael@0 341 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
michael@0 342 }
michael@0 343 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
michael@0 344 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
michael@0 345 }
michael@0 346 }
michael@0 347 }
michael@0 348 if(which&NOT_CONTAINED) {
michael@0 349 // Add string start and end code points to the spanNotSet so that
michael@0 350 // a span(while not contained) stops before any string.
michael@0 351 UChar32 c;
michael@0 352 if(which&FWD) {
michael@0 353 int32_t len=0;
michael@0 354 U16_NEXT(s16, len, length16, c);
michael@0 355 addToSpanNotSet(c);
michael@0 356 }
michael@0 357 if(which&BACK) {
michael@0 358 int32_t len=length16;
michael@0 359 U16_PREV(s16, 0, len, c);
michael@0 360 addToSpanNotSet(c);
michael@0 361 }
michael@0 362 }
michael@0 363 } else { // Irrelevant string.
michael@0 364 if(which&UTF8) {
michael@0 365 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
michael@0 366 uint8_t *s8=utf8+utf8Count;
michael@0 367 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
michael@0 368 utf8Count+=utf8Lengths[i]=length8;
michael@0 369 } else {
michael@0 370 utf8Lengths[i]=0;
michael@0 371 }
michael@0 372 }
michael@0 373 if(all) {
michael@0 374 spanLengths[i]=spanBackLengths[i]=
michael@0 375 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
michael@0 376 (uint8_t)ALL_CP_CONTAINED;
michael@0 377 } else {
michael@0 378 // All spanXYZLengths pointers contain the same address.
michael@0 379 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
michael@0 380 }
michael@0 381 }
michael@0 382 }
michael@0 383
michael@0 384 // Finish.
michael@0 385 if(all) {
michael@0 386 pSpanNotSet->freeze();
michael@0 387 }
michael@0 388 }
michael@0 389
michael@0 390 // Copy constructor. Assumes which==ALL for a frozen set.
michael@0 391 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
michael@0 392 const UVector &newParentSetStrings)
michael@0 393 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
michael@0 394 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
michael@0 395 utf8Length(otherStringSpan.utf8Length),
michael@0 396 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
michael@0 397 all(TRUE) {
michael@0 398 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
michael@0 399 pSpanNotSet=&spanSet;
michael@0 400 } else {
michael@0 401 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
michael@0 402 }
michael@0 403
michael@0 404 // Allocate a block of meta data.
michael@0 405 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
michael@0 406 int32_t stringsLength=strings.size();
michael@0 407 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
michael@0 408 if(allocSize<=(int32_t)sizeof(staticLengths)) {
michael@0 409 utf8Lengths=staticLengths;
michael@0 410 } else {
michael@0 411 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
michael@0 412 if(utf8Lengths==NULL) {
michael@0 413 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
michael@0 414 return; // Out of memory.
michael@0 415 }
michael@0 416 }
michael@0 417
michael@0 418 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
michael@0 419 utf8=spanLengths+stringsLength*4;
michael@0 420 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
michael@0 421 }
michael@0 422
michael@0 423 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
michael@0 424 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
michael@0 425 delete pSpanNotSet;
michael@0 426 }
michael@0 427 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
michael@0 428 uprv_free(utf8Lengths);
michael@0 429 }
michael@0 430 }
michael@0 431
michael@0 432 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
michael@0 433 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
michael@0 434 if(spanSet.contains(c)) {
michael@0 435 return; // Nothing to do.
michael@0 436 }
michael@0 437 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
michael@0 438 if(newSet==NULL) {
michael@0 439 return; // Out of memory.
michael@0 440 } else {
michael@0 441 pSpanNotSet=newSet;
michael@0 442 }
michael@0 443 }
michael@0 444 pSpanNotSet->add(c);
michael@0 445 }
michael@0 446
michael@0 447 // Compare strings without any argument checks. Requires length>0.
michael@0 448 static inline UBool
michael@0 449 matches16(const UChar *s, const UChar *t, int32_t length) {
michael@0 450 do {
michael@0 451 if(*s++!=*t++) {
michael@0 452 return FALSE;
michael@0 453 }
michael@0 454 } while(--length>0);
michael@0 455 return TRUE;
michael@0 456 }
michael@0 457
michael@0 458 static inline UBool
michael@0 459 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
michael@0 460 do {
michael@0 461 if(*s++!=*t++) {
michael@0 462 return FALSE;
michael@0 463 }
michael@0 464 } while(--length>0);
michael@0 465 return TRUE;
michael@0 466 }
michael@0 467
michael@0 468 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
michael@0 469 // at code point boundaries.
michael@0 470 // That is, each edge of a match must not be in the middle of a surrogate pair.
michael@0 471 static inline UBool
michael@0 472 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
michael@0 473 s+=start;
michael@0 474 limit-=start;
michael@0 475 return matches16(s, t, length) &&
michael@0 476 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
michael@0 477 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
michael@0 478 }
michael@0 479
michael@0 480 // Does the set contain the next code point?
michael@0 481 // If so, return its length; otherwise return its negative length.
michael@0 482 static inline int32_t
michael@0 483 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
michael@0 484 UChar c=*s, c2;
michael@0 485 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
michael@0 486 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
michael@0 487 }
michael@0 488 return set.contains(c) ? 1 : -1;
michael@0 489 }
michael@0 490
michael@0 491 static inline int32_t
michael@0 492 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
michael@0 493 UChar c=s[length-1], c2;
michael@0 494 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
michael@0 495 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
michael@0 496 }
michael@0 497 return set.contains(c) ? 1 : -1;
michael@0 498 }
michael@0 499
michael@0 500 static inline int32_t
michael@0 501 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
michael@0 502 UChar32 c=*s;
michael@0 503 if((int8_t)c>=0) {
michael@0 504 return set.contains(c) ? 1 : -1;
michael@0 505 }
michael@0 506 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
michael@0 507 int32_t i=0;
michael@0 508 U8_NEXT_OR_FFFD(s, i, length, c);
michael@0 509 return set.contains(c) ? i : -i;
michael@0 510 }
michael@0 511
michael@0 512 static inline int32_t
michael@0 513 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
michael@0 514 UChar32 c=s[length-1];
michael@0 515 if((int8_t)c>=0) {
michael@0 516 return set.contains(c) ? 1 : -1;
michael@0 517 }
michael@0 518 int32_t i=length-1;
michael@0 519 c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
michael@0 520 length-=i;
michael@0 521 return set.contains(c) ? length : -length;
michael@0 522 }
michael@0 523
michael@0 524 /*
michael@0 525 * Note: In span() when spanLength==0 (after a string match, or at the beginning
michael@0 526 * after an empty code point span) and in spanNot() and spanNotUTF8(),
michael@0 527 * string matching could use a binary search
michael@0 528 * because all string matches are done from the same start index.
michael@0 529 *
michael@0 530 * For UTF-8, this would require a comparison function that returns UTF-16 order.
michael@0 531 *
michael@0 532 * This optimization should not be necessary for normal UnicodeSets because
michael@0 533 * most sets have no strings, and most sets with strings have
michael@0 534 * very few very short strings.
michael@0 535 * For cases with many strings, it might be better to use a different API
michael@0 536 * and implementation with a DFA (state machine).
michael@0 537 */
michael@0 538
michael@0 539 /*
michael@0 540 * Algorithm for span(USET_SPAN_CONTAINED)
michael@0 541 *
michael@0 542 * Theoretical algorithm:
michael@0 543 * - Iterate through the string, and at each code point boundary:
michael@0 544 * + If the code point there is in the set, then remember to continue after it.
michael@0 545 * + If a set string matches at the current position, then remember to continue after it.
michael@0 546 * + Either recursively span for each code point or string match,
michael@0 547 * or recursively span for all but the shortest one and
michael@0 548 * iteratively continue the span with the shortest local match.
michael@0 549 * + Remember the longest recursive span (the farthest end point).
michael@0 550 * + If there is no match at the current position, neither for the code point there
michael@0 551 * nor for any set string, then stop and return the longest recursive span length.
michael@0 552 *
michael@0 553 * Optimized implementation:
michael@0 554 *
michael@0 555 * (We assume that most sets will have very few very short strings.
michael@0 556 * A span using a string-less set is extremely fast.)
michael@0 557 *
michael@0 558 * Create and cache a spanSet which contains all of the single code points
michael@0 559 * of the original set but none of its strings.
michael@0 560 *
michael@0 561 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
michael@0 562 * - Loop:
michael@0 563 * + Try to match each set string at the end of the spanLength.
michael@0 564 * ~ Set strings that start with set-contained code points must be matched
michael@0 565 * with a partial overlap because the recursive algorithm would have tried
michael@0 566 * to match them at every position.
michael@0 567 * ~ Set strings that entirely consist of set-contained code points
michael@0 568 * are irrelevant for span(USET_SPAN_CONTAINED) because the
michael@0 569 * recursive algorithm would continue after them anyway
michael@0 570 * and find the longest recursive match from their end.
michael@0 571 * ~ Rather than recursing, note each end point of a set string match.
michael@0 572 * + If no set string matched after spanSet.span(), then return
michael@0 573 * with where the spanSet.span() ended.
michael@0 574 * + If at least one set string matched after spanSet.span(), then
michael@0 575 * pop the shortest string match end point and continue
michael@0 576 * the loop, trying to match all set strings from there.
michael@0 577 * + If at least one more set string matched after a previous string match,
michael@0 578 * then test if the code point after the previous string match is also
michael@0 579 * contained in the set.
michael@0 580 * Continue the loop with the shortest end point of either this code point
michael@0 581 * or a matching set string.
michael@0 582 * + If no more set string matched after a previous string match,
michael@0 583 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
michael@0 584 * Stop if spanLength==0, otherwise continue the loop.
michael@0 585 *
michael@0 586 * By noting each end point of a set string match,
michael@0 587 * the function visits each string position at most once and finishes
michael@0 588 * in linear time.
michael@0 589 *
michael@0 590 * The recursive algorithm may visit the same string position many times
michael@0 591 * if multiple paths lead to it and finishes in exponential time.
michael@0 592 */
michael@0 593
michael@0 594 /*
michael@0 595 * Algorithm for span(USET_SPAN_SIMPLE)
michael@0 596 *
michael@0 597 * Theoretical algorithm:
michael@0 598 * - Iterate through the string, and at each code point boundary:
michael@0 599 * + If the code point there is in the set, then remember to continue after it.
michael@0 600 * + If a set string matches at the current position, then remember to continue after it.
michael@0 601 * + Continue from the farthest match position and ignore all others.
michael@0 602 * + If there is no match at the current position,
michael@0 603 * then stop and return the current position.
michael@0 604 *
michael@0 605 * Optimized implementation:
michael@0 606 *
michael@0 607 * (Same assumption and spanSet as above.)
michael@0 608 *
michael@0 609 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
michael@0 610 * - Loop:
michael@0 611 * + Try to match each set string at the end of the spanLength.
michael@0 612 * ~ Set strings that start with set-contained code points must be matched
michael@0 613 * with a partial overlap because the standard algorithm would have tried
michael@0 614 * to match them earlier.
michael@0 615 * ~ Set strings that entirely consist of set-contained code points
michael@0 616 * must be matched with a full overlap because the longest-match algorithm
michael@0 617 * would hide set string matches that end earlier.
michael@0 618 * Such set strings need not be matched earlier inside the code point span
michael@0 619 * because the standard algorithm would then have continued after
michael@0 620 * the set string match anyway.
michael@0 621 * ~ Remember the longest set string match (farthest end point) from the earliest
michael@0 622 * starting point.
michael@0 623 * + If no set string matched after spanSet.span(), then return
michael@0 624 * with where the spanSet.span() ended.
michael@0 625 * + If at least one set string matched, then continue the loop after the
michael@0 626 * longest match from the earliest position.
michael@0 627 * + If no more set string matched after a previous string match,
michael@0 628 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
michael@0 629 * Stop if spanLength==0, otherwise continue the loop.
michael@0 630 */
michael@0 631
michael@0 632 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 633 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
michael@0 634 return spanNot(s, length);
michael@0 635 }
michael@0 636 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
michael@0 637 if(spanLength==length) {
michael@0 638 return length;
michael@0 639 }
michael@0 640
michael@0 641 // Consider strings; they may overlap with the span.
michael@0 642 OffsetList offsets;
michael@0 643 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 644 // Use offset list to try all possibilities.
michael@0 645 offsets.setMaxLength(maxLength16);
michael@0 646 }
michael@0 647 int32_t pos=spanLength, rest=length-pos;
michael@0 648 int32_t i, stringsLength=strings.size();
michael@0 649 for(;;) {
michael@0 650 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 651 for(i=0; i<stringsLength; ++i) {
michael@0 652 int32_t overlap=spanLengths[i];
michael@0 653 if(overlap==ALL_CP_CONTAINED) {
michael@0 654 continue; // Irrelevant string.
michael@0 655 }
michael@0 656 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 657 const UChar *s16=string.getBuffer();
michael@0 658 int32_t length16=string.length();
michael@0 659
michael@0 660 // Try to match this string at pos-overlap..pos.
michael@0 661 if(overlap>=LONG_SPAN) {
michael@0 662 overlap=length16;
michael@0 663 // While contained: No point matching fully inside the code point span.
michael@0 664 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
michael@0 665 }
michael@0 666 if(overlap>spanLength) {
michael@0 667 overlap=spanLength;
michael@0 668 }
michael@0 669 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
michael@0 670 for(;;) {
michael@0 671 if(inc>rest) {
michael@0 672 break;
michael@0 673 }
michael@0 674 // Try to match if the increment is not listed already.
michael@0 675 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
michael@0 676 if(inc==rest) {
michael@0 677 return length; // Reached the end of the string.
michael@0 678 }
michael@0 679 offsets.addOffset(inc);
michael@0 680 }
michael@0 681 if(overlap==0) {
michael@0 682 break;
michael@0 683 }
michael@0 684 --overlap;
michael@0 685 ++inc;
michael@0 686 }
michael@0 687 }
michael@0 688 } else /* USET_SPAN_SIMPLE */ {
michael@0 689 int32_t maxInc=0, maxOverlap=0;
michael@0 690 for(i=0; i<stringsLength; ++i) {
michael@0 691 int32_t overlap=spanLengths[i];
michael@0 692 // For longest match, we do need to try to match even an all-contained string
michael@0 693 // to find the match from the earliest start.
michael@0 694
michael@0 695 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 696 const UChar *s16=string.getBuffer();
michael@0 697 int32_t length16=string.length();
michael@0 698
michael@0 699 // Try to match this string at pos-overlap..pos.
michael@0 700 if(overlap>=LONG_SPAN) {
michael@0 701 overlap=length16;
michael@0 702 // Longest match: Need to match fully inside the code point span
michael@0 703 // to find the match from the earliest start.
michael@0 704 }
michael@0 705 if(overlap>spanLength) {
michael@0 706 overlap=spanLength;
michael@0 707 }
michael@0 708 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
michael@0 709 for(;;) {
michael@0 710 if(inc>rest || overlap<maxOverlap) {
michael@0 711 break;
michael@0 712 }
michael@0 713 // Try to match if the string is longer or starts earlier.
michael@0 714 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
michael@0 715 matches16CPB(s, pos-overlap, length, s16, length16)
michael@0 716 ) {
michael@0 717 maxInc=inc; // Longest match from earliest start.
michael@0 718 maxOverlap=overlap;
michael@0 719 break;
michael@0 720 }
michael@0 721 --overlap;
michael@0 722 ++inc;
michael@0 723 }
michael@0 724 }
michael@0 725
michael@0 726 if(maxInc!=0 || maxOverlap!=0) {
michael@0 727 // Longest-match algorithm, and there was a string match.
michael@0 728 // Simply continue after it.
michael@0 729 pos+=maxInc;
michael@0 730 rest-=maxInc;
michael@0 731 if(rest==0) {
michael@0 732 return length; // Reached the end of the string.
michael@0 733 }
michael@0 734 spanLength=0; // Match strings from after a string match.
michael@0 735 continue;
michael@0 736 }
michael@0 737 }
michael@0 738 // Finished trying to match all strings at pos.
michael@0 739
michael@0 740 if(spanLength!=0 || pos==0) {
michael@0 741 // The position is after an unlimited code point span (spanLength!=0),
michael@0 742 // not after a string match.
michael@0 743 // The only position where spanLength==0 after a span is pos==0.
michael@0 744 // Otherwise, an unlimited code point span is only tried again when no
michael@0 745 // strings match, and if such a non-initial span fails we stop.
michael@0 746 if(offsets.isEmpty()) {
michael@0 747 return pos; // No strings matched after a span.
michael@0 748 }
michael@0 749 // Match strings from after the next string match.
michael@0 750 } else {
michael@0 751 // The position is after a string match (or a single code point).
michael@0 752 if(offsets.isEmpty()) {
michael@0 753 // No more strings matched after a previous string match.
michael@0 754 // Try another code point span from after the last string match.
michael@0 755 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
michael@0 756 if( spanLength==rest || // Reached the end of the string, or
michael@0 757 spanLength==0 // neither strings nor span progressed.
michael@0 758 ) {
michael@0 759 return pos+spanLength;
michael@0 760 }
michael@0 761 pos+=spanLength;
michael@0 762 rest-=spanLength;
michael@0 763 continue; // spanLength>0: Match strings from after a span.
michael@0 764 } else {
michael@0 765 // Try to match only one code point from after a string match if some
michael@0 766 // string matched beyond it, so that we try all possible positions
michael@0 767 // and don't overshoot.
michael@0 768 spanLength=spanOne(spanSet, s+pos, rest);
michael@0 769 if(spanLength>0) {
michael@0 770 if(spanLength==rest) {
michael@0 771 return length; // Reached the end of the string.
michael@0 772 }
michael@0 773 // Match strings after this code point.
michael@0 774 // There cannot be any increments below it because UnicodeSet strings
michael@0 775 // contain multiple code points.
michael@0 776 pos+=spanLength;
michael@0 777 rest-=spanLength;
michael@0 778 offsets.shift(spanLength);
michael@0 779 spanLength=0;
michael@0 780 continue; // Match strings from after a single code point.
michael@0 781 }
michael@0 782 // Match strings from after the next string match.
michael@0 783 }
michael@0 784 }
michael@0 785 int32_t minOffset=offsets.popMinimum();
michael@0 786 pos+=minOffset;
michael@0 787 rest-=minOffset;
michael@0 788 spanLength=0; // Match strings from after a string match.
michael@0 789 }
michael@0 790 }
michael@0 791
michael@0 792 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 793 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
michael@0 794 return spanNotBack(s, length);
michael@0 795 }
michael@0 796 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
michael@0 797 if(pos==0) {
michael@0 798 return 0;
michael@0 799 }
michael@0 800 int32_t spanLength=length-pos;
michael@0 801
michael@0 802 // Consider strings; they may overlap with the span.
michael@0 803 OffsetList offsets;
michael@0 804 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 805 // Use offset list to try all possibilities.
michael@0 806 offsets.setMaxLength(maxLength16);
michael@0 807 }
michael@0 808 int32_t i, stringsLength=strings.size();
michael@0 809 uint8_t *spanBackLengths=spanLengths;
michael@0 810 if(all) {
michael@0 811 spanBackLengths+=stringsLength;
michael@0 812 }
michael@0 813 for(;;) {
michael@0 814 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 815 for(i=0; i<stringsLength; ++i) {
michael@0 816 int32_t overlap=spanBackLengths[i];
michael@0 817 if(overlap==ALL_CP_CONTAINED) {
michael@0 818 continue; // Irrelevant string.
michael@0 819 }
michael@0 820 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 821 const UChar *s16=string.getBuffer();
michael@0 822 int32_t length16=string.length();
michael@0 823
michael@0 824 // Try to match this string at pos-(length16-overlap)..pos-length16.
michael@0 825 if(overlap>=LONG_SPAN) {
michael@0 826 overlap=length16;
michael@0 827 // While contained: No point matching fully inside the code point span.
michael@0 828 int32_t len1=0;
michael@0 829 U16_FWD_1(s16, len1, overlap);
michael@0 830 overlap-=len1; // Length of the string minus the first code point.
michael@0 831 }
michael@0 832 if(overlap>spanLength) {
michael@0 833 overlap=spanLength;
michael@0 834 }
michael@0 835 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
michael@0 836 for(;;) {
michael@0 837 if(dec>pos) {
michael@0 838 break;
michael@0 839 }
michael@0 840 // Try to match if the decrement is not listed already.
michael@0 841 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
michael@0 842 if(dec==pos) {
michael@0 843 return 0; // Reached the start of the string.
michael@0 844 }
michael@0 845 offsets.addOffset(dec);
michael@0 846 }
michael@0 847 if(overlap==0) {
michael@0 848 break;
michael@0 849 }
michael@0 850 --overlap;
michael@0 851 ++dec;
michael@0 852 }
michael@0 853 }
michael@0 854 } else /* USET_SPAN_SIMPLE */ {
michael@0 855 int32_t maxDec=0, maxOverlap=0;
michael@0 856 for(i=0; i<stringsLength; ++i) {
michael@0 857 int32_t overlap=spanBackLengths[i];
michael@0 858 // For longest match, we do need to try to match even an all-contained string
michael@0 859 // to find the match from the latest end.
michael@0 860
michael@0 861 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 862 const UChar *s16=string.getBuffer();
michael@0 863 int32_t length16=string.length();
michael@0 864
michael@0 865 // Try to match this string at pos-(length16-overlap)..pos-length16.
michael@0 866 if(overlap>=LONG_SPAN) {
michael@0 867 overlap=length16;
michael@0 868 // Longest match: Need to match fully inside the code point span
michael@0 869 // to find the match from the latest end.
michael@0 870 }
michael@0 871 if(overlap>spanLength) {
michael@0 872 overlap=spanLength;
michael@0 873 }
michael@0 874 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
michael@0 875 for(;;) {
michael@0 876 if(dec>pos || overlap<maxOverlap) {
michael@0 877 break;
michael@0 878 }
michael@0 879 // Try to match if the string is longer or ends later.
michael@0 880 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
michael@0 881 matches16CPB(s, pos-dec, length, s16, length16)
michael@0 882 ) {
michael@0 883 maxDec=dec; // Longest match from latest end.
michael@0 884 maxOverlap=overlap;
michael@0 885 break;
michael@0 886 }
michael@0 887 --overlap;
michael@0 888 ++dec;
michael@0 889 }
michael@0 890 }
michael@0 891
michael@0 892 if(maxDec!=0 || maxOverlap!=0) {
michael@0 893 // Longest-match algorithm, and there was a string match.
michael@0 894 // Simply continue before it.
michael@0 895 pos-=maxDec;
michael@0 896 if(pos==0) {
michael@0 897 return 0; // Reached the start of the string.
michael@0 898 }
michael@0 899 spanLength=0; // Match strings from before a string match.
michael@0 900 continue;
michael@0 901 }
michael@0 902 }
michael@0 903 // Finished trying to match all strings at pos.
michael@0 904
michael@0 905 if(spanLength!=0 || pos==length) {
michael@0 906 // The position is before an unlimited code point span (spanLength!=0),
michael@0 907 // not before a string match.
michael@0 908 // The only position where spanLength==0 before a span is pos==length.
michael@0 909 // Otherwise, an unlimited code point span is only tried again when no
michael@0 910 // strings match, and if such a non-initial span fails we stop.
michael@0 911 if(offsets.isEmpty()) {
michael@0 912 return pos; // No strings matched before a span.
michael@0 913 }
michael@0 914 // Match strings from before the next string match.
michael@0 915 } else {
michael@0 916 // The position is before a string match (or a single code point).
michael@0 917 if(offsets.isEmpty()) {
michael@0 918 // No more strings matched before a previous string match.
michael@0 919 // Try another code point span from before the last string match.
michael@0 920 int32_t oldPos=pos;
michael@0 921 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
michael@0 922 spanLength=oldPos-pos;
michael@0 923 if( pos==0 || // Reached the start of the string, or
michael@0 924 spanLength==0 // neither strings nor span progressed.
michael@0 925 ) {
michael@0 926 return pos;
michael@0 927 }
michael@0 928 continue; // spanLength>0: Match strings from before a span.
michael@0 929 } else {
michael@0 930 // Try to match only one code point from before a string match if some
michael@0 931 // string matched beyond it, so that we try all possible positions
michael@0 932 // and don't overshoot.
michael@0 933 spanLength=spanOneBack(spanSet, s, pos);
michael@0 934 if(spanLength>0) {
michael@0 935 if(spanLength==pos) {
michael@0 936 return 0; // Reached the start of the string.
michael@0 937 }
michael@0 938 // Match strings before this code point.
michael@0 939 // There cannot be any decrements below it because UnicodeSet strings
michael@0 940 // contain multiple code points.
michael@0 941 pos-=spanLength;
michael@0 942 offsets.shift(spanLength);
michael@0 943 spanLength=0;
michael@0 944 continue; // Match strings from before a single code point.
michael@0 945 }
michael@0 946 // Match strings from before the next string match.
michael@0 947 }
michael@0 948 }
michael@0 949 pos-=offsets.popMinimum();
michael@0 950 spanLength=0; // Match strings from before a string match.
michael@0 951 }
michael@0 952 }
michael@0 953
michael@0 954 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 955 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
michael@0 956 return spanNotUTF8(s, length);
michael@0 957 }
michael@0 958 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
michael@0 959 if(spanLength==length) {
michael@0 960 return length;
michael@0 961 }
michael@0 962
michael@0 963 // Consider strings; they may overlap with the span.
michael@0 964 OffsetList offsets;
michael@0 965 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 966 // Use offset list to try all possibilities.
michael@0 967 offsets.setMaxLength(maxLength8);
michael@0 968 }
michael@0 969 int32_t pos=spanLength, rest=length-pos;
michael@0 970 int32_t i, stringsLength=strings.size();
michael@0 971 uint8_t *spanUTF8Lengths=spanLengths;
michael@0 972 if(all) {
michael@0 973 spanUTF8Lengths+=2*stringsLength;
michael@0 974 }
michael@0 975 for(;;) {
michael@0 976 const uint8_t *s8=utf8;
michael@0 977 int32_t length8;
michael@0 978 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 979 for(i=0; i<stringsLength; ++i) {
michael@0 980 length8=utf8Lengths[i];
michael@0 981 if(length8==0) {
michael@0 982 continue; // String not representable in UTF-8.
michael@0 983 }
michael@0 984 int32_t overlap=spanUTF8Lengths[i];
michael@0 985 if(overlap==ALL_CP_CONTAINED) {
michael@0 986 s8+=length8;
michael@0 987 continue; // Irrelevant string.
michael@0 988 }
michael@0 989
michael@0 990 // Try to match this string at pos-overlap..pos.
michael@0 991 if(overlap>=LONG_SPAN) {
michael@0 992 overlap=length8;
michael@0 993 // While contained: No point matching fully inside the code point span.
michael@0 994 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
michael@0 995 }
michael@0 996 if(overlap>spanLength) {
michael@0 997 overlap=spanLength;
michael@0 998 }
michael@0 999 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
michael@0 1000 for(;;) {
michael@0 1001 if(inc>rest) {
michael@0 1002 break;
michael@0 1003 }
michael@0 1004 // Try to match if the increment is not listed already.
michael@0 1005 // Match at code point boundaries. (The UTF-8 strings were converted
michael@0 1006 // from UTF-16 and are guaranteed to be well-formed.)
michael@0 1007 if( !U8_IS_TRAIL(s[pos-overlap]) &&
michael@0 1008 !offsets.containsOffset(inc) &&
michael@0 1009 matches8(s+pos-overlap, s8, length8)
michael@0 1010
michael@0 1011 ) {
michael@0 1012 if(inc==rest) {
michael@0 1013 return length; // Reached the end of the string.
michael@0 1014 }
michael@0 1015 offsets.addOffset(inc);
michael@0 1016 }
michael@0 1017 if(overlap==0) {
michael@0 1018 break;
michael@0 1019 }
michael@0 1020 --overlap;
michael@0 1021 ++inc;
michael@0 1022 }
michael@0 1023 s8+=length8;
michael@0 1024 }
michael@0 1025 } else /* USET_SPAN_SIMPLE */ {
michael@0 1026 int32_t maxInc=0, maxOverlap=0;
michael@0 1027 for(i=0; i<stringsLength; ++i) {
michael@0 1028 length8=utf8Lengths[i];
michael@0 1029 if(length8==0) {
michael@0 1030 continue; // String not representable in UTF-8.
michael@0 1031 }
michael@0 1032 int32_t overlap=spanUTF8Lengths[i];
michael@0 1033 // For longest match, we do need to try to match even an all-contained string
michael@0 1034 // to find the match from the earliest start.
michael@0 1035
michael@0 1036 // Try to match this string at pos-overlap..pos.
michael@0 1037 if(overlap>=LONG_SPAN) {
michael@0 1038 overlap=length8;
michael@0 1039 // Longest match: Need to match fully inside the code point span
michael@0 1040 // to find the match from the earliest start.
michael@0 1041 }
michael@0 1042 if(overlap>spanLength) {
michael@0 1043 overlap=spanLength;
michael@0 1044 }
michael@0 1045 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
michael@0 1046 for(;;) {
michael@0 1047 if(inc>rest || overlap<maxOverlap) {
michael@0 1048 break;
michael@0 1049 }
michael@0 1050 // Try to match if the string is longer or starts earlier.
michael@0 1051 // Match at code point boundaries. (The UTF-8 strings were converted
michael@0 1052 // from UTF-16 and are guaranteed to be well-formed.)
michael@0 1053 if( !U8_IS_TRAIL(s[pos-overlap]) &&
michael@0 1054 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
michael@0 1055 matches8(s+pos-overlap, s8, length8)
michael@0 1056
michael@0 1057 ) {
michael@0 1058 maxInc=inc; // Longest match from earliest start.
michael@0 1059 maxOverlap=overlap;
michael@0 1060 break;
michael@0 1061 }
michael@0 1062 --overlap;
michael@0 1063 ++inc;
michael@0 1064 }
michael@0 1065 s8+=length8;
michael@0 1066 }
michael@0 1067
michael@0 1068 if(maxInc!=0 || maxOverlap!=0) {
michael@0 1069 // Longest-match algorithm, and there was a string match.
michael@0 1070 // Simply continue after it.
michael@0 1071 pos+=maxInc;
michael@0 1072 rest-=maxInc;
michael@0 1073 if(rest==0) {
michael@0 1074 return length; // Reached the end of the string.
michael@0 1075 }
michael@0 1076 spanLength=0; // Match strings from after a string match.
michael@0 1077 continue;
michael@0 1078 }
michael@0 1079 }
michael@0 1080 // Finished trying to match all strings at pos.
michael@0 1081
michael@0 1082 if(spanLength!=0 || pos==0) {
michael@0 1083 // The position is after an unlimited code point span (spanLength!=0),
michael@0 1084 // not after a string match.
michael@0 1085 // The only position where spanLength==0 after a span is pos==0.
michael@0 1086 // Otherwise, an unlimited code point span is only tried again when no
michael@0 1087 // strings match, and if such a non-initial span fails we stop.
michael@0 1088 if(offsets.isEmpty()) {
michael@0 1089 return pos; // No strings matched after a span.
michael@0 1090 }
michael@0 1091 // Match strings from after the next string match.
michael@0 1092 } else {
michael@0 1093 // The position is after a string match (or a single code point).
michael@0 1094 if(offsets.isEmpty()) {
michael@0 1095 // No more strings matched after a previous string match.
michael@0 1096 // Try another code point span from after the last string match.
michael@0 1097 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
michael@0 1098 if( spanLength==rest || // Reached the end of the string, or
michael@0 1099 spanLength==0 // neither strings nor span progressed.
michael@0 1100 ) {
michael@0 1101 return pos+spanLength;
michael@0 1102 }
michael@0 1103 pos+=spanLength;
michael@0 1104 rest-=spanLength;
michael@0 1105 continue; // spanLength>0: Match strings from after a span.
michael@0 1106 } else {
michael@0 1107 // Try to match only one code point from after a string match if some
michael@0 1108 // string matched beyond it, so that we try all possible positions
michael@0 1109 // and don't overshoot.
michael@0 1110 spanLength=spanOneUTF8(spanSet, s+pos, rest);
michael@0 1111 if(spanLength>0) {
michael@0 1112 if(spanLength==rest) {
michael@0 1113 return length; // Reached the end of the string.
michael@0 1114 }
michael@0 1115 // Match strings after this code point.
michael@0 1116 // There cannot be any increments below it because UnicodeSet strings
michael@0 1117 // contain multiple code points.
michael@0 1118 pos+=spanLength;
michael@0 1119 rest-=spanLength;
michael@0 1120 offsets.shift(spanLength);
michael@0 1121 spanLength=0;
michael@0 1122 continue; // Match strings from after a single code point.
michael@0 1123 }
michael@0 1124 // Match strings from after the next string match.
michael@0 1125 }
michael@0 1126 }
michael@0 1127 int32_t minOffset=offsets.popMinimum();
michael@0 1128 pos+=minOffset;
michael@0 1129 rest-=minOffset;
michael@0 1130 spanLength=0; // Match strings from after a string match.
michael@0 1131 }
michael@0 1132 }
michael@0 1133
michael@0 1134 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
michael@0 1135 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
michael@0 1136 return spanNotBackUTF8(s, length);
michael@0 1137 }
michael@0 1138 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
michael@0 1139 if(pos==0) {
michael@0 1140 return 0;
michael@0 1141 }
michael@0 1142 int32_t spanLength=length-pos;
michael@0 1143
michael@0 1144 // Consider strings; they may overlap with the span.
michael@0 1145 OffsetList offsets;
michael@0 1146 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 1147 // Use offset list to try all possibilities.
michael@0 1148 offsets.setMaxLength(maxLength8);
michael@0 1149 }
michael@0 1150 int32_t i, stringsLength=strings.size();
michael@0 1151 uint8_t *spanBackUTF8Lengths=spanLengths;
michael@0 1152 if(all) {
michael@0 1153 spanBackUTF8Lengths+=3*stringsLength;
michael@0 1154 }
michael@0 1155 for(;;) {
michael@0 1156 const uint8_t *s8=utf8;
michael@0 1157 int32_t length8;
michael@0 1158 if(spanCondition==USET_SPAN_CONTAINED) {
michael@0 1159 for(i=0; i<stringsLength; ++i) {
michael@0 1160 length8=utf8Lengths[i];
michael@0 1161 if(length8==0) {
michael@0 1162 continue; // String not representable in UTF-8.
michael@0 1163 }
michael@0 1164 int32_t overlap=spanBackUTF8Lengths[i];
michael@0 1165 if(overlap==ALL_CP_CONTAINED) {
michael@0 1166 s8+=length8;
michael@0 1167 continue; // Irrelevant string.
michael@0 1168 }
michael@0 1169
michael@0 1170 // Try to match this string at pos-(length8-overlap)..pos-length8.
michael@0 1171 if(overlap>=LONG_SPAN) {
michael@0 1172 overlap=length8;
michael@0 1173 // While contained: No point matching fully inside the code point span.
michael@0 1174 int32_t len1=0;
michael@0 1175 U8_FWD_1(s8, len1, overlap);
michael@0 1176 overlap-=len1; // Length of the string minus the first code point.
michael@0 1177 }
michael@0 1178 if(overlap>spanLength) {
michael@0 1179 overlap=spanLength;
michael@0 1180 }
michael@0 1181 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
michael@0 1182 for(;;) {
michael@0 1183 if(dec>pos) {
michael@0 1184 break;
michael@0 1185 }
michael@0 1186 // Try to match if the decrement is not listed already.
michael@0 1187 // Match at code point boundaries. (The UTF-8 strings were converted
michael@0 1188 // from UTF-16 and are guaranteed to be well-formed.)
michael@0 1189 if( !U8_IS_TRAIL(s[pos-dec]) &&
michael@0 1190 !offsets.containsOffset(dec) &&
michael@0 1191 matches8(s+pos-dec, s8, length8)
michael@0 1192 ) {
michael@0 1193 if(dec==pos) {
michael@0 1194 return 0; // Reached the start of the string.
michael@0 1195 }
michael@0 1196 offsets.addOffset(dec);
michael@0 1197 }
michael@0 1198 if(overlap==0) {
michael@0 1199 break;
michael@0 1200 }
michael@0 1201 --overlap;
michael@0 1202 ++dec;
michael@0 1203 }
michael@0 1204 s8+=length8;
michael@0 1205 }
michael@0 1206 } else /* USET_SPAN_SIMPLE */ {
michael@0 1207 int32_t maxDec=0, maxOverlap=0;
michael@0 1208 for(i=0; i<stringsLength; ++i) {
michael@0 1209 length8=utf8Lengths[i];
michael@0 1210 if(length8==0) {
michael@0 1211 continue; // String not representable in UTF-8.
michael@0 1212 }
michael@0 1213 int32_t overlap=spanBackUTF8Lengths[i];
michael@0 1214 // For longest match, we do need to try to match even an all-contained string
michael@0 1215 // to find the match from the latest end.
michael@0 1216
michael@0 1217 // Try to match this string at pos-(length8-overlap)..pos-length8.
michael@0 1218 if(overlap>=LONG_SPAN) {
michael@0 1219 overlap=length8;
michael@0 1220 // Longest match: Need to match fully inside the code point span
michael@0 1221 // to find the match from the latest end.
michael@0 1222 }
michael@0 1223 if(overlap>spanLength) {
michael@0 1224 overlap=spanLength;
michael@0 1225 }
michael@0 1226 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
michael@0 1227 for(;;) {
michael@0 1228 if(dec>pos || overlap<maxOverlap) {
michael@0 1229 break;
michael@0 1230 }
michael@0 1231 // Try to match if the string is longer or ends later.
michael@0 1232 // Match at code point boundaries. (The UTF-8 strings were converted
michael@0 1233 // from UTF-16 and are guaranteed to be well-formed.)
michael@0 1234 if( !U8_IS_TRAIL(s[pos-dec]) &&
michael@0 1235 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
michael@0 1236 matches8(s+pos-dec, s8, length8)
michael@0 1237 ) {
michael@0 1238 maxDec=dec; // Longest match from latest end.
michael@0 1239 maxOverlap=overlap;
michael@0 1240 break;
michael@0 1241 }
michael@0 1242 --overlap;
michael@0 1243 ++dec;
michael@0 1244 }
michael@0 1245 s8+=length8;
michael@0 1246 }
michael@0 1247
michael@0 1248 if(maxDec!=0 || maxOverlap!=0) {
michael@0 1249 // Longest-match algorithm, and there was a string match.
michael@0 1250 // Simply continue before it.
michael@0 1251 pos-=maxDec;
michael@0 1252 if(pos==0) {
michael@0 1253 return 0; // Reached the start of the string.
michael@0 1254 }
michael@0 1255 spanLength=0; // Match strings from before a string match.
michael@0 1256 continue;
michael@0 1257 }
michael@0 1258 }
michael@0 1259 // Finished trying to match all strings at pos.
michael@0 1260
michael@0 1261 if(spanLength!=0 || pos==length) {
michael@0 1262 // The position is before an unlimited code point span (spanLength!=0),
michael@0 1263 // not before a string match.
michael@0 1264 // The only position where spanLength==0 before a span is pos==length.
michael@0 1265 // Otherwise, an unlimited code point span is only tried again when no
michael@0 1266 // strings match, and if such a non-initial span fails we stop.
michael@0 1267 if(offsets.isEmpty()) {
michael@0 1268 return pos; // No strings matched before a span.
michael@0 1269 }
michael@0 1270 // Match strings from before the next string match.
michael@0 1271 } else {
michael@0 1272 // The position is before a string match (or a single code point).
michael@0 1273 if(offsets.isEmpty()) {
michael@0 1274 // No more strings matched before a previous string match.
michael@0 1275 // Try another code point span from before the last string match.
michael@0 1276 int32_t oldPos=pos;
michael@0 1277 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
michael@0 1278 spanLength=oldPos-pos;
michael@0 1279 if( pos==0 || // Reached the start of the string, or
michael@0 1280 spanLength==0 // neither strings nor span progressed.
michael@0 1281 ) {
michael@0 1282 return pos;
michael@0 1283 }
michael@0 1284 continue; // spanLength>0: Match strings from before a span.
michael@0 1285 } else {
michael@0 1286 // Try to match only one code point from before a string match if some
michael@0 1287 // string matched beyond it, so that we try all possible positions
michael@0 1288 // and don't overshoot.
michael@0 1289 spanLength=spanOneBackUTF8(spanSet, s, pos);
michael@0 1290 if(spanLength>0) {
michael@0 1291 if(spanLength==pos) {
michael@0 1292 return 0; // Reached the start of the string.
michael@0 1293 }
michael@0 1294 // Match strings before this code point.
michael@0 1295 // There cannot be any decrements below it because UnicodeSet strings
michael@0 1296 // contain multiple code points.
michael@0 1297 pos-=spanLength;
michael@0 1298 offsets.shift(spanLength);
michael@0 1299 spanLength=0;
michael@0 1300 continue; // Match strings from before a single code point.
michael@0 1301 }
michael@0 1302 // Match strings from before the next string match.
michael@0 1303 }
michael@0 1304 }
michael@0 1305 pos-=offsets.popMinimum();
michael@0 1306 spanLength=0; // Match strings from before a string match.
michael@0 1307 }
michael@0 1308 }
michael@0 1309
michael@0 1310 /*
michael@0 1311 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
michael@0 1312 *
michael@0 1313 * Theoretical algorithm:
michael@0 1314 * - Iterate through the string, and at each code point boundary:
michael@0 1315 * + If the code point there is in the set, then return with the current position.
michael@0 1316 * + If a set string matches at the current position, then return with the current position.
michael@0 1317 *
michael@0 1318 * Optimized implementation:
michael@0 1319 *
michael@0 1320 * (Same assumption as for span() above.)
michael@0 1321 *
michael@0 1322 * Create and cache a spanNotSet which contains all of the single code points
michael@0 1323 * of the original set but none of its strings.
michael@0 1324 * For each set string add its initial code point to the spanNotSet.
michael@0 1325 * (Also add its final code point for spanNotBack().)
michael@0 1326 *
michael@0 1327 * - Loop:
michael@0 1328 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
michael@0 1329 * + If the current code point is in the original set, then
michael@0 1330 * return the current position.
michael@0 1331 * + If any set string matches at the current position, then
michael@0 1332 * return the current position.
michael@0 1333 * + If there is no match at the current position, neither for the code point there
michael@0 1334 * nor for any set string, then skip this code point and continue the loop.
michael@0 1335 * This happens for set-string-initial code points that were added to spanNotSet
michael@0 1336 * when there is not actually a match for such a set string.
michael@0 1337 */
michael@0 1338
michael@0 1339 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
michael@0 1340 int32_t pos=0, rest=length;
michael@0 1341 int32_t i, stringsLength=strings.size();
michael@0 1342 do {
michael@0 1343 // Span until we find a code point from the set,
michael@0 1344 // or a code point that starts or ends some string.
michael@0 1345 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
michael@0 1346 if(i==rest) {
michael@0 1347 return length; // Reached the end of the string.
michael@0 1348 }
michael@0 1349 pos+=i;
michael@0 1350 rest-=i;
michael@0 1351
michael@0 1352 // Check whether the current code point is in the original set,
michael@0 1353 // without the string starts and ends.
michael@0 1354 int32_t cpLength=spanOne(spanSet, s+pos, rest);
michael@0 1355 if(cpLength>0) {
michael@0 1356 return pos; // There is a set element at pos.
michael@0 1357 }
michael@0 1358
michael@0 1359 // Try to match the strings at pos.
michael@0 1360 for(i=0; i<stringsLength; ++i) {
michael@0 1361 if(spanLengths[i]==ALL_CP_CONTAINED) {
michael@0 1362 continue; // Irrelevant string.
michael@0 1363 }
michael@0 1364 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 1365 const UChar *s16=string.getBuffer();
michael@0 1366 int32_t length16=string.length();
michael@0 1367 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
michael@0 1368 return pos; // There is a set element at pos.
michael@0 1369 }
michael@0 1370 }
michael@0 1371
michael@0 1372 // The span(while not contained) ended on a string start/end which is
michael@0 1373 // not in the original set. Skip this code point and continue.
michael@0 1374 // cpLength<0
michael@0 1375 pos-=cpLength;
michael@0 1376 rest+=cpLength;
michael@0 1377 } while(rest!=0);
michael@0 1378 return length; // Reached the end of the string.
michael@0 1379 }
michael@0 1380
michael@0 1381 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
michael@0 1382 int32_t pos=length;
michael@0 1383 int32_t i, stringsLength=strings.size();
michael@0 1384 do {
michael@0 1385 // Span until we find a code point from the set,
michael@0 1386 // or a code point that starts or ends some string.
michael@0 1387 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
michael@0 1388 if(pos==0) {
michael@0 1389 return 0; // Reached the start of the string.
michael@0 1390 }
michael@0 1391
michael@0 1392 // Check whether the current code point is in the original set,
michael@0 1393 // without the string starts and ends.
michael@0 1394 int32_t cpLength=spanOneBack(spanSet, s, pos);
michael@0 1395 if(cpLength>0) {
michael@0 1396 return pos; // There is a set element at pos.
michael@0 1397 }
michael@0 1398
michael@0 1399 // Try to match the strings at pos.
michael@0 1400 for(i=0; i<stringsLength; ++i) {
michael@0 1401 // Use spanLengths rather than a spanBackLengths pointer because
michael@0 1402 // it is easier and we only need to know whether the string is irrelevant
michael@0 1403 // which is the same in either array.
michael@0 1404 if(spanLengths[i]==ALL_CP_CONTAINED) {
michael@0 1405 continue; // Irrelevant string.
michael@0 1406 }
michael@0 1407 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
michael@0 1408 const UChar *s16=string.getBuffer();
michael@0 1409 int32_t length16=string.length();
michael@0 1410 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
michael@0 1411 return pos; // There is a set element at pos.
michael@0 1412 }
michael@0 1413 }
michael@0 1414
michael@0 1415 // The span(while not contained) ended on a string start/end which is
michael@0 1416 // not in the original set. Skip this code point and continue.
michael@0 1417 // cpLength<0
michael@0 1418 pos+=cpLength;
michael@0 1419 } while(pos!=0);
michael@0 1420 return 0; // Reached the start of the string.
michael@0 1421 }
michael@0 1422
michael@0 1423 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
michael@0 1424 int32_t pos=0, rest=length;
michael@0 1425 int32_t i, stringsLength=strings.size();
michael@0 1426 uint8_t *spanUTF8Lengths=spanLengths;
michael@0 1427 if(all) {
michael@0 1428 spanUTF8Lengths+=2*stringsLength;
michael@0 1429 }
michael@0 1430 do {
michael@0 1431 // Span until we find a code point from the set,
michael@0 1432 // or a code point that starts or ends some string.
michael@0 1433 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
michael@0 1434 if(i==rest) {
michael@0 1435 return length; // Reached the end of the string.
michael@0 1436 }
michael@0 1437 pos+=i;
michael@0 1438 rest-=i;
michael@0 1439
michael@0 1440 // Check whether the current code point is in the original set,
michael@0 1441 // without the string starts and ends.
michael@0 1442 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
michael@0 1443 if(cpLength>0) {
michael@0 1444 return pos; // There is a set element at pos.
michael@0 1445 }
michael@0 1446
michael@0 1447 // Try to match the strings at pos.
michael@0 1448 const uint8_t *s8=utf8;
michael@0 1449 int32_t length8;
michael@0 1450 for(i=0; i<stringsLength; ++i) {
michael@0 1451 length8=utf8Lengths[i];
michael@0 1452 // ALL_CP_CONTAINED: Irrelevant string.
michael@0 1453 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
michael@0 1454 return pos; // There is a set element at pos.
michael@0 1455 }
michael@0 1456 s8+=length8;
michael@0 1457 }
michael@0 1458
michael@0 1459 // The span(while not contained) ended on a string start/end which is
michael@0 1460 // not in the original set. Skip this code point and continue.
michael@0 1461 // cpLength<0
michael@0 1462 pos-=cpLength;
michael@0 1463 rest+=cpLength;
michael@0 1464 } while(rest!=0);
michael@0 1465 return length; // Reached the end of the string.
michael@0 1466 }
michael@0 1467
michael@0 1468 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
michael@0 1469 int32_t pos=length;
michael@0 1470 int32_t i, stringsLength=strings.size();
michael@0 1471 uint8_t *spanBackUTF8Lengths=spanLengths;
michael@0 1472 if(all) {
michael@0 1473 spanBackUTF8Lengths+=3*stringsLength;
michael@0 1474 }
michael@0 1475 do {
michael@0 1476 // Span until we find a code point from the set,
michael@0 1477 // or a code point that starts or ends some string.
michael@0 1478 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
michael@0 1479 if(pos==0) {
michael@0 1480 return 0; // Reached the start of the string.
michael@0 1481 }
michael@0 1482
michael@0 1483 // Check whether the current code point is in the original set,
michael@0 1484 // without the string starts and ends.
michael@0 1485 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
michael@0 1486 if(cpLength>0) {
michael@0 1487 return pos; // There is a set element at pos.
michael@0 1488 }
michael@0 1489
michael@0 1490 // Try to match the strings at pos.
michael@0 1491 const uint8_t *s8=utf8;
michael@0 1492 int32_t length8;
michael@0 1493 for(i=0; i<stringsLength; ++i) {
michael@0 1494 length8=utf8Lengths[i];
michael@0 1495 // ALL_CP_CONTAINED: Irrelevant string.
michael@0 1496 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
michael@0 1497 return pos; // There is a set element at pos.
michael@0 1498 }
michael@0 1499 s8+=length8;
michael@0 1500 }
michael@0 1501
michael@0 1502 // The span(while not contained) ended on a string start/end which is
michael@0 1503 // not in the original set. Skip this code point and continue.
michael@0 1504 // cpLength<0
michael@0 1505 pos+=cpLength;
michael@0 1506 } while(pos!=0);
michael@0 1507 return 0; // Reached the start of the string.
michael@0 1508 }
michael@0 1509
michael@0 1510 U_NAMESPACE_END

mercurial