michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2007-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * file name: unisetspan.cpp michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2007mar01 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uniset.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf8.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cmemory.h" michael@0: #include "uvector.h" michael@0: #include "unisetspan.h" michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /* michael@0: * List of offsets from the current position from where to try matching michael@0: * a code point or a string. michael@0: * Store offsets rather than indexes to simplify the code and use the same list michael@0: * for both increments (in span()) and decrements (in spanBack()). michael@0: * michael@0: * Assumption: The maximum offset is limited, and the offsets that are stored michael@0: * at any one time are relatively dense, that is, there are normally no gaps of michael@0: * hundreds or thousands of offset values. michael@0: * michael@0: * The implementation uses a circular buffer of byte flags, michael@0: * each indicating whether the corresponding offset is in the list. michael@0: * This avoids inserting into a sorted list of offsets (or absolute indexes) and michael@0: * physically moving part of the list. michael@0: * michael@0: * Note: In principle, the caller should setMaxLength() to the maximum of the michael@0: * max string length and U16_LENGTH/U8_LENGTH to account for michael@0: * "long" single code points. michael@0: * However, this implementation uses at least a staticList with more than michael@0: * U8_LENGTH entries anyway. michael@0: * michael@0: * Note: If maxLength were guaranteed to be no more than 32 or 64, michael@0: * the list could be stored as bit flags in a single integer. michael@0: * Rather than handling a circular buffer with a start list index, michael@0: * the integer would simply be shifted when lower offsets are removed. michael@0: * UnicodeSet does not have a limit on the lengths of strings. michael@0: */ michael@0: class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory. michael@0: public: michael@0: OffsetList() : list(staticList), capacity(0), length(0), start(0) {} michael@0: michael@0: ~OffsetList() { michael@0: if(list!=staticList) { michael@0: uprv_free(list); michael@0: } michael@0: } michael@0: michael@0: // Call exactly once if the list is to be used. michael@0: void setMaxLength(int32_t maxLength) { michael@0: if(maxLength<=(int32_t)sizeof(staticList)) { michael@0: capacity=(int32_t)sizeof(staticList); michael@0: } else { michael@0: UBool *l=(UBool *)uprv_malloc(maxLength); michael@0: if(l!=NULL) { michael@0: list=l; michael@0: capacity=maxLength; michael@0: } michael@0: } michael@0: uprv_memset(list, 0, capacity); michael@0: } michael@0: michael@0: void clear() { michael@0: uprv_memset(list, 0, capacity); michael@0: start=length=0; michael@0: } michael@0: michael@0: UBool isEmpty() const { michael@0: return (UBool)(length==0); michael@0: } michael@0: michael@0: // Reduce all stored offsets by delta, used when the current position michael@0: // moves by delta. michael@0: // There must not be any offsets lower than delta. michael@0: // If there is an offset equal to delta, it is removed. michael@0: // delta=[1..maxLength] michael@0: void shift(int32_t delta) { michael@0: int32_t i=start+delta; michael@0: if(i>=capacity) { michael@0: i-=capacity; michael@0: } michael@0: if(list[i]) { michael@0: list[i]=FALSE; michael@0: --length; michael@0: } michael@0: start=i; michael@0: } michael@0: michael@0: // Add an offset. The list must not contain it yet. michael@0: // offset=[1..maxLength] michael@0: void addOffset(int32_t offset) { michael@0: int32_t i=start+offset; michael@0: if(i>=capacity) { michael@0: i-=capacity; michael@0: } michael@0: list[i]=TRUE; michael@0: ++length; michael@0: } michael@0: michael@0: // offset=[1..maxLength] michael@0: UBool containsOffset(int32_t offset) const { michael@0: int32_t i=start+offset; michael@0: if(i>=capacity) { michael@0: i-=capacity; michael@0: } michael@0: return list[i]; michael@0: } michael@0: michael@0: // Find the lowest stored offset from a non-empty list, remove it, michael@0: // and reduce all other offsets by this minimum. michael@0: // Returns [1..maxLength]. michael@0: int32_t popMinimum() { michael@0: // Look for the next offset in list[start+1..capacity-1]. michael@0: int32_t i=start, result; michael@0: while(++imaxLength16) { michael@0: maxLength16=length16; michael@0: } michael@0: if((which&UTF8) && (thisRelevant || (which&CONTAINED))) { michael@0: int32_t length8=getUTF8Length(s16, length16); michael@0: utf8Length+=length8; michael@0: if(length8>maxLength8) { michael@0: maxLength8=length8; michael@0: } michael@0: } michael@0: } michael@0: if(!someRelevant) { michael@0: maxLength16=maxLength8=0; michael@0: return; michael@0: } michael@0: michael@0: // Freeze after checking for the need to use strings at all because freezing michael@0: // a set takes some time and memory which are wasted if there are no relevant strings. michael@0: if(all) { michael@0: spanSet.freeze(); michael@0: } michael@0: michael@0: uint8_t *spanBackLengths; michael@0: uint8_t *spanUTF8Lengths; michael@0: uint8_t *spanBackUTF8Lengths; michael@0: michael@0: // Allocate a block of meta data. michael@0: int32_t allocSize; michael@0: if(all) { michael@0: // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. michael@0: allocSize=stringsLength*(4+1+1+1+1)+utf8Length; michael@0: } else { michael@0: allocSize=stringsLength; // One set of span lengths. michael@0: if(which&UTF8) { michael@0: // UTF-8 lengths and UTF-8 strings. michael@0: allocSize+=stringsLength*4+utf8Length; michael@0: } michael@0: } michael@0: if(allocSize<=(int32_t)sizeof(staticLengths)) { michael@0: utf8Lengths=staticLengths; michael@0: } else { michael@0: utf8Lengths=(int32_t *)uprv_malloc(allocSize); michael@0: if(utf8Lengths==NULL) { michael@0: maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. michael@0: return; // Out of memory. michael@0: } michael@0: } michael@0: michael@0: if(all) { michael@0: // Store span lengths for all span() variants. michael@0: spanLengths=(uint8_t *)(utf8Lengths+stringsLength); michael@0: spanBackLengths=spanLengths+stringsLength; michael@0: spanUTF8Lengths=spanBackLengths+stringsLength; michael@0: spanBackUTF8Lengths=spanUTF8Lengths+stringsLength; michael@0: utf8=spanBackUTF8Lengths+stringsLength; michael@0: } else { michael@0: // Store span lengths for only one span() variant. michael@0: if(which&UTF8) { michael@0: spanLengths=(uint8_t *)(utf8Lengths+stringsLength); michael@0: utf8=spanLengths+stringsLength; michael@0: } else { michael@0: spanLengths=(uint8_t *)utf8Lengths; michael@0: } michael@0: spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths; michael@0: } michael@0: michael@0: // Set the meta data and pSpanNotSet and write the UTF-8 strings. michael@0: int32_t utf8Count=0; // Count UTF-8 bytes written so far. michael@0: michael@0: for(i=0; ifreeze(); michael@0: } michael@0: } michael@0: michael@0: // Copy constructor. Assumes which==ALL for a frozen set. michael@0: UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan, michael@0: const UVector &newParentSetStrings) michael@0: : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings), michael@0: utf8Lengths(NULL), spanLengths(NULL), utf8(NULL), michael@0: utf8Length(otherStringSpan.utf8Length), michael@0: maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8), michael@0: all(TRUE) { michael@0: if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) { michael@0: pSpanNotSet=&spanSet; michael@0: } else { michael@0: pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone(); michael@0: } michael@0: michael@0: // Allocate a block of meta data. michael@0: // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings. michael@0: int32_t stringsLength=strings.size(); michael@0: int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length; michael@0: if(allocSize<=(int32_t)sizeof(staticLengths)) { michael@0: utf8Lengths=staticLengths; michael@0: } else { michael@0: utf8Lengths=(int32_t *)uprv_malloc(allocSize); michael@0: if(utf8Lengths==NULL) { michael@0: maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE. michael@0: return; // Out of memory. michael@0: } michael@0: } michael@0: michael@0: spanLengths=(uint8_t *)(utf8Lengths+stringsLength); michael@0: utf8=spanLengths+stringsLength*4; michael@0: uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize); michael@0: } michael@0: michael@0: UnicodeSetStringSpan::~UnicodeSetStringSpan() { michael@0: if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) { michael@0: delete pSpanNotSet; michael@0: } michael@0: if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) { michael@0: uprv_free(utf8Lengths); michael@0: } michael@0: } michael@0: michael@0: void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) { michael@0: if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) { michael@0: if(spanSet.contains(c)) { michael@0: return; // Nothing to do. michael@0: } michael@0: UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed(); michael@0: if(newSet==NULL) { michael@0: return; // Out of memory. michael@0: } else { michael@0: pSpanNotSet=newSet; michael@0: } michael@0: } michael@0: pSpanNotSet->add(c); michael@0: } michael@0: michael@0: // Compare strings without any argument checks. Requires length>0. michael@0: static inline UBool michael@0: matches16(const UChar *s, const UChar *t, int32_t length) { michael@0: do { michael@0: if(*s++!=*t++) { michael@0: return FALSE; michael@0: } michael@0: } while(--length>0); michael@0: return TRUE; michael@0: } michael@0: michael@0: static inline UBool michael@0: matches8(const uint8_t *s, const uint8_t *t, int32_t length) { michael@0: do { michael@0: if(*s++!=*t++) { michael@0: return FALSE; michael@0: } michael@0: } while(--length>0); michael@0: return TRUE; michael@0: } michael@0: michael@0: // Compare 16-bit Unicode strings (which may be malformed UTF-16) michael@0: // at code point boundaries. michael@0: // That is, each edge of a match must not be in the middle of a surrogate pair. michael@0: static inline UBool michael@0: matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) { michael@0: s+=start; michael@0: limit-=start; michael@0: return matches16(s, t, length) && michael@0: !(0=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) { michael@0: return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2; michael@0: } michael@0: return set.contains(c) ? 1 : -1; michael@0: } michael@0: michael@0: static inline int32_t michael@0: spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) { michael@0: UChar c=s[length-1], c2; michael@0: if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) { michael@0: return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2; michael@0: } michael@0: return set.contains(c) ? 1 : -1; michael@0: } michael@0: michael@0: static inline int32_t michael@0: spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { michael@0: UChar32 c=*s; michael@0: if((int8_t)c>=0) { michael@0: return set.contains(c) ? 1 : -1; michael@0: } michael@0: // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD(). michael@0: int32_t i=0; michael@0: U8_NEXT_OR_FFFD(s, i, length, c); michael@0: return set.contains(c) ? i : -i; michael@0: } michael@0: michael@0: static inline int32_t michael@0: spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) { michael@0: UChar32 c=s[length-1]; michael@0: if((int8_t)c>=0) { michael@0: return set.contains(c) ? 1 : -1; michael@0: } michael@0: int32_t i=length-1; michael@0: c=utf8_prevCharSafeBody(s, 0, &i, c, -3); michael@0: length-=i; michael@0: return set.contains(c) ? length : -length; michael@0: } michael@0: michael@0: /* michael@0: * Note: In span() when spanLength==0 (after a string match, or at the beginning michael@0: * after an empty code point span) and in spanNot() and spanNotUTF8(), michael@0: * string matching could use a binary search michael@0: * because all string matches are done from the same start index. michael@0: * michael@0: * For UTF-8, this would require a comparison function that returns UTF-16 order. michael@0: * michael@0: * This optimization should not be necessary for normal UnicodeSets because michael@0: * most sets have no strings, and most sets with strings have michael@0: * very few very short strings. michael@0: * For cases with many strings, it might be better to use a different API michael@0: * and implementation with a DFA (state machine). michael@0: */ michael@0: michael@0: /* michael@0: * Algorithm for span(USET_SPAN_CONTAINED) michael@0: * michael@0: * Theoretical algorithm: michael@0: * - Iterate through the string, and at each code point boundary: michael@0: * + If the code point there is in the set, then remember to continue after it. michael@0: * + If a set string matches at the current position, then remember to continue after it. michael@0: * + Either recursively span for each code point or string match, michael@0: * or recursively span for all but the shortest one and michael@0: * iteratively continue the span with the shortest local match. michael@0: * + Remember the longest recursive span (the farthest end point). michael@0: * + If there is no match at the current position, neither for the code point there michael@0: * nor for any set string, then stop and return the longest recursive span length. michael@0: * michael@0: * Optimized implementation: michael@0: * michael@0: * (We assume that most sets will have very few very short strings. michael@0: * A span using a string-less set is extremely fast.) michael@0: * michael@0: * Create and cache a spanSet which contains all of the single code points michael@0: * of the original set but none of its strings. michael@0: * michael@0: * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). michael@0: * - Loop: michael@0: * + Try to match each set string at the end of the spanLength. michael@0: * ~ Set strings that start with set-contained code points must be matched michael@0: * with a partial overlap because the recursive algorithm would have tried michael@0: * to match them at every position. michael@0: * ~ Set strings that entirely consist of set-contained code points michael@0: * are irrelevant for span(USET_SPAN_CONTAINED) because the michael@0: * recursive algorithm would continue after them anyway michael@0: * and find the longest recursive match from their end. michael@0: * ~ Rather than recursing, note each end point of a set string match. michael@0: * + If no set string matched after spanSet.span(), then return michael@0: * with where the spanSet.span() ended. michael@0: * + If at least one set string matched after spanSet.span(), then michael@0: * pop the shortest string match end point and continue michael@0: * the loop, trying to match all set strings from there. michael@0: * + If at least one more set string matched after a previous string match, michael@0: * then test if the code point after the previous string match is also michael@0: * contained in the set. michael@0: * Continue the loop with the shortest end point of either this code point michael@0: * or a matching set string. michael@0: * + If no more set string matched after a previous string match, michael@0: * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). michael@0: * Stop if spanLength==0, otherwise continue the loop. michael@0: * michael@0: * By noting each end point of a set string match, michael@0: * the function visits each string position at most once and finishes michael@0: * in linear time. michael@0: * michael@0: * The recursive algorithm may visit the same string position many times michael@0: * if multiple paths lead to it and finishes in exponential time. michael@0: */ michael@0: michael@0: /* michael@0: * Algorithm for span(USET_SPAN_SIMPLE) michael@0: * michael@0: * Theoretical algorithm: michael@0: * - Iterate through the string, and at each code point boundary: michael@0: * + If the code point there is in the set, then remember to continue after it. michael@0: * + If a set string matches at the current position, then remember to continue after it. michael@0: * + Continue from the farthest match position and ignore all others. michael@0: * + If there is no match at the current position, michael@0: * then stop and return the current position. michael@0: * michael@0: * Optimized implementation: michael@0: * michael@0: * (Same assumption and spanSet as above.) michael@0: * michael@0: * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED). michael@0: * - Loop: michael@0: * + Try to match each set string at the end of the spanLength. michael@0: * ~ Set strings that start with set-contained code points must be matched michael@0: * with a partial overlap because the standard algorithm would have tried michael@0: * to match them earlier. michael@0: * ~ Set strings that entirely consist of set-contained code points michael@0: * must be matched with a full overlap because the longest-match algorithm michael@0: * would hide set string matches that end earlier. michael@0: * Such set strings need not be matched earlier inside the code point span michael@0: * because the standard algorithm would then have continued after michael@0: * the set string match anyway. michael@0: * ~ Remember the longest set string match (farthest end point) from the earliest michael@0: * starting point. michael@0: * + If no set string matched after spanSet.span(), then return michael@0: * with where the spanSet.span() ended. michael@0: * + If at least one set string matched, then continue the loop after the michael@0: * longest match from the earliest position. michael@0: * + If no more set string matched after a previous string match, michael@0: * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED). michael@0: * Stop if spanLength==0, otherwise continue the loop. michael@0: */ michael@0: michael@0: int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: if(spanCondition==USET_SPAN_NOT_CONTAINED) { michael@0: return spanNot(s, length); michael@0: } michael@0: int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED); michael@0: if(spanLength==length) { michael@0: return length; michael@0: } michael@0: michael@0: // Consider strings; they may overlap with the span. michael@0: OffsetList offsets; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: // Use offset list to try all possibilities. michael@0: offsets.setMaxLength(maxLength16); michael@0: } michael@0: int32_t pos=spanLength, rest=length-pos; michael@0: int32_t i, stringsLength=strings.size(); michael@0: for(;;) { michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length16; michael@0: // While contained: No point matching fully inside the code point span. michael@0: U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t inc=length16-overlap; // Keep overlap+inc==length16. michael@0: for(;;) { michael@0: if(inc>rest) { michael@0: break; michael@0: } michael@0: // Try to match if the increment is not listed already. michael@0: if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) { michael@0: if(inc==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: offsets.addOffset(inc); michael@0: } michael@0: if(overlap==0) { michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++inc; michael@0: } michael@0: } michael@0: } else /* USET_SPAN_SIMPLE */ { michael@0: int32_t maxInc=0, maxOverlap=0; michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length16; michael@0: // Longest match: Need to match fully inside the code point span michael@0: // to find the match from the earliest start. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t inc=length16-overlap; // Keep overlap+inc==length16. michael@0: for(;;) { michael@0: if(inc>rest || overlapmaxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && michael@0: matches16CPB(s, pos-overlap, length, s16, length16) michael@0: ) { michael@0: maxInc=inc; // Longest match from earliest start. michael@0: maxOverlap=overlap; michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++inc; michael@0: } michael@0: } michael@0: michael@0: if(maxInc!=0 || maxOverlap!=0) { michael@0: // Longest-match algorithm, and there was a string match. michael@0: // Simply continue after it. michael@0: pos+=maxInc; michael@0: rest-=maxInc; michael@0: if(rest==0) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: spanLength=0; // Match strings from after a string match. michael@0: continue; michael@0: } michael@0: } michael@0: // Finished trying to match all strings at pos. michael@0: michael@0: if(spanLength!=0 || pos==0) { michael@0: // The position is after an unlimited code point span (spanLength!=0), michael@0: // not after a string match. michael@0: // The only position where spanLength==0 after a span is pos==0. michael@0: // Otherwise, an unlimited code point span is only tried again when no michael@0: // strings match, and if such a non-initial span fails we stop. michael@0: if(offsets.isEmpty()) { michael@0: return pos; // No strings matched after a span. michael@0: } michael@0: // Match strings from after the next string match. michael@0: } else { michael@0: // The position is after a string match (or a single code point). michael@0: if(offsets.isEmpty()) { michael@0: // No more strings matched after a previous string match. michael@0: // Try another code point span from after the last string match. michael@0: spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED); michael@0: if( spanLength==rest || // Reached the end of the string, or michael@0: spanLength==0 // neither strings nor span progressed. michael@0: ) { michael@0: return pos+spanLength; michael@0: } michael@0: pos+=spanLength; michael@0: rest-=spanLength; michael@0: continue; // spanLength>0: Match strings from after a span. michael@0: } else { michael@0: // Try to match only one code point from after a string match if some michael@0: // string matched beyond it, so that we try all possible positions michael@0: // and don't overshoot. michael@0: spanLength=spanOne(spanSet, s+pos, rest); michael@0: if(spanLength>0) { michael@0: if(spanLength==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: // Match strings after this code point. michael@0: // There cannot be any increments below it because UnicodeSet strings michael@0: // contain multiple code points. michael@0: pos+=spanLength; michael@0: rest-=spanLength; michael@0: offsets.shift(spanLength); michael@0: spanLength=0; michael@0: continue; // Match strings from after a single code point. michael@0: } michael@0: // Match strings from after the next string match. michael@0: } michael@0: } michael@0: int32_t minOffset=offsets.popMinimum(); michael@0: pos+=minOffset; michael@0: rest-=minOffset; michael@0: spanLength=0; // Match strings from after a string match. michael@0: } michael@0: } michael@0: michael@0: int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: if(spanCondition==USET_SPAN_NOT_CONTAINED) { michael@0: return spanNotBack(s, length); michael@0: } michael@0: int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED); michael@0: if(pos==0) { michael@0: return 0; michael@0: } michael@0: int32_t spanLength=length-pos; michael@0: michael@0: // Consider strings; they may overlap with the span. michael@0: OffsetList offsets; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: // Use offset list to try all possibilities. michael@0: offsets.setMaxLength(maxLength16); michael@0: } michael@0: int32_t i, stringsLength=strings.size(); michael@0: uint8_t *spanBackLengths=spanLengths; michael@0: if(all) { michael@0: spanBackLengths+=stringsLength; michael@0: } michael@0: for(;;) { michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length16; michael@0: // While contained: No point matching fully inside the code point span. michael@0: int32_t len1=0; michael@0: U16_FWD_1(s16, len1, overlap); michael@0: overlap-=len1; // Length of the string minus the first code point. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t dec=length16-overlap; // Keep dec+overlap==length16. michael@0: for(;;) { michael@0: if(dec>pos) { michael@0: break; michael@0: } michael@0: // Try to match if the decrement is not listed already. michael@0: if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) { michael@0: if(dec==pos) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: offsets.addOffset(dec); michael@0: } michael@0: if(overlap==0) { michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++dec; michael@0: } michael@0: } michael@0: } else /* USET_SPAN_SIMPLE */ { michael@0: int32_t maxDec=0, maxOverlap=0; michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length16; michael@0: // Longest match: Need to match fully inside the code point span michael@0: // to find the match from the latest end. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t dec=length16-overlap; // Keep dec+overlap==length16. michael@0: for(;;) { michael@0: if(dec>pos || overlapmaxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && michael@0: matches16CPB(s, pos-dec, length, s16, length16) michael@0: ) { michael@0: maxDec=dec; // Longest match from latest end. michael@0: maxOverlap=overlap; michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++dec; michael@0: } michael@0: } michael@0: michael@0: if(maxDec!=0 || maxOverlap!=0) { michael@0: // Longest-match algorithm, and there was a string match. michael@0: // Simply continue before it. michael@0: pos-=maxDec; michael@0: if(pos==0) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: spanLength=0; // Match strings from before a string match. michael@0: continue; michael@0: } michael@0: } michael@0: // Finished trying to match all strings at pos. michael@0: michael@0: if(spanLength!=0 || pos==length) { michael@0: // The position is before an unlimited code point span (spanLength!=0), michael@0: // not before a string match. michael@0: // The only position where spanLength==0 before a span is pos==length. michael@0: // Otherwise, an unlimited code point span is only tried again when no michael@0: // strings match, and if such a non-initial span fails we stop. michael@0: if(offsets.isEmpty()) { michael@0: return pos; // No strings matched before a span. michael@0: } michael@0: // Match strings from before the next string match. michael@0: } else { michael@0: // The position is before a string match (or a single code point). michael@0: if(offsets.isEmpty()) { michael@0: // No more strings matched before a previous string match. michael@0: // Try another code point span from before the last string match. michael@0: int32_t oldPos=pos; michael@0: pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED); michael@0: spanLength=oldPos-pos; michael@0: if( pos==0 || // Reached the start of the string, or michael@0: spanLength==0 // neither strings nor span progressed. michael@0: ) { michael@0: return pos; michael@0: } michael@0: continue; // spanLength>0: Match strings from before a span. michael@0: } else { michael@0: // Try to match only one code point from before a string match if some michael@0: // string matched beyond it, so that we try all possible positions michael@0: // and don't overshoot. michael@0: spanLength=spanOneBack(spanSet, s, pos); michael@0: if(spanLength>0) { michael@0: if(spanLength==pos) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: // Match strings before this code point. michael@0: // There cannot be any decrements below it because UnicodeSet strings michael@0: // contain multiple code points. michael@0: pos-=spanLength; michael@0: offsets.shift(spanLength); michael@0: spanLength=0; michael@0: continue; // Match strings from before a single code point. michael@0: } michael@0: // Match strings from before the next string match. michael@0: } michael@0: } michael@0: pos-=offsets.popMinimum(); michael@0: spanLength=0; // Match strings from before a string match. michael@0: } michael@0: } michael@0: michael@0: int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: if(spanCondition==USET_SPAN_NOT_CONTAINED) { michael@0: return spanNotUTF8(s, length); michael@0: } michael@0: int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED); michael@0: if(spanLength==length) { michael@0: return length; michael@0: } michael@0: michael@0: // Consider strings; they may overlap with the span. michael@0: OffsetList offsets; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: // Use offset list to try all possibilities. michael@0: offsets.setMaxLength(maxLength8); michael@0: } michael@0: int32_t pos=spanLength, rest=length-pos; michael@0: int32_t i, stringsLength=strings.size(); michael@0: uint8_t *spanUTF8Lengths=spanLengths; michael@0: if(all) { michael@0: spanUTF8Lengths+=2*stringsLength; michael@0: } michael@0: for(;;) { michael@0: const uint8_t *s8=utf8; michael@0: int32_t length8; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length8; michael@0: // While contained: No point matching fully inside the code point span. michael@0: U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t inc=length8-overlap; // Keep overlap+inc==length8. michael@0: for(;;) { michael@0: if(inc>rest) { michael@0: break; michael@0: } michael@0: // Try to match if the increment is not listed already. michael@0: // Match at code point boundaries. (The UTF-8 strings were converted michael@0: // from UTF-16 and are guaranteed to be well-formed.) michael@0: if( !U8_IS_TRAIL(s[pos-overlap]) && michael@0: !offsets.containsOffset(inc) && michael@0: matches8(s+pos-overlap, s8, length8) michael@0: michael@0: ) { michael@0: if(inc==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: offsets.addOffset(inc); michael@0: } michael@0: if(overlap==0) { michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++inc; michael@0: } michael@0: s8+=length8; michael@0: } michael@0: } else /* USET_SPAN_SIMPLE */ { michael@0: int32_t maxInc=0, maxOverlap=0; michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length8; michael@0: // Longest match: Need to match fully inside the code point span michael@0: // to find the match from the earliest start. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t inc=length8-overlap; // Keep overlap+inc==length8. michael@0: for(;;) { michael@0: if(inc>rest || overlapmaxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) && michael@0: matches8(s+pos-overlap, s8, length8) michael@0: michael@0: ) { michael@0: maxInc=inc; // Longest match from earliest start. michael@0: maxOverlap=overlap; michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++inc; michael@0: } michael@0: s8+=length8; michael@0: } michael@0: michael@0: if(maxInc!=0 || maxOverlap!=0) { michael@0: // Longest-match algorithm, and there was a string match. michael@0: // Simply continue after it. michael@0: pos+=maxInc; michael@0: rest-=maxInc; michael@0: if(rest==0) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: spanLength=0; // Match strings from after a string match. michael@0: continue; michael@0: } michael@0: } michael@0: // Finished trying to match all strings at pos. michael@0: michael@0: if(spanLength!=0 || pos==0) { michael@0: // The position is after an unlimited code point span (spanLength!=0), michael@0: // not after a string match. michael@0: // The only position where spanLength==0 after a span is pos==0. michael@0: // Otherwise, an unlimited code point span is only tried again when no michael@0: // strings match, and if such a non-initial span fails we stop. michael@0: if(offsets.isEmpty()) { michael@0: return pos; // No strings matched after a span. michael@0: } michael@0: // Match strings from after the next string match. michael@0: } else { michael@0: // The position is after a string match (or a single code point). michael@0: if(offsets.isEmpty()) { michael@0: // No more strings matched after a previous string match. michael@0: // Try another code point span from after the last string match. michael@0: spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED); michael@0: if( spanLength==rest || // Reached the end of the string, or michael@0: spanLength==0 // neither strings nor span progressed. michael@0: ) { michael@0: return pos+spanLength; michael@0: } michael@0: pos+=spanLength; michael@0: rest-=spanLength; michael@0: continue; // spanLength>0: Match strings from after a span. michael@0: } else { michael@0: // Try to match only one code point from after a string match if some michael@0: // string matched beyond it, so that we try all possible positions michael@0: // and don't overshoot. michael@0: spanLength=spanOneUTF8(spanSet, s+pos, rest); michael@0: if(spanLength>0) { michael@0: if(spanLength==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: // Match strings after this code point. michael@0: // There cannot be any increments below it because UnicodeSet strings michael@0: // contain multiple code points. michael@0: pos+=spanLength; michael@0: rest-=spanLength; michael@0: offsets.shift(spanLength); michael@0: spanLength=0; michael@0: continue; // Match strings from after a single code point. michael@0: } michael@0: // Match strings from after the next string match. michael@0: } michael@0: } michael@0: int32_t minOffset=offsets.popMinimum(); michael@0: pos+=minOffset; michael@0: rest-=minOffset; michael@0: spanLength=0; // Match strings from after a string match. michael@0: } michael@0: } michael@0: michael@0: int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { michael@0: if(spanCondition==USET_SPAN_NOT_CONTAINED) { michael@0: return spanNotBackUTF8(s, length); michael@0: } michael@0: int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED); michael@0: if(pos==0) { michael@0: return 0; michael@0: } michael@0: int32_t spanLength=length-pos; michael@0: michael@0: // Consider strings; they may overlap with the span. michael@0: OffsetList offsets; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: // Use offset list to try all possibilities. michael@0: offsets.setMaxLength(maxLength8); michael@0: } michael@0: int32_t i, stringsLength=strings.size(); michael@0: uint8_t *spanBackUTF8Lengths=spanLengths; michael@0: if(all) { michael@0: spanBackUTF8Lengths+=3*stringsLength; michael@0: } michael@0: for(;;) { michael@0: const uint8_t *s8=utf8; michael@0: int32_t length8; michael@0: if(spanCondition==USET_SPAN_CONTAINED) { michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length8; michael@0: // While contained: No point matching fully inside the code point span. michael@0: int32_t len1=0; michael@0: U8_FWD_1(s8, len1, overlap); michael@0: overlap-=len1; // Length of the string minus the first code point. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t dec=length8-overlap; // Keep dec+overlap==length8. michael@0: for(;;) { michael@0: if(dec>pos) { michael@0: break; michael@0: } michael@0: // Try to match if the decrement is not listed already. michael@0: // Match at code point boundaries. (The UTF-8 strings were converted michael@0: // from UTF-16 and are guaranteed to be well-formed.) michael@0: if( !U8_IS_TRAIL(s[pos-dec]) && michael@0: !offsets.containsOffset(dec) && michael@0: matches8(s+pos-dec, s8, length8) michael@0: ) { michael@0: if(dec==pos) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: offsets.addOffset(dec); michael@0: } michael@0: if(overlap==0) { michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++dec; michael@0: } michael@0: s8+=length8; michael@0: } michael@0: } else /* USET_SPAN_SIMPLE */ { michael@0: int32_t maxDec=0, maxOverlap=0; michael@0: for(i=0; i=LONG_SPAN) { michael@0: overlap=length8; michael@0: // Longest match: Need to match fully inside the code point span michael@0: // to find the match from the latest end. michael@0: } michael@0: if(overlap>spanLength) { michael@0: overlap=spanLength; michael@0: } michael@0: int32_t dec=length8-overlap; // Keep dec+overlap==length8. michael@0: for(;;) { michael@0: if(dec>pos || overlapmaxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) && michael@0: matches8(s+pos-dec, s8, length8) michael@0: ) { michael@0: maxDec=dec; // Longest match from latest end. michael@0: maxOverlap=overlap; michael@0: break; michael@0: } michael@0: --overlap; michael@0: ++dec; michael@0: } michael@0: s8+=length8; michael@0: } michael@0: michael@0: if(maxDec!=0 || maxOverlap!=0) { michael@0: // Longest-match algorithm, and there was a string match. michael@0: // Simply continue before it. michael@0: pos-=maxDec; michael@0: if(pos==0) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: spanLength=0; // Match strings from before a string match. michael@0: continue; michael@0: } michael@0: } michael@0: // Finished trying to match all strings at pos. michael@0: michael@0: if(spanLength!=0 || pos==length) { michael@0: // The position is before an unlimited code point span (spanLength!=0), michael@0: // not before a string match. michael@0: // The only position where spanLength==0 before a span is pos==length. michael@0: // Otherwise, an unlimited code point span is only tried again when no michael@0: // strings match, and if such a non-initial span fails we stop. michael@0: if(offsets.isEmpty()) { michael@0: return pos; // No strings matched before a span. michael@0: } michael@0: // Match strings from before the next string match. michael@0: } else { michael@0: // The position is before a string match (or a single code point). michael@0: if(offsets.isEmpty()) { michael@0: // No more strings matched before a previous string match. michael@0: // Try another code point span from before the last string match. michael@0: int32_t oldPos=pos; michael@0: pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED); michael@0: spanLength=oldPos-pos; michael@0: if( pos==0 || // Reached the start of the string, or michael@0: spanLength==0 // neither strings nor span progressed. michael@0: ) { michael@0: return pos; michael@0: } michael@0: continue; // spanLength>0: Match strings from before a span. michael@0: } else { michael@0: // Try to match only one code point from before a string match if some michael@0: // string matched beyond it, so that we try all possible positions michael@0: // and don't overshoot. michael@0: spanLength=spanOneBackUTF8(spanSet, s, pos); michael@0: if(spanLength>0) { michael@0: if(spanLength==pos) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: // Match strings before this code point. michael@0: // There cannot be any decrements below it because UnicodeSet strings michael@0: // contain multiple code points. michael@0: pos-=spanLength; michael@0: offsets.shift(spanLength); michael@0: spanLength=0; michael@0: continue; // Match strings from before a single code point. michael@0: } michael@0: // Match strings from before the next string match. michael@0: } michael@0: } michael@0: pos-=offsets.popMinimum(); michael@0: spanLength=0; // Match strings from before a string match. michael@0: } michael@0: } michael@0: michael@0: /* michael@0: * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED) michael@0: * michael@0: * Theoretical algorithm: michael@0: * - Iterate through the string, and at each code point boundary: michael@0: * + If the code point there is in the set, then return with the current position. michael@0: * + If a set string matches at the current position, then return with the current position. michael@0: * michael@0: * Optimized implementation: michael@0: * michael@0: * (Same assumption as for span() above.) michael@0: * michael@0: * Create and cache a spanNotSet which contains all of the single code points michael@0: * of the original set but none of its strings. michael@0: * For each set string add its initial code point to the spanNotSet. michael@0: * (Also add its final code point for spanNotBack().) michael@0: * michael@0: * - Loop: michael@0: * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED). michael@0: * + If the current code point is in the original set, then michael@0: * return the current position. michael@0: * + If any set string matches at the current position, then michael@0: * return the current position. michael@0: * + If there is no match at the current position, neither for the code point there michael@0: * nor for any set string, then skip this code point and continue the loop. michael@0: * This happens for set-string-initial code points that were added to spanNotSet michael@0: * when there is not actually a match for such a set string. michael@0: */ michael@0: michael@0: int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const { michael@0: int32_t pos=0, rest=length; michael@0: int32_t i, stringsLength=strings.size(); michael@0: do { michael@0: // Span until we find a code point from the set, michael@0: // or a code point that starts or ends some string. michael@0: i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED); michael@0: if(i==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: pos+=i; michael@0: rest-=i; michael@0: michael@0: // Check whether the current code point is in the original set, michael@0: // without the string starts and ends. michael@0: int32_t cpLength=spanOne(spanSet, s+pos, rest); michael@0: if(cpLength>0) { michael@0: return pos; // There is a set element at pos. michael@0: } michael@0: michael@0: // Try to match the strings at pos. michael@0: for(i=0; ispanBack(s, pos, USET_SPAN_NOT_CONTAINED); michael@0: if(pos==0) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: michael@0: // Check whether the current code point is in the original set, michael@0: // without the string starts and ends. michael@0: int32_t cpLength=spanOneBack(spanSet, s, pos); michael@0: if(cpLength>0) { michael@0: return pos; // There is a set element at pos. michael@0: } michael@0: michael@0: // Try to match the strings at pos. michael@0: for(i=0; ispanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED); michael@0: if(i==rest) { michael@0: return length; // Reached the end of the string. michael@0: } michael@0: pos+=i; michael@0: rest-=i; michael@0: michael@0: // Check whether the current code point is in the original set, michael@0: // without the string starts and ends. michael@0: int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest); michael@0: if(cpLength>0) { michael@0: return pos; // There is a set element at pos. michael@0: } michael@0: michael@0: // Try to match the strings at pos. michael@0: const uint8_t *s8=utf8; michael@0: int32_t length8; michael@0: for(i=0; ispanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED); michael@0: if(pos==0) { michael@0: return 0; // Reached the start of the string. michael@0: } michael@0: michael@0: // Check whether the current code point is in the original set, michael@0: // without the string starts and ends. michael@0: int32_t cpLength=spanOneBackUTF8(spanSet, s, pos); michael@0: if(cpLength>0) { michael@0: return pos; // There is a set element at pos. michael@0: } michael@0: michael@0: // Try to match the strings at pos. michael@0: const uint8_t *s8=utf8; michael@0: int32_t length8; michael@0: for(i=0; i