intl/icu/source/common/ucharstriebuilder.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2010-2012, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  ucharstriebuilder.h
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2010nov14
    12 *   created by: Markus W. Scherer
    13 */
    15 #include "unicode/utypes.h"
    16 #include "unicode/ucharstrie.h"
    17 #include "unicode/ucharstriebuilder.h"
    18 #include "unicode/unistr.h"
    19 #include "unicode/ustring.h"
    20 #include "cmemory.h"
    21 #include "uarrsort.h"
    22 #include "uassert.h"
    23 #include "uhash.h"
    24 #include "ustr_imp.h"
    26 U_NAMESPACE_BEGIN
    28 /*
    29  * Note: This builder implementation stores (string, value) pairs with full copies
    30  * of the 16-bit-unit sequences, until the UCharsTrie is built.
    31  * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
    32  */
    34 class UCharsTrieElement : public UMemory {
    35 public:
    36     // Use compiler's default constructor, initializes nothing.
    38     void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
    40     UnicodeString getString(const UnicodeString &strings) const {
    41         int32_t length=strings[stringOffset];
    42         return strings.tempSubString(stringOffset+1, length);
    43     }
    44     int32_t getStringLength(const UnicodeString &strings) const {
    45         return strings[stringOffset];
    46     }
    48     UChar charAt(int32_t index, const UnicodeString &strings) const {
    49         return strings[stringOffset+1+index];
    50     }
    52     int32_t getValue() const { return value; }
    54     int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
    56 private:
    57     // The first strings unit contains the string length.
    58     // (Compared with a stringLength field here, this saves 2 bytes per string.)
    59     int32_t stringOffset;
    60     int32_t value;
    61 };
    63 void
    64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
    65                          UnicodeString &strings, UErrorCode &errorCode) {
    66     if(U_FAILURE(errorCode)) {
    67         return;
    68     }
    69     int32_t length=s.length();
    70     if(length>0xffff) {
    71         // Too long: We store the length in 1 unit.
    72         errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    73         return;
    74     }
    75     stringOffset=strings.length();
    76     strings.append((UChar)length);
    77     value=val;
    78     strings.append(s);
    79 }
    81 int32_t
    82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
    83     return getString(strings).compare(other.getString(strings));
    84 }
    86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
    87         : elements(NULL), elementsCapacity(0), elementsLength(0),
    88           uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
    90 UCharsTrieBuilder::~UCharsTrieBuilder() {
    91     delete[] elements;
    92     uprv_free(uchars);
    93 }
    95 UCharsTrieBuilder &
    96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
    97     if(U_FAILURE(errorCode)) {
    98         return *this;
    99     }
   100     if(ucharsLength>0) {
   101         // Cannot add elements after building.
   102         errorCode=U_NO_WRITE_PERMISSION;
   103         return *this;
   104     }
   105     if(elementsLength==elementsCapacity) {
   106         int32_t newCapacity;
   107         if(elementsCapacity==0) {
   108             newCapacity=1024;
   109         } else {
   110             newCapacity=4*elementsCapacity;
   111         }
   112         UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
   113         if(newElements==NULL) {
   114             errorCode=U_MEMORY_ALLOCATION_ERROR;
   115             return *this;
   116         }
   117         if(elementsLength>0) {
   118             uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
   119         }
   120         delete[] elements;
   121         elements=newElements;
   122         elementsCapacity=newCapacity;
   123     }
   124     elements[elementsLength++].setTo(s, value, strings, errorCode);
   125     if(U_SUCCESS(errorCode) && strings.isBogus()) {
   126         errorCode=U_MEMORY_ALLOCATION_ERROR;
   127     }
   128     return *this;
   129 }
   131 U_CDECL_BEGIN
   133 static int32_t U_CALLCONV
   134 compareElementStrings(const void *context, const void *left, const void *right) {
   135     const UnicodeString *strings=static_cast<const UnicodeString *>(context);
   136     const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
   137     const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
   138     return leftElement->compareStringTo(*rightElement, *strings);
   139 }
   141 U_CDECL_END
   143 UCharsTrie *
   144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   145     buildUChars(buildOption, errorCode);
   146     UCharsTrie *newTrie=NULL;
   147     if(U_SUCCESS(errorCode)) {
   148         newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
   149         if(newTrie==NULL) {
   150             errorCode=U_MEMORY_ALLOCATION_ERROR;
   151         } else {
   152             uchars=NULL;  // The new trie now owns the array.
   153             ucharsCapacity=0;
   154         }
   155     }
   156     return newTrie;
   157 }
   159 UnicodeString &
   160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
   161                                       UErrorCode &errorCode) {
   162     buildUChars(buildOption, errorCode);
   163     if(U_SUCCESS(errorCode)) {
   164         result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
   165     }
   166     return result;
   167 }
   169 void
   170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   171     if(U_FAILURE(errorCode)) {
   172         return;
   173     }
   174     if(uchars!=NULL && ucharsLength>0) {
   175         // Already built.
   176         return;
   177     }
   178     if(ucharsLength==0) {
   179         if(elementsLength==0) {
   180             errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   181             return;
   182         }
   183         if(strings.isBogus()) {
   184             errorCode=U_MEMORY_ALLOCATION_ERROR;
   185             return;
   186         }
   187         uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
   188                       compareElementStrings, &strings,
   189                       FALSE,  // need not be a stable sort
   190                       &errorCode);
   191         if(U_FAILURE(errorCode)) {
   192             return;
   193         }
   194         // Duplicate strings are not allowed.
   195         UnicodeString prev=elements[0].getString(strings);
   196         for(int32_t i=1; i<elementsLength; ++i) {
   197             UnicodeString current=elements[i].getString(strings);
   198             if(prev==current) {
   199                 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   200                 return;
   201             }
   202             prev.fastCopyFrom(current);
   203         }
   204     }
   205     // Create and UChar-serialize the trie for the elements.
   206     ucharsLength=0;
   207     int32_t capacity=strings.length();
   208     if(capacity<1024) {
   209         capacity=1024;
   210     }
   211     if(ucharsCapacity<capacity) {
   212         uprv_free(uchars);
   213         uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
   214         if(uchars==NULL) {
   215             errorCode=U_MEMORY_ALLOCATION_ERROR;
   216             ucharsCapacity=0;
   217             return;
   218         }
   219         ucharsCapacity=capacity;
   220     }
   221     StringTrieBuilder::build(buildOption, elementsLength, errorCode);
   222     if(uchars==NULL) {
   223         errorCode=U_MEMORY_ALLOCATION_ERROR;
   224     }
   225 }
   227 int32_t
   228 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
   229     return elements[i].getStringLength(strings);
   230 }
   232 UChar
   233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
   234     return elements[i].charAt(unitIndex, strings);
   235 }
   237 int32_t
   238 UCharsTrieBuilder::getElementValue(int32_t i) const {
   239     return elements[i].getValue();
   240 }
   242 int32_t
   243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
   244     const UCharsTrieElement &firstElement=elements[first];
   245     const UCharsTrieElement &lastElement=elements[last];
   246     int32_t minStringLength=firstElement.getStringLength(strings);
   247     while(++unitIndex<minStringLength &&
   248             firstElement.charAt(unitIndex, strings)==
   249             lastElement.charAt(unitIndex, strings)) {}
   250     return unitIndex;
   251 }
   253 int32_t
   254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
   255     int32_t length=0;  // Number of different units at unitIndex.
   256     int32_t i=start;
   257     do {
   258         UChar unit=elements[i++].charAt(unitIndex, strings);
   259         while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
   260             ++i;
   261         }
   262         ++length;
   263     } while(i<limit);
   264     return length;
   265 }
   267 int32_t
   268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
   269     do {
   270         UChar unit=elements[i++].charAt(unitIndex, strings);
   271         while(unit==elements[i].charAt(unitIndex, strings)) {
   272             ++i;
   273         }
   274     } while(--count>0);
   275     return i;
   276 }
   278 int32_t
   279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
   280     while(unit==elements[i].charAt(unitIndex, strings)) {
   281         ++i;
   282     }
   283     return i;
   284 }
   286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
   287         : LinearMatchNode(len, nextNode), s(units) {
   288     hash=hash*37+ustr_hashUCharsN(units, len);
   289 }
   291 UBool
   292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
   293     if(this==&other) {
   294         return TRUE;
   295     }
   296     if(!LinearMatchNode::operator==(other)) {
   297         return FALSE;
   298     }
   299     const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
   300     return 0==u_memcmp(s, o.s, length);
   301 }
   303 void
   304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
   305     UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
   306     next->write(builder);
   307     b.write(s, length);
   308     offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
   309 }
   311 StringTrieBuilder::Node *
   312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
   313                                          Node *nextNode) const {
   314     return new UCTLinearMatchNode(
   315             elements[i].getString(strings).getBuffer()+unitIndex,
   316             length,
   317             nextNode);
   318 }
   320 UBool
   321 UCharsTrieBuilder::ensureCapacity(int32_t length) {
   322     if(uchars==NULL) {
   323         return FALSE;  // previous memory allocation had failed
   324     }
   325     if(length>ucharsCapacity) {
   326         int32_t newCapacity=ucharsCapacity;
   327         do {
   328             newCapacity*=2;
   329         } while(newCapacity<=length);
   330         UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
   331         if(newUChars==NULL) {
   332             // unable to allocate memory
   333             uprv_free(uchars);
   334             uchars=NULL;
   335             ucharsCapacity=0;
   336             return FALSE;
   337         }
   338         u_memcpy(newUChars+(newCapacity-ucharsLength),
   339                  uchars+(ucharsCapacity-ucharsLength), ucharsLength);
   340         uprv_free(uchars);
   341         uchars=newUChars;
   342         ucharsCapacity=newCapacity;
   343     }
   344     return TRUE;
   345 }
   347 int32_t
   348 UCharsTrieBuilder::write(int32_t unit) {
   349     int32_t newLength=ucharsLength+1;
   350     if(ensureCapacity(newLength)) {
   351         ucharsLength=newLength;
   352         uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
   353     }
   354     return ucharsLength;
   355 }
   357 int32_t
   358 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
   359     int32_t newLength=ucharsLength+length;
   360     if(ensureCapacity(newLength)) {
   361         ucharsLength=newLength;
   362         u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
   363     }
   364     return ucharsLength;
   365 }
   367 int32_t
   368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
   369     return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
   370 }
   372 int32_t
   373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
   374     if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
   375         return write(i|(isFinal<<15));
   376     }
   377     UChar intUnits[3];
   378     int32_t length;
   379     if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
   380         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
   381         intUnits[1]=(UChar)((uint32_t)i>>16);
   382         intUnits[2]=(UChar)i;
   383         length=3;
   384     // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
   385     //     intUnits[0]=(UChar)(i);
   386     //     length=1;
   387     } else {
   388         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
   389         intUnits[1]=(UChar)i;
   390         length=2;
   391     }
   392     intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
   393     return write(intUnits, length);
   394 }
   396 int32_t
   397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
   398     if(!hasValue) {
   399         return write(node);
   400     }
   401     UChar intUnits[3];
   402     int32_t length;
   403     if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
   404         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
   405         intUnits[1]=(UChar)((uint32_t)value>>16);
   406         intUnits[2]=(UChar)value;
   407         length=3;
   408     } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
   409         intUnits[0]=(UChar)((value+1)<<6);
   410         length=1;
   411     } else {
   412         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
   413         intUnits[1]=(UChar)value;
   414         length=2;
   415     }
   416     intUnits[0]|=(UChar)node;
   417     return write(intUnits, length);
   418 }
   420 int32_t
   421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
   422     int32_t i=ucharsLength-jumpTarget;
   423     U_ASSERT(i>=0);
   424     if(i<=UCharsTrie::kMaxOneUnitDelta) {
   425         return write(i);
   426     }
   427     UChar intUnits[3];
   428     int32_t length;
   429     if(i<=UCharsTrie::kMaxTwoUnitDelta) {
   430         intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
   431         length=1;
   432     } else {
   433         intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
   434         intUnits[1]=(UChar)(i>>16);
   435         length=2;
   436     }
   437     intUnits[length++]=(UChar)i;
   438     return write(intUnits, length);
   439 }
   441 U_NAMESPACE_END

mercurial