intl/icu/source/common/ucharstriebuilder.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucharstriebuilder.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,441 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2010-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  ucharstriebuilder.h
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2010nov14
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#include "unicode/utypes.h"
    1.19 +#include "unicode/ucharstrie.h"
    1.20 +#include "unicode/ucharstriebuilder.h"
    1.21 +#include "unicode/unistr.h"
    1.22 +#include "unicode/ustring.h"
    1.23 +#include "cmemory.h"
    1.24 +#include "uarrsort.h"
    1.25 +#include "uassert.h"
    1.26 +#include "uhash.h"
    1.27 +#include "ustr_imp.h"
    1.28 +
    1.29 +U_NAMESPACE_BEGIN
    1.30 +
    1.31 +/*
    1.32 + * Note: This builder implementation stores (string, value) pairs with full copies
    1.33 + * of the 16-bit-unit sequences, until the UCharsTrie is built.
    1.34 + * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
    1.35 + */
    1.36 +
    1.37 +class UCharsTrieElement : public UMemory {
    1.38 +public:
    1.39 +    // Use compiler's default constructor, initializes nothing.
    1.40 +
    1.41 +    void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
    1.42 +
    1.43 +    UnicodeString getString(const UnicodeString &strings) const {
    1.44 +        int32_t length=strings[stringOffset];
    1.45 +        return strings.tempSubString(stringOffset+1, length);
    1.46 +    }
    1.47 +    int32_t getStringLength(const UnicodeString &strings) const {
    1.48 +        return strings[stringOffset];
    1.49 +    }
    1.50 +
    1.51 +    UChar charAt(int32_t index, const UnicodeString &strings) const {
    1.52 +        return strings[stringOffset+1+index];
    1.53 +    }
    1.54 +
    1.55 +    int32_t getValue() const { return value; }
    1.56 +
    1.57 +    int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
    1.58 +
    1.59 +private:
    1.60 +    // The first strings unit contains the string length.
    1.61 +    // (Compared with a stringLength field here, this saves 2 bytes per string.)
    1.62 +    int32_t stringOffset;
    1.63 +    int32_t value;
    1.64 +};
    1.65 +
    1.66 +void
    1.67 +UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
    1.68 +                         UnicodeString &strings, UErrorCode &errorCode) {
    1.69 +    if(U_FAILURE(errorCode)) {
    1.70 +        return;
    1.71 +    }
    1.72 +    int32_t length=s.length();
    1.73 +    if(length>0xffff) {
    1.74 +        // Too long: We store the length in 1 unit.
    1.75 +        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    1.76 +        return;
    1.77 +    }
    1.78 +    stringOffset=strings.length();
    1.79 +    strings.append((UChar)length);
    1.80 +    value=val;
    1.81 +    strings.append(s);
    1.82 +}
    1.83 +
    1.84 +int32_t
    1.85 +UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
    1.86 +    return getString(strings).compare(other.getString(strings));
    1.87 +}
    1.88 +
    1.89 +UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
    1.90 +        : elements(NULL), elementsCapacity(0), elementsLength(0),
    1.91 +          uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
    1.92 +
    1.93 +UCharsTrieBuilder::~UCharsTrieBuilder() {
    1.94 +    delete[] elements;
    1.95 +    uprv_free(uchars);
    1.96 +}
    1.97 +
    1.98 +UCharsTrieBuilder &
    1.99 +UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
   1.100 +    if(U_FAILURE(errorCode)) {
   1.101 +        return *this;
   1.102 +    }
   1.103 +    if(ucharsLength>0) {
   1.104 +        // Cannot add elements after building.
   1.105 +        errorCode=U_NO_WRITE_PERMISSION;
   1.106 +        return *this;
   1.107 +    }
   1.108 +    if(elementsLength==elementsCapacity) {
   1.109 +        int32_t newCapacity;
   1.110 +        if(elementsCapacity==0) {
   1.111 +            newCapacity=1024;
   1.112 +        } else {
   1.113 +            newCapacity=4*elementsCapacity;
   1.114 +        }
   1.115 +        UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
   1.116 +        if(newElements==NULL) {
   1.117 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.118 +            return *this;
   1.119 +        }
   1.120 +        if(elementsLength>0) {
   1.121 +            uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
   1.122 +        }
   1.123 +        delete[] elements;
   1.124 +        elements=newElements;
   1.125 +        elementsCapacity=newCapacity;
   1.126 +    }
   1.127 +    elements[elementsLength++].setTo(s, value, strings, errorCode);
   1.128 +    if(U_SUCCESS(errorCode) && strings.isBogus()) {
   1.129 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.130 +    }
   1.131 +    return *this;
   1.132 +}
   1.133 +
   1.134 +U_CDECL_BEGIN
   1.135 +
   1.136 +static int32_t U_CALLCONV
   1.137 +compareElementStrings(const void *context, const void *left, const void *right) {
   1.138 +    const UnicodeString *strings=static_cast<const UnicodeString *>(context);
   1.139 +    const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
   1.140 +    const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
   1.141 +    return leftElement->compareStringTo(*rightElement, *strings);
   1.142 +}
   1.143 +
   1.144 +U_CDECL_END
   1.145 +
   1.146 +UCharsTrie *
   1.147 +UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   1.148 +    buildUChars(buildOption, errorCode);
   1.149 +    UCharsTrie *newTrie=NULL;
   1.150 +    if(U_SUCCESS(errorCode)) {
   1.151 +        newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
   1.152 +        if(newTrie==NULL) {
   1.153 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.154 +        } else {
   1.155 +            uchars=NULL;  // The new trie now owns the array.
   1.156 +            ucharsCapacity=0;
   1.157 +        }
   1.158 +    }
   1.159 +    return newTrie;
   1.160 +}
   1.161 +
   1.162 +UnicodeString &
   1.163 +UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
   1.164 +                                      UErrorCode &errorCode) {
   1.165 +    buildUChars(buildOption, errorCode);
   1.166 +    if(U_SUCCESS(errorCode)) {
   1.167 +        result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
   1.168 +    }
   1.169 +    return result;
   1.170 +}
   1.171 +
   1.172 +void
   1.173 +UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   1.174 +    if(U_FAILURE(errorCode)) {
   1.175 +        return;
   1.176 +    }
   1.177 +    if(uchars!=NULL && ucharsLength>0) {
   1.178 +        // Already built.
   1.179 +        return;
   1.180 +    }
   1.181 +    if(ucharsLength==0) {
   1.182 +        if(elementsLength==0) {
   1.183 +            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1.184 +            return;
   1.185 +        }
   1.186 +        if(strings.isBogus()) {
   1.187 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.188 +            return;
   1.189 +        }
   1.190 +        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
   1.191 +                      compareElementStrings, &strings,
   1.192 +                      FALSE,  // need not be a stable sort
   1.193 +                      &errorCode);
   1.194 +        if(U_FAILURE(errorCode)) {
   1.195 +            return;
   1.196 +        }
   1.197 +        // Duplicate strings are not allowed.
   1.198 +        UnicodeString prev=elements[0].getString(strings);
   1.199 +        for(int32_t i=1; i<elementsLength; ++i) {
   1.200 +            UnicodeString current=elements[i].getString(strings);
   1.201 +            if(prev==current) {
   1.202 +                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.203 +                return;
   1.204 +            }
   1.205 +            prev.fastCopyFrom(current);
   1.206 +        }
   1.207 +    }
   1.208 +    // Create and UChar-serialize the trie for the elements.
   1.209 +    ucharsLength=0;
   1.210 +    int32_t capacity=strings.length();
   1.211 +    if(capacity<1024) {
   1.212 +        capacity=1024;
   1.213 +    }
   1.214 +    if(ucharsCapacity<capacity) {
   1.215 +        uprv_free(uchars);
   1.216 +        uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
   1.217 +        if(uchars==NULL) {
   1.218 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.219 +            ucharsCapacity=0;
   1.220 +            return;
   1.221 +        }
   1.222 +        ucharsCapacity=capacity;
   1.223 +    }
   1.224 +    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
   1.225 +    if(uchars==NULL) {
   1.226 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.227 +    }
   1.228 +}
   1.229 +
   1.230 +int32_t
   1.231 +UCharsTrieBuilder::getElementStringLength(int32_t i) const {
   1.232 +    return elements[i].getStringLength(strings);
   1.233 +}
   1.234 +
   1.235 +UChar
   1.236 +UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
   1.237 +    return elements[i].charAt(unitIndex, strings);
   1.238 +}
   1.239 +
   1.240 +int32_t
   1.241 +UCharsTrieBuilder::getElementValue(int32_t i) const {
   1.242 +    return elements[i].getValue();
   1.243 +}
   1.244 +
   1.245 +int32_t
   1.246 +UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
   1.247 +    const UCharsTrieElement &firstElement=elements[first];
   1.248 +    const UCharsTrieElement &lastElement=elements[last];
   1.249 +    int32_t minStringLength=firstElement.getStringLength(strings);
   1.250 +    while(++unitIndex<minStringLength &&
   1.251 +            firstElement.charAt(unitIndex, strings)==
   1.252 +            lastElement.charAt(unitIndex, strings)) {}
   1.253 +    return unitIndex;
   1.254 +}
   1.255 +
   1.256 +int32_t
   1.257 +UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
   1.258 +    int32_t length=0;  // Number of different units at unitIndex.
   1.259 +    int32_t i=start;
   1.260 +    do {
   1.261 +        UChar unit=elements[i++].charAt(unitIndex, strings);
   1.262 +        while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
   1.263 +            ++i;
   1.264 +        }
   1.265 +        ++length;
   1.266 +    } while(i<limit);
   1.267 +    return length;
   1.268 +}
   1.269 +
   1.270 +int32_t
   1.271 +UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
   1.272 +    do {
   1.273 +        UChar unit=elements[i++].charAt(unitIndex, strings);
   1.274 +        while(unit==elements[i].charAt(unitIndex, strings)) {
   1.275 +            ++i;
   1.276 +        }
   1.277 +    } while(--count>0);
   1.278 +    return i;
   1.279 +}
   1.280 +
   1.281 +int32_t
   1.282 +UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
   1.283 +    while(unit==elements[i].charAt(unitIndex, strings)) {
   1.284 +        ++i;
   1.285 +    }
   1.286 +    return i;
   1.287 +}
   1.288 +
   1.289 +UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
   1.290 +        : LinearMatchNode(len, nextNode), s(units) {
   1.291 +    hash=hash*37+ustr_hashUCharsN(units, len);
   1.292 +}
   1.293 +
   1.294 +UBool
   1.295 +UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
   1.296 +    if(this==&other) {
   1.297 +        return TRUE;
   1.298 +    }
   1.299 +    if(!LinearMatchNode::operator==(other)) {
   1.300 +        return FALSE;
   1.301 +    }
   1.302 +    const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
   1.303 +    return 0==u_memcmp(s, o.s, length);
   1.304 +}
   1.305 +
   1.306 +void
   1.307 +UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
   1.308 +    UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
   1.309 +    next->write(builder);
   1.310 +    b.write(s, length);
   1.311 +    offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
   1.312 +}
   1.313 +
   1.314 +StringTrieBuilder::Node *
   1.315 +UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
   1.316 +                                         Node *nextNode) const {
   1.317 +    return new UCTLinearMatchNode(
   1.318 +            elements[i].getString(strings).getBuffer()+unitIndex,
   1.319 +            length,
   1.320 +            nextNode);
   1.321 +}
   1.322 +
   1.323 +UBool
   1.324 +UCharsTrieBuilder::ensureCapacity(int32_t length) {
   1.325 +    if(uchars==NULL) {
   1.326 +        return FALSE;  // previous memory allocation had failed
   1.327 +    }
   1.328 +    if(length>ucharsCapacity) {
   1.329 +        int32_t newCapacity=ucharsCapacity;
   1.330 +        do {
   1.331 +            newCapacity*=2;
   1.332 +        } while(newCapacity<=length);
   1.333 +        UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
   1.334 +        if(newUChars==NULL) {
   1.335 +            // unable to allocate memory
   1.336 +            uprv_free(uchars);
   1.337 +            uchars=NULL;
   1.338 +            ucharsCapacity=0;
   1.339 +            return FALSE;
   1.340 +        }
   1.341 +        u_memcpy(newUChars+(newCapacity-ucharsLength),
   1.342 +                 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
   1.343 +        uprv_free(uchars);
   1.344 +        uchars=newUChars;
   1.345 +        ucharsCapacity=newCapacity;
   1.346 +    }
   1.347 +    return TRUE;
   1.348 +}
   1.349 +
   1.350 +int32_t
   1.351 +UCharsTrieBuilder::write(int32_t unit) {
   1.352 +    int32_t newLength=ucharsLength+1;
   1.353 +    if(ensureCapacity(newLength)) {
   1.354 +        ucharsLength=newLength;
   1.355 +        uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
   1.356 +    }
   1.357 +    return ucharsLength;
   1.358 +}
   1.359 +
   1.360 +int32_t
   1.361 +UCharsTrieBuilder::write(const UChar *s, int32_t length) {
   1.362 +    int32_t newLength=ucharsLength+length;
   1.363 +    if(ensureCapacity(newLength)) {
   1.364 +        ucharsLength=newLength;
   1.365 +        u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
   1.366 +    }
   1.367 +    return ucharsLength;
   1.368 +}
   1.369 +
   1.370 +int32_t
   1.371 +UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
   1.372 +    return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
   1.373 +}
   1.374 +
   1.375 +int32_t
   1.376 +UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
   1.377 +    if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
   1.378 +        return write(i|(isFinal<<15));
   1.379 +    }
   1.380 +    UChar intUnits[3];
   1.381 +    int32_t length;
   1.382 +    if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
   1.383 +        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
   1.384 +        intUnits[1]=(UChar)((uint32_t)i>>16);
   1.385 +        intUnits[2]=(UChar)i;
   1.386 +        length=3;
   1.387 +    // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
   1.388 +    //     intUnits[0]=(UChar)(i);
   1.389 +    //     length=1;
   1.390 +    } else {
   1.391 +        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
   1.392 +        intUnits[1]=(UChar)i;
   1.393 +        length=2;
   1.394 +    }
   1.395 +    intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
   1.396 +    return write(intUnits, length);
   1.397 +}
   1.398 +
   1.399 +int32_t
   1.400 +UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
   1.401 +    if(!hasValue) {
   1.402 +        return write(node);
   1.403 +    }
   1.404 +    UChar intUnits[3];
   1.405 +    int32_t length;
   1.406 +    if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
   1.407 +        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
   1.408 +        intUnits[1]=(UChar)((uint32_t)value>>16);
   1.409 +        intUnits[2]=(UChar)value;
   1.410 +        length=3;
   1.411 +    } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
   1.412 +        intUnits[0]=(UChar)((value+1)<<6);
   1.413 +        length=1;
   1.414 +    } else {
   1.415 +        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
   1.416 +        intUnits[1]=(UChar)value;
   1.417 +        length=2;
   1.418 +    }
   1.419 +    intUnits[0]|=(UChar)node;
   1.420 +    return write(intUnits, length);
   1.421 +}
   1.422 +
   1.423 +int32_t
   1.424 +UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
   1.425 +    int32_t i=ucharsLength-jumpTarget;
   1.426 +    U_ASSERT(i>=0);
   1.427 +    if(i<=UCharsTrie::kMaxOneUnitDelta) {
   1.428 +        return write(i);
   1.429 +    }
   1.430 +    UChar intUnits[3];
   1.431 +    int32_t length;
   1.432 +    if(i<=UCharsTrie::kMaxTwoUnitDelta) {
   1.433 +        intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
   1.434 +        length=1;
   1.435 +    } else {
   1.436 +        intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
   1.437 +        intUnits[1]=(UChar)(i>>16);
   1.438 +        length=2;
   1.439 +    }
   1.440 +    intUnits[length++]=(UChar)i;
   1.441 +    return write(intUnits, length);
   1.442 +}
   1.443 +
   1.444 +U_NAMESPACE_END

mercurial