intl/icu/source/common/bytestriebuilder.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/bytestriebuilder.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,501 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2010-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  bytestriebuilder.cpp
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2010sep25
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#include "unicode/utypes.h"
    1.19 +#include "unicode/bytestrie.h"
    1.20 +#include "unicode/bytestriebuilder.h"
    1.21 +#include "unicode/stringpiece.h"
    1.22 +#include "charstr.h"
    1.23 +#include "cmemory.h"
    1.24 +#include "uhash.h"
    1.25 +#include "uarrsort.h"
    1.26 +#include "uassert.h"
    1.27 +#include "ustr_imp.h"
    1.28 +
    1.29 +U_NAMESPACE_BEGIN
    1.30 +
    1.31 +/*
    1.32 + * Note: This builder implementation stores (bytes, value) pairs with full copies
    1.33 + * of the byte sequences, until the BytesTrie is built.
    1.34 + * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
    1.35 + */
    1.36 +
    1.37 +class BytesTrieElement : public UMemory {
    1.38 +public:
    1.39 +    // Use compiler's default constructor, initializes nothing.
    1.40 +
    1.41 +    void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
    1.42 +
    1.43 +    StringPiece getString(const CharString &strings) const {
    1.44 +        int32_t offset=stringOffset;
    1.45 +        int32_t length;
    1.46 +        if(offset>=0) {
    1.47 +            length=(uint8_t)strings[offset++];
    1.48 +        } else {
    1.49 +            offset=~offset;
    1.50 +            length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
    1.51 +            offset+=2;
    1.52 +        }
    1.53 +        return StringPiece(strings.data()+offset, length);
    1.54 +    }
    1.55 +    int32_t getStringLength(const CharString &strings) const {
    1.56 +        int32_t offset=stringOffset;
    1.57 +        if(offset>=0) {
    1.58 +            return (uint8_t)strings[offset];
    1.59 +        } else {
    1.60 +            offset=~offset;
    1.61 +            return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
    1.62 +        }
    1.63 +    }
    1.64 +
    1.65 +    char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
    1.66 +
    1.67 +    int32_t getValue() const { return value; }
    1.68 +
    1.69 +    int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
    1.70 +
    1.71 +private:
    1.72 +    const char *data(const CharString &strings) const {
    1.73 +        int32_t offset=stringOffset;
    1.74 +        if(offset>=0) {
    1.75 +            ++offset;
    1.76 +        } else {
    1.77 +            offset=~offset+2;
    1.78 +        }
    1.79 +        return strings.data()+offset;
    1.80 +    }
    1.81 +
    1.82 +    // If the stringOffset is non-negative, then the first strings byte contains
    1.83 +    // the string length.
    1.84 +    // If the stringOffset is negative, then the first two strings bytes contain
    1.85 +    // the string length (big-endian), and the offset needs to be bit-inverted.
    1.86 +    // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
    1.87 +    int32_t stringOffset;
    1.88 +    int32_t value;
    1.89 +};
    1.90 +
    1.91 +void
    1.92 +BytesTrieElement::setTo(const StringPiece &s, int32_t val,
    1.93 +                        CharString &strings, UErrorCode &errorCode) {
    1.94 +    if(U_FAILURE(errorCode)) {
    1.95 +        return;
    1.96 +    }
    1.97 +    int32_t length=s.length();
    1.98 +    if(length>0xffff) {
    1.99 +        // Too long: We store the length in 1 or 2 bytes.
   1.100 +        errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1.101 +        return;
   1.102 +    }
   1.103 +    int32_t offset=strings.length();
   1.104 +    if(length>0xff) {
   1.105 +        offset=~offset;
   1.106 +        strings.append((char)(length>>8), errorCode);
   1.107 +    }
   1.108 +    strings.append((char)length, errorCode);
   1.109 +    stringOffset=offset;
   1.110 +    value=val;
   1.111 +    strings.append(s, errorCode);
   1.112 +}
   1.113 +
   1.114 +int32_t
   1.115 +BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
   1.116 +    // TODO: add StringPiece::compare(), see ticket #8187
   1.117 +    StringPiece thisString=getString(strings);
   1.118 +    StringPiece otherString=other.getString(strings);
   1.119 +    int32_t lengthDiff=thisString.length()-otherString.length();
   1.120 +    int32_t commonLength;
   1.121 +    if(lengthDiff<=0) {
   1.122 +        commonLength=thisString.length();
   1.123 +    } else {
   1.124 +        commonLength=otherString.length();
   1.125 +    }
   1.126 +    int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
   1.127 +    return diff!=0 ? diff : lengthDiff;
   1.128 +}
   1.129 +
   1.130 +BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
   1.131 +        : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
   1.132 +          bytes(NULL), bytesCapacity(0), bytesLength(0) {
   1.133 +    if(U_FAILURE(errorCode)) {
   1.134 +        return;
   1.135 +    }
   1.136 +    strings=new CharString();
   1.137 +    if(strings==NULL) {
   1.138 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.139 +    }
   1.140 +}
   1.141 +
   1.142 +BytesTrieBuilder::~BytesTrieBuilder() {
   1.143 +    delete strings;
   1.144 +    delete[] elements;
   1.145 +    uprv_free(bytes);
   1.146 +}
   1.147 +
   1.148 +BytesTrieBuilder &
   1.149 +BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
   1.150 +    if(U_FAILURE(errorCode)) {
   1.151 +        return *this;
   1.152 +    }
   1.153 +    if(bytesLength>0) {
   1.154 +        // Cannot add elements after building.
   1.155 +        errorCode=U_NO_WRITE_PERMISSION;
   1.156 +        return *this;
   1.157 +    }
   1.158 +    if(elementsLength==elementsCapacity) {
   1.159 +        int32_t newCapacity;
   1.160 +        if(elementsCapacity==0) {
   1.161 +            newCapacity=1024;
   1.162 +        } else {
   1.163 +            newCapacity=4*elementsCapacity;
   1.164 +        }
   1.165 +        BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
   1.166 +        if(newElements==NULL) {
   1.167 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.168 +            return *this; // error instead of dereferencing null
   1.169 +        }
   1.170 +        if(elementsLength>0) {
   1.171 +            uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
   1.172 +        }
   1.173 +        delete[] elements;
   1.174 +        elements=newElements;
   1.175 +        elementsCapacity=newCapacity;
   1.176 +    }
   1.177 +    elements[elementsLength++].setTo(s, value, *strings, errorCode);
   1.178 +    return *this;
   1.179 +}
   1.180 +
   1.181 +U_CDECL_BEGIN
   1.182 +
   1.183 +static int32_t U_CALLCONV
   1.184 +compareElementStrings(const void *context, const void *left, const void *right) {
   1.185 +    const CharString *strings=static_cast<const CharString *>(context);
   1.186 +    const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
   1.187 +    const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
   1.188 +    return leftElement->compareStringTo(*rightElement, *strings);
   1.189 +}
   1.190 +
   1.191 +U_CDECL_END
   1.192 +
   1.193 +BytesTrie *
   1.194 +BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   1.195 +    buildBytes(buildOption, errorCode);
   1.196 +    BytesTrie *newTrie=NULL;
   1.197 +    if(U_SUCCESS(errorCode)) {
   1.198 +        newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
   1.199 +        if(newTrie==NULL) {
   1.200 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.201 +        } else {
   1.202 +            bytes=NULL;  // The new trie now owns the array.
   1.203 +            bytesCapacity=0;
   1.204 +        }
   1.205 +    }
   1.206 +    return newTrie;
   1.207 +}
   1.208 +
   1.209 +StringPiece
   1.210 +BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   1.211 +    buildBytes(buildOption, errorCode);
   1.212 +    StringPiece result;
   1.213 +    if(U_SUCCESS(errorCode)) {
   1.214 +        result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
   1.215 +    }
   1.216 +    return result;
   1.217 +}
   1.218 +
   1.219 +void
   1.220 +BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
   1.221 +    if(U_FAILURE(errorCode)) {
   1.222 +        return;
   1.223 +    }
   1.224 +    if(bytes!=NULL && bytesLength>0) {
   1.225 +        // Already built.
   1.226 +        return;
   1.227 +    }
   1.228 +    if(bytesLength==0) {
   1.229 +        if(elementsLength==0) {
   1.230 +            errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   1.231 +            return;
   1.232 +        }
   1.233 +        uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
   1.234 +                      compareElementStrings, strings,
   1.235 +                      FALSE,  // need not be a stable sort
   1.236 +                      &errorCode);
   1.237 +        if(U_FAILURE(errorCode)) {
   1.238 +            return;
   1.239 +        }
   1.240 +        // Duplicate strings are not allowed.
   1.241 +        StringPiece prev=elements[0].getString(*strings);
   1.242 +        for(int32_t i=1; i<elementsLength; ++i) {
   1.243 +            StringPiece current=elements[i].getString(*strings);
   1.244 +            if(prev==current) {
   1.245 +                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.246 +                return;
   1.247 +            }
   1.248 +            prev=current;
   1.249 +        }
   1.250 +    }
   1.251 +    // Create and byte-serialize the trie for the elements.
   1.252 +    bytesLength=0;
   1.253 +    int32_t capacity=strings->length();
   1.254 +    if(capacity<1024) {
   1.255 +        capacity=1024;
   1.256 +    }
   1.257 +    if(bytesCapacity<capacity) {
   1.258 +        uprv_free(bytes);
   1.259 +        bytes=static_cast<char *>(uprv_malloc(capacity));
   1.260 +        if(bytes==NULL) {
   1.261 +            errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.262 +            bytesCapacity=0;
   1.263 +            return;
   1.264 +        }
   1.265 +        bytesCapacity=capacity;
   1.266 +    }
   1.267 +    StringTrieBuilder::build(buildOption, elementsLength, errorCode);
   1.268 +    if(bytes==NULL) {
   1.269 +        errorCode=U_MEMORY_ALLOCATION_ERROR;
   1.270 +    }
   1.271 +}
   1.272 +
   1.273 +BytesTrieBuilder &
   1.274 +BytesTrieBuilder::clear() {
   1.275 +    strings->clear();
   1.276 +    elementsLength=0;
   1.277 +    bytesLength=0;
   1.278 +    return *this;
   1.279 +}
   1.280 +
   1.281 +int32_t
   1.282 +BytesTrieBuilder::getElementStringLength(int32_t i) const {
   1.283 +    return elements[i].getStringLength(*strings);
   1.284 +}
   1.285 +
   1.286 +UChar
   1.287 +BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
   1.288 +    return (uint8_t)elements[i].charAt(byteIndex, *strings);
   1.289 +}
   1.290 +
   1.291 +int32_t
   1.292 +BytesTrieBuilder::getElementValue(int32_t i) const {
   1.293 +    return elements[i].getValue();
   1.294 +}
   1.295 +
   1.296 +int32_t
   1.297 +BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
   1.298 +    const BytesTrieElement &firstElement=elements[first];
   1.299 +    const BytesTrieElement &lastElement=elements[last];
   1.300 +    int32_t minStringLength=firstElement.getStringLength(*strings);
   1.301 +    while(++byteIndex<minStringLength &&
   1.302 +            firstElement.charAt(byteIndex, *strings)==
   1.303 +            lastElement.charAt(byteIndex, *strings)) {}
   1.304 +    return byteIndex;
   1.305 +}
   1.306 +
   1.307 +int32_t
   1.308 +BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
   1.309 +    int32_t length=0;  // Number of different bytes at byteIndex.
   1.310 +    int32_t i=start;
   1.311 +    do {
   1.312 +        char byte=elements[i++].charAt(byteIndex, *strings);
   1.313 +        while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
   1.314 +            ++i;
   1.315 +        }
   1.316 +        ++length;
   1.317 +    } while(i<limit);
   1.318 +    return length;
   1.319 +}
   1.320 +
   1.321 +int32_t
   1.322 +BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
   1.323 +    do {
   1.324 +        char byte=elements[i++].charAt(byteIndex, *strings);
   1.325 +        while(byte==elements[i].charAt(byteIndex, *strings)) {
   1.326 +            ++i;
   1.327 +        }
   1.328 +    } while(--count>0);
   1.329 +    return i;
   1.330 +}
   1.331 +
   1.332 +int32_t
   1.333 +BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
   1.334 +    char b=(char)byte;
   1.335 +    while(b==elements[i].charAt(byteIndex, *strings)) {
   1.336 +        ++i;
   1.337 +    }
   1.338 +    return i;
   1.339 +}
   1.340 +
   1.341 +BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
   1.342 +        : LinearMatchNode(len, nextNode), s(bytes) {
   1.343 +    hash=hash*37+ustr_hashCharsN(bytes, len);
   1.344 +}
   1.345 +
   1.346 +UBool
   1.347 +BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
   1.348 +    if(this==&other) {
   1.349 +        return TRUE;
   1.350 +    }
   1.351 +    if(!LinearMatchNode::operator==(other)) {
   1.352 +        return FALSE;
   1.353 +    }
   1.354 +    const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
   1.355 +    return 0==uprv_memcmp(s, o.s, length);
   1.356 +}
   1.357 +
   1.358 +void
   1.359 +BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
   1.360 +    BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
   1.361 +    next->write(builder);
   1.362 +    b.write(s, length);
   1.363 +    offset=b.write(b.getMinLinearMatch()+length-1);
   1.364 +}
   1.365 +
   1.366 +StringTrieBuilder::Node *
   1.367 +BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
   1.368 +                                        Node *nextNode) const {
   1.369 +    return new BTLinearMatchNode(
   1.370 +            elements[i].getString(*strings).data()+byteIndex,
   1.371 +            length,
   1.372 +            nextNode);
   1.373 +}
   1.374 +
   1.375 +UBool
   1.376 +BytesTrieBuilder::ensureCapacity(int32_t length) {
   1.377 +    if(bytes==NULL) {
   1.378 +        return FALSE;  // previous memory allocation had failed
   1.379 +    }
   1.380 +    if(length>bytesCapacity) {
   1.381 +        int32_t newCapacity=bytesCapacity;
   1.382 +        do {
   1.383 +            newCapacity*=2;
   1.384 +        } while(newCapacity<=length);
   1.385 +        char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
   1.386 +        if(newBytes==NULL) {
   1.387 +            // unable to allocate memory
   1.388 +            uprv_free(bytes);
   1.389 +            bytes=NULL;
   1.390 +            bytesCapacity=0;
   1.391 +            return FALSE;
   1.392 +        }
   1.393 +        uprv_memcpy(newBytes+(newCapacity-bytesLength),
   1.394 +                    bytes+(bytesCapacity-bytesLength), bytesLength);
   1.395 +        uprv_free(bytes);
   1.396 +        bytes=newBytes;
   1.397 +        bytesCapacity=newCapacity;
   1.398 +    }
   1.399 +    return TRUE;
   1.400 +}
   1.401 +
   1.402 +int32_t
   1.403 +BytesTrieBuilder::write(int32_t byte) {
   1.404 +    int32_t newLength=bytesLength+1;
   1.405 +    if(ensureCapacity(newLength)) {
   1.406 +        bytesLength=newLength;
   1.407 +        bytes[bytesCapacity-bytesLength]=(char)byte;
   1.408 +    }
   1.409 +    return bytesLength;
   1.410 +}
   1.411 +
   1.412 +int32_t
   1.413 +BytesTrieBuilder::write(const char *b, int32_t length) {
   1.414 +    int32_t newLength=bytesLength+length;
   1.415 +    if(ensureCapacity(newLength)) {
   1.416 +        bytesLength=newLength;
   1.417 +        uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
   1.418 +    }
   1.419 +    return bytesLength;
   1.420 +}
   1.421 +
   1.422 +int32_t
   1.423 +BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
   1.424 +    return write(elements[i].getString(*strings).data()+byteIndex, length);
   1.425 +}
   1.426 +
   1.427 +int32_t
   1.428 +BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
   1.429 +    if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
   1.430 +        return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
   1.431 +    }
   1.432 +    char intBytes[5];
   1.433 +    int32_t length=1;
   1.434 +    if(i<0 || i>0xffffff) {
   1.435 +        intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
   1.436 +        intBytes[1]=(char)((uint32_t)i>>24);
   1.437 +        intBytes[2]=(char)((uint32_t)i>>16);
   1.438 +        intBytes[3]=(char)((uint32_t)i>>8);
   1.439 +        intBytes[4]=(char)i;
   1.440 +        length=5;
   1.441 +    // } else if(i<=BytesTrie::kMaxOneByteValue) {
   1.442 +    //     intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
   1.443 +    } else {
   1.444 +        if(i<=BytesTrie::kMaxTwoByteValue) {
   1.445 +            intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
   1.446 +        } else {
   1.447 +            if(i<=BytesTrie::kMaxThreeByteValue) {
   1.448 +                intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
   1.449 +            } else {
   1.450 +                intBytes[0]=(char)BytesTrie::kFourByteValueLead;
   1.451 +                intBytes[1]=(char)(i>>16);
   1.452 +                length=2;
   1.453 +            }
   1.454 +            intBytes[length++]=(char)(i>>8);
   1.455 +        }
   1.456 +        intBytes[length++]=(char)i;
   1.457 +    }
   1.458 +    intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
   1.459 +    return write(intBytes, length);
   1.460 +}
   1.461 +
   1.462 +int32_t
   1.463 +BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
   1.464 +    int32_t offset=write(node);
   1.465 +    if(hasValue) {
   1.466 +        offset=writeValueAndFinal(value, FALSE);
   1.467 +    }
   1.468 +    return offset;
   1.469 +}
   1.470 +
   1.471 +int32_t
   1.472 +BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
   1.473 +    int32_t i=bytesLength-jumpTarget;
   1.474 +    U_ASSERT(i>=0);
   1.475 +    if(i<=BytesTrie::kMaxOneByteDelta) {
   1.476 +        return write(i);
   1.477 +    }
   1.478 +    char intBytes[5];
   1.479 +    int32_t length;
   1.480 +    if(i<=BytesTrie::kMaxTwoByteDelta) {
   1.481 +        intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
   1.482 +        length=1;
   1.483 +    } else {
   1.484 +        if(i<=BytesTrie::kMaxThreeByteDelta) {
   1.485 +            intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
   1.486 +            length=2;
   1.487 +        } else {
   1.488 +            if(i<=0xffffff) {
   1.489 +                intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
   1.490 +                length=3;
   1.491 +            } else {
   1.492 +                intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
   1.493 +                intBytes[1]=(char)(i>>24);
   1.494 +                length=4;
   1.495 +            }
   1.496 +            intBytes[1]=(char)(i>>16);
   1.497 +        }
   1.498 +        intBytes[1]=(char)(i>>8);
   1.499 +    }
   1.500 +    intBytes[length++]=(char)i;
   1.501 +    return write(intBytes, length);
   1.502 +}
   1.503 +
   1.504 +U_NAMESPACE_END

mercurial