1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/bytestriebuilder.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,501 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: bytestriebuilder.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010sep25 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 +#include "unicode/bytestrie.h" 1.20 +#include "unicode/bytestriebuilder.h" 1.21 +#include "unicode/stringpiece.h" 1.22 +#include "charstr.h" 1.23 +#include "cmemory.h" 1.24 +#include "uhash.h" 1.25 +#include "uarrsort.h" 1.26 +#include "uassert.h" 1.27 +#include "ustr_imp.h" 1.28 + 1.29 +U_NAMESPACE_BEGIN 1.30 + 1.31 +/* 1.32 + * Note: This builder implementation stores (bytes, value) pairs with full copies 1.33 + * of the byte sequences, until the BytesTrie is built. 1.34 + * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 1.35 + */ 1.36 + 1.37 +class BytesTrieElement : public UMemory { 1.38 +public: 1.39 + // Use compiler's default constructor, initializes nothing. 1.40 + 1.41 + void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); 1.42 + 1.43 + StringPiece getString(const CharString &strings) const { 1.44 + int32_t offset=stringOffset; 1.45 + int32_t length; 1.46 + if(offset>=0) { 1.47 + length=(uint8_t)strings[offset++]; 1.48 + } else { 1.49 + offset=~offset; 1.50 + length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 1.51 + offset+=2; 1.52 + } 1.53 + return StringPiece(strings.data()+offset, length); 1.54 + } 1.55 + int32_t getStringLength(const CharString &strings) const { 1.56 + int32_t offset=stringOffset; 1.57 + if(offset>=0) { 1.58 + return (uint8_t)strings[offset]; 1.59 + } else { 1.60 + offset=~offset; 1.61 + return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; 1.62 + } 1.63 + } 1.64 + 1.65 + char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } 1.66 + 1.67 + int32_t getValue() const { return value; } 1.68 + 1.69 + int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; 1.70 + 1.71 +private: 1.72 + const char *data(const CharString &strings) const { 1.73 + int32_t offset=stringOffset; 1.74 + if(offset>=0) { 1.75 + ++offset; 1.76 + } else { 1.77 + offset=~offset+2; 1.78 + } 1.79 + return strings.data()+offset; 1.80 + } 1.81 + 1.82 + // If the stringOffset is non-negative, then the first strings byte contains 1.83 + // the string length. 1.84 + // If the stringOffset is negative, then the first two strings bytes contain 1.85 + // the string length (big-endian), and the offset needs to be bit-inverted. 1.86 + // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) 1.87 + int32_t stringOffset; 1.88 + int32_t value; 1.89 +}; 1.90 + 1.91 +void 1.92 +BytesTrieElement::setTo(const StringPiece &s, int32_t val, 1.93 + CharString &strings, UErrorCode &errorCode) { 1.94 + if(U_FAILURE(errorCode)) { 1.95 + return; 1.96 + } 1.97 + int32_t length=s.length(); 1.98 + if(length>0xffff) { 1.99 + // Too long: We store the length in 1 or 2 bytes. 1.100 + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.101 + return; 1.102 + } 1.103 + int32_t offset=strings.length(); 1.104 + if(length>0xff) { 1.105 + offset=~offset; 1.106 + strings.append((char)(length>>8), errorCode); 1.107 + } 1.108 + strings.append((char)length, errorCode); 1.109 + stringOffset=offset; 1.110 + value=val; 1.111 + strings.append(s, errorCode); 1.112 +} 1.113 + 1.114 +int32_t 1.115 +BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { 1.116 + // TODO: add StringPiece::compare(), see ticket #8187 1.117 + StringPiece thisString=getString(strings); 1.118 + StringPiece otherString=other.getString(strings); 1.119 + int32_t lengthDiff=thisString.length()-otherString.length(); 1.120 + int32_t commonLength; 1.121 + if(lengthDiff<=0) { 1.122 + commonLength=thisString.length(); 1.123 + } else { 1.124 + commonLength=otherString.length(); 1.125 + } 1.126 + int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); 1.127 + return diff!=0 ? diff : lengthDiff; 1.128 +} 1.129 + 1.130 +BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) 1.131 + : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), 1.132 + bytes(NULL), bytesCapacity(0), bytesLength(0) { 1.133 + if(U_FAILURE(errorCode)) { 1.134 + return; 1.135 + } 1.136 + strings=new CharString(); 1.137 + if(strings==NULL) { 1.138 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.139 + } 1.140 +} 1.141 + 1.142 +BytesTrieBuilder::~BytesTrieBuilder() { 1.143 + delete strings; 1.144 + delete[] elements; 1.145 + uprv_free(bytes); 1.146 +} 1.147 + 1.148 +BytesTrieBuilder & 1.149 +BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { 1.150 + if(U_FAILURE(errorCode)) { 1.151 + return *this; 1.152 + } 1.153 + if(bytesLength>0) { 1.154 + // Cannot add elements after building. 1.155 + errorCode=U_NO_WRITE_PERMISSION; 1.156 + return *this; 1.157 + } 1.158 + if(elementsLength==elementsCapacity) { 1.159 + int32_t newCapacity; 1.160 + if(elementsCapacity==0) { 1.161 + newCapacity=1024; 1.162 + } else { 1.163 + newCapacity=4*elementsCapacity; 1.164 + } 1.165 + BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; 1.166 + if(newElements==NULL) { 1.167 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.168 + return *this; // error instead of dereferencing null 1.169 + } 1.170 + if(elementsLength>0) { 1.171 + uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); 1.172 + } 1.173 + delete[] elements; 1.174 + elements=newElements; 1.175 + elementsCapacity=newCapacity; 1.176 + } 1.177 + elements[elementsLength++].setTo(s, value, *strings, errorCode); 1.178 + return *this; 1.179 +} 1.180 + 1.181 +U_CDECL_BEGIN 1.182 + 1.183 +static int32_t U_CALLCONV 1.184 +compareElementStrings(const void *context, const void *left, const void *right) { 1.185 + const CharString *strings=static_cast<const CharString *>(context); 1.186 + const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); 1.187 + const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); 1.188 + return leftElement->compareStringTo(*rightElement, *strings); 1.189 +} 1.190 + 1.191 +U_CDECL_END 1.192 + 1.193 +BytesTrie * 1.194 +BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 1.195 + buildBytes(buildOption, errorCode); 1.196 + BytesTrie *newTrie=NULL; 1.197 + if(U_SUCCESS(errorCode)) { 1.198 + newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); 1.199 + if(newTrie==NULL) { 1.200 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.201 + } else { 1.202 + bytes=NULL; // The new trie now owns the array. 1.203 + bytesCapacity=0; 1.204 + } 1.205 + } 1.206 + return newTrie; 1.207 +} 1.208 + 1.209 +StringPiece 1.210 +BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 1.211 + buildBytes(buildOption, errorCode); 1.212 + StringPiece result; 1.213 + if(U_SUCCESS(errorCode)) { 1.214 + result.set(bytes+(bytesCapacity-bytesLength), bytesLength); 1.215 + } 1.216 + return result; 1.217 +} 1.218 + 1.219 +void 1.220 +BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 1.221 + if(U_FAILURE(errorCode)) { 1.222 + return; 1.223 + } 1.224 + if(bytes!=NULL && bytesLength>0) { 1.225 + // Already built. 1.226 + return; 1.227 + } 1.228 + if(bytesLength==0) { 1.229 + if(elementsLength==0) { 1.230 + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.231 + return; 1.232 + } 1.233 + uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), 1.234 + compareElementStrings, strings, 1.235 + FALSE, // need not be a stable sort 1.236 + &errorCode); 1.237 + if(U_FAILURE(errorCode)) { 1.238 + return; 1.239 + } 1.240 + // Duplicate strings are not allowed. 1.241 + StringPiece prev=elements[0].getString(*strings); 1.242 + for(int32_t i=1; i<elementsLength; ++i) { 1.243 + StringPiece current=elements[i].getString(*strings); 1.244 + if(prev==current) { 1.245 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.246 + return; 1.247 + } 1.248 + prev=current; 1.249 + } 1.250 + } 1.251 + // Create and byte-serialize the trie for the elements. 1.252 + bytesLength=0; 1.253 + int32_t capacity=strings->length(); 1.254 + if(capacity<1024) { 1.255 + capacity=1024; 1.256 + } 1.257 + if(bytesCapacity<capacity) { 1.258 + uprv_free(bytes); 1.259 + bytes=static_cast<char *>(uprv_malloc(capacity)); 1.260 + if(bytes==NULL) { 1.261 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.262 + bytesCapacity=0; 1.263 + return; 1.264 + } 1.265 + bytesCapacity=capacity; 1.266 + } 1.267 + StringTrieBuilder::build(buildOption, elementsLength, errorCode); 1.268 + if(bytes==NULL) { 1.269 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.270 + } 1.271 +} 1.272 + 1.273 +BytesTrieBuilder & 1.274 +BytesTrieBuilder::clear() { 1.275 + strings->clear(); 1.276 + elementsLength=0; 1.277 + bytesLength=0; 1.278 + return *this; 1.279 +} 1.280 + 1.281 +int32_t 1.282 +BytesTrieBuilder::getElementStringLength(int32_t i) const { 1.283 + return elements[i].getStringLength(*strings); 1.284 +} 1.285 + 1.286 +UChar 1.287 +BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { 1.288 + return (uint8_t)elements[i].charAt(byteIndex, *strings); 1.289 +} 1.290 + 1.291 +int32_t 1.292 +BytesTrieBuilder::getElementValue(int32_t i) const { 1.293 + return elements[i].getValue(); 1.294 +} 1.295 + 1.296 +int32_t 1.297 +BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { 1.298 + const BytesTrieElement &firstElement=elements[first]; 1.299 + const BytesTrieElement &lastElement=elements[last]; 1.300 + int32_t minStringLength=firstElement.getStringLength(*strings); 1.301 + while(++byteIndex<minStringLength && 1.302 + firstElement.charAt(byteIndex, *strings)== 1.303 + lastElement.charAt(byteIndex, *strings)) {} 1.304 + return byteIndex; 1.305 +} 1.306 + 1.307 +int32_t 1.308 +BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { 1.309 + int32_t length=0; // Number of different bytes at byteIndex. 1.310 + int32_t i=start; 1.311 + do { 1.312 + char byte=elements[i++].charAt(byteIndex, *strings); 1.313 + while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { 1.314 + ++i; 1.315 + } 1.316 + ++length; 1.317 + } while(i<limit); 1.318 + return length; 1.319 +} 1.320 + 1.321 +int32_t 1.322 +BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { 1.323 + do { 1.324 + char byte=elements[i++].charAt(byteIndex, *strings); 1.325 + while(byte==elements[i].charAt(byteIndex, *strings)) { 1.326 + ++i; 1.327 + } 1.328 + } while(--count>0); 1.329 + return i; 1.330 +} 1.331 + 1.332 +int32_t 1.333 +BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { 1.334 + char b=(char)byte; 1.335 + while(b==elements[i].charAt(byteIndex, *strings)) { 1.336 + ++i; 1.337 + } 1.338 + return i; 1.339 +} 1.340 + 1.341 +BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) 1.342 + : LinearMatchNode(len, nextNode), s(bytes) { 1.343 + hash=hash*37+ustr_hashCharsN(bytes, len); 1.344 +} 1.345 + 1.346 +UBool 1.347 +BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { 1.348 + if(this==&other) { 1.349 + return TRUE; 1.350 + } 1.351 + if(!LinearMatchNode::operator==(other)) { 1.352 + return FALSE; 1.353 + } 1.354 + const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; 1.355 + return 0==uprv_memcmp(s, o.s, length); 1.356 +} 1.357 + 1.358 +void 1.359 +BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { 1.360 + BytesTrieBuilder &b=(BytesTrieBuilder &)builder; 1.361 + next->write(builder); 1.362 + b.write(s, length); 1.363 + offset=b.write(b.getMinLinearMatch()+length-1); 1.364 +} 1.365 + 1.366 +StringTrieBuilder::Node * 1.367 +BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, 1.368 + Node *nextNode) const { 1.369 + return new BTLinearMatchNode( 1.370 + elements[i].getString(*strings).data()+byteIndex, 1.371 + length, 1.372 + nextNode); 1.373 +} 1.374 + 1.375 +UBool 1.376 +BytesTrieBuilder::ensureCapacity(int32_t length) { 1.377 + if(bytes==NULL) { 1.378 + return FALSE; // previous memory allocation had failed 1.379 + } 1.380 + if(length>bytesCapacity) { 1.381 + int32_t newCapacity=bytesCapacity; 1.382 + do { 1.383 + newCapacity*=2; 1.384 + } while(newCapacity<=length); 1.385 + char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); 1.386 + if(newBytes==NULL) { 1.387 + // unable to allocate memory 1.388 + uprv_free(bytes); 1.389 + bytes=NULL; 1.390 + bytesCapacity=0; 1.391 + return FALSE; 1.392 + } 1.393 + uprv_memcpy(newBytes+(newCapacity-bytesLength), 1.394 + bytes+(bytesCapacity-bytesLength), bytesLength); 1.395 + uprv_free(bytes); 1.396 + bytes=newBytes; 1.397 + bytesCapacity=newCapacity; 1.398 + } 1.399 + return TRUE; 1.400 +} 1.401 + 1.402 +int32_t 1.403 +BytesTrieBuilder::write(int32_t byte) { 1.404 + int32_t newLength=bytesLength+1; 1.405 + if(ensureCapacity(newLength)) { 1.406 + bytesLength=newLength; 1.407 + bytes[bytesCapacity-bytesLength]=(char)byte; 1.408 + } 1.409 + return bytesLength; 1.410 +} 1.411 + 1.412 +int32_t 1.413 +BytesTrieBuilder::write(const char *b, int32_t length) { 1.414 + int32_t newLength=bytesLength+length; 1.415 + if(ensureCapacity(newLength)) { 1.416 + bytesLength=newLength; 1.417 + uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); 1.418 + } 1.419 + return bytesLength; 1.420 +} 1.421 + 1.422 +int32_t 1.423 +BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { 1.424 + return write(elements[i].getString(*strings).data()+byteIndex, length); 1.425 +} 1.426 + 1.427 +int32_t 1.428 +BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 1.429 + if(0<=i && i<=BytesTrie::kMaxOneByteValue) { 1.430 + return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); 1.431 + } 1.432 + char intBytes[5]; 1.433 + int32_t length=1; 1.434 + if(i<0 || i>0xffffff) { 1.435 + intBytes[0]=(char)BytesTrie::kFiveByteValueLead; 1.436 + intBytes[1]=(char)((uint32_t)i>>24); 1.437 + intBytes[2]=(char)((uint32_t)i>>16); 1.438 + intBytes[3]=(char)((uint32_t)i>>8); 1.439 + intBytes[4]=(char)i; 1.440 + length=5; 1.441 + // } else if(i<=BytesTrie::kMaxOneByteValue) { 1.442 + // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); 1.443 + } else { 1.444 + if(i<=BytesTrie::kMaxTwoByteValue) { 1.445 + intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); 1.446 + } else { 1.447 + if(i<=BytesTrie::kMaxThreeByteValue) { 1.448 + intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); 1.449 + } else { 1.450 + intBytes[0]=(char)BytesTrie::kFourByteValueLead; 1.451 + intBytes[1]=(char)(i>>16); 1.452 + length=2; 1.453 + } 1.454 + intBytes[length++]=(char)(i>>8); 1.455 + } 1.456 + intBytes[length++]=(char)i; 1.457 + } 1.458 + intBytes[0]=(char)((intBytes[0]<<1)|isFinal); 1.459 + return write(intBytes, length); 1.460 +} 1.461 + 1.462 +int32_t 1.463 +BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 1.464 + int32_t offset=write(node); 1.465 + if(hasValue) { 1.466 + offset=writeValueAndFinal(value, FALSE); 1.467 + } 1.468 + return offset; 1.469 +} 1.470 + 1.471 +int32_t 1.472 +BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 1.473 + int32_t i=bytesLength-jumpTarget; 1.474 + U_ASSERT(i>=0); 1.475 + if(i<=BytesTrie::kMaxOneByteDelta) { 1.476 + return write(i); 1.477 + } 1.478 + char intBytes[5]; 1.479 + int32_t length; 1.480 + if(i<=BytesTrie::kMaxTwoByteDelta) { 1.481 + intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); 1.482 + length=1; 1.483 + } else { 1.484 + if(i<=BytesTrie::kMaxThreeByteDelta) { 1.485 + intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); 1.486 + length=2; 1.487 + } else { 1.488 + if(i<=0xffffff) { 1.489 + intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; 1.490 + length=3; 1.491 + } else { 1.492 + intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; 1.493 + intBytes[1]=(char)(i>>24); 1.494 + length=4; 1.495 + } 1.496 + intBytes[1]=(char)(i>>16); 1.497 + } 1.498 + intBytes[1]=(char)(i>>8); 1.499 + } 1.500 + intBytes[length++]=(char)i; 1.501 + return write(intBytes, length); 1.502 +} 1.503 + 1.504 +U_NAMESPACE_END