intl/icu/source/common/bytestriebuilder.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2010-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: bytestriebuilder.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2010sep25
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16 #include "unicode/bytestrie.h"
michael@0 17 #include "unicode/bytestriebuilder.h"
michael@0 18 #include "unicode/stringpiece.h"
michael@0 19 #include "charstr.h"
michael@0 20 #include "cmemory.h"
michael@0 21 #include "uhash.h"
michael@0 22 #include "uarrsort.h"
michael@0 23 #include "uassert.h"
michael@0 24 #include "ustr_imp.h"
michael@0 25
michael@0 26 U_NAMESPACE_BEGIN
michael@0 27
michael@0 28 /*
michael@0 29 * Note: This builder implementation stores (bytes, value) pairs with full copies
michael@0 30 * of the byte sequences, until the BytesTrie is built.
michael@0 31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
michael@0 32 */
michael@0 33
michael@0 34 class BytesTrieElement : public UMemory {
michael@0 35 public:
michael@0 36 // Use compiler's default constructor, initializes nothing.
michael@0 37
michael@0 38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode);
michael@0 39
michael@0 40 StringPiece getString(const CharString &strings) const {
michael@0 41 int32_t offset=stringOffset;
michael@0 42 int32_t length;
michael@0 43 if(offset>=0) {
michael@0 44 length=(uint8_t)strings[offset++];
michael@0 45 } else {
michael@0 46 offset=~offset;
michael@0 47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
michael@0 48 offset+=2;
michael@0 49 }
michael@0 50 return StringPiece(strings.data()+offset, length);
michael@0 51 }
michael@0 52 int32_t getStringLength(const CharString &strings) const {
michael@0 53 int32_t offset=stringOffset;
michael@0 54 if(offset>=0) {
michael@0 55 return (uint8_t)strings[offset];
michael@0 56 } else {
michael@0 57 offset=~offset;
michael@0 58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1];
michael@0 59 }
michael@0 60 }
michael@0 61
michael@0 62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; }
michael@0 63
michael@0 64 int32_t getValue() const { return value; }
michael@0 65
michael@0 66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const;
michael@0 67
michael@0 68 private:
michael@0 69 const char *data(const CharString &strings) const {
michael@0 70 int32_t offset=stringOffset;
michael@0 71 if(offset>=0) {
michael@0 72 ++offset;
michael@0 73 } else {
michael@0 74 offset=~offset+2;
michael@0 75 }
michael@0 76 return strings.data()+offset;
michael@0 77 }
michael@0 78
michael@0 79 // If the stringOffset is non-negative, then the first strings byte contains
michael@0 80 // the string length.
michael@0 81 // If the stringOffset is negative, then the first two strings bytes contain
michael@0 82 // the string length (big-endian), and the offset needs to be bit-inverted.
michael@0 83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.)
michael@0 84 int32_t stringOffset;
michael@0 85 int32_t value;
michael@0 86 };
michael@0 87
michael@0 88 void
michael@0 89 BytesTrieElement::setTo(const StringPiece &s, int32_t val,
michael@0 90 CharString &strings, UErrorCode &errorCode) {
michael@0 91 if(U_FAILURE(errorCode)) {
michael@0 92 return;
michael@0 93 }
michael@0 94 int32_t length=s.length();
michael@0 95 if(length>0xffff) {
michael@0 96 // Too long: We store the length in 1 or 2 bytes.
michael@0 97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 98 return;
michael@0 99 }
michael@0 100 int32_t offset=strings.length();
michael@0 101 if(length>0xff) {
michael@0 102 offset=~offset;
michael@0 103 strings.append((char)(length>>8), errorCode);
michael@0 104 }
michael@0 105 strings.append((char)length, errorCode);
michael@0 106 stringOffset=offset;
michael@0 107 value=val;
michael@0 108 strings.append(s, errorCode);
michael@0 109 }
michael@0 110
michael@0 111 int32_t
michael@0 112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const {
michael@0 113 // TODO: add StringPiece::compare(), see ticket #8187
michael@0 114 StringPiece thisString=getString(strings);
michael@0 115 StringPiece otherString=other.getString(strings);
michael@0 116 int32_t lengthDiff=thisString.length()-otherString.length();
michael@0 117 int32_t commonLength;
michael@0 118 if(lengthDiff<=0) {
michael@0 119 commonLength=thisString.length();
michael@0 120 } else {
michael@0 121 commonLength=otherString.length();
michael@0 122 }
michael@0 123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength);
michael@0 124 return diff!=0 ? diff : lengthDiff;
michael@0 125 }
michael@0 126
michael@0 127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode)
michael@0 128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0),
michael@0 129 bytes(NULL), bytesCapacity(0), bytesLength(0) {
michael@0 130 if(U_FAILURE(errorCode)) {
michael@0 131 return;
michael@0 132 }
michael@0 133 strings=new CharString();
michael@0 134 if(strings==NULL) {
michael@0 135 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 136 }
michael@0 137 }
michael@0 138
michael@0 139 BytesTrieBuilder::~BytesTrieBuilder() {
michael@0 140 delete strings;
michael@0 141 delete[] elements;
michael@0 142 uprv_free(bytes);
michael@0 143 }
michael@0 144
michael@0 145 BytesTrieBuilder &
michael@0 146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) {
michael@0 147 if(U_FAILURE(errorCode)) {
michael@0 148 return *this;
michael@0 149 }
michael@0 150 if(bytesLength>0) {
michael@0 151 // Cannot add elements after building.
michael@0 152 errorCode=U_NO_WRITE_PERMISSION;
michael@0 153 return *this;
michael@0 154 }
michael@0 155 if(elementsLength==elementsCapacity) {
michael@0 156 int32_t newCapacity;
michael@0 157 if(elementsCapacity==0) {
michael@0 158 newCapacity=1024;
michael@0 159 } else {
michael@0 160 newCapacity=4*elementsCapacity;
michael@0 161 }
michael@0 162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity];
michael@0 163 if(newElements==NULL) {
michael@0 164 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 165 return *this; // error instead of dereferencing null
michael@0 166 }
michael@0 167 if(elementsLength>0) {
michael@0 168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement));
michael@0 169 }
michael@0 170 delete[] elements;
michael@0 171 elements=newElements;
michael@0 172 elementsCapacity=newCapacity;
michael@0 173 }
michael@0 174 elements[elementsLength++].setTo(s, value, *strings, errorCode);
michael@0 175 return *this;
michael@0 176 }
michael@0 177
michael@0 178 U_CDECL_BEGIN
michael@0 179
michael@0 180 static int32_t U_CALLCONV
michael@0 181 compareElementStrings(const void *context, const void *left, const void *right) {
michael@0 182 const CharString *strings=static_cast<const CharString *>(context);
michael@0 183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left);
michael@0 184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right);
michael@0 185 return leftElement->compareStringTo(*rightElement, *strings);
michael@0 186 }
michael@0 187
michael@0 188 U_CDECL_END
michael@0 189
michael@0 190 BytesTrie *
michael@0 191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
michael@0 192 buildBytes(buildOption, errorCode);
michael@0 193 BytesTrie *newTrie=NULL;
michael@0 194 if(U_SUCCESS(errorCode)) {
michael@0 195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength));
michael@0 196 if(newTrie==NULL) {
michael@0 197 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 198 } else {
michael@0 199 bytes=NULL; // The new trie now owns the array.
michael@0 200 bytesCapacity=0;
michael@0 201 }
michael@0 202 }
michael@0 203 return newTrie;
michael@0 204 }
michael@0 205
michael@0 206 StringPiece
michael@0 207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
michael@0 208 buildBytes(buildOption, errorCode);
michael@0 209 StringPiece result;
michael@0 210 if(U_SUCCESS(errorCode)) {
michael@0 211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength);
michael@0 212 }
michael@0 213 return result;
michael@0 214 }
michael@0 215
michael@0 216 void
michael@0 217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
michael@0 218 if(U_FAILURE(errorCode)) {
michael@0 219 return;
michael@0 220 }
michael@0 221 if(bytes!=NULL && bytesLength>0) {
michael@0 222 // Already built.
michael@0 223 return;
michael@0 224 }
michael@0 225 if(bytesLength==0) {
michael@0 226 if(elementsLength==0) {
michael@0 227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 228 return;
michael@0 229 }
michael@0 230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement),
michael@0 231 compareElementStrings, strings,
michael@0 232 FALSE, // need not be a stable sort
michael@0 233 &errorCode);
michael@0 234 if(U_FAILURE(errorCode)) {
michael@0 235 return;
michael@0 236 }
michael@0 237 // Duplicate strings are not allowed.
michael@0 238 StringPiece prev=elements[0].getString(*strings);
michael@0 239 for(int32_t i=1; i<elementsLength; ++i) {
michael@0 240 StringPiece current=elements[i].getString(*strings);
michael@0 241 if(prev==current) {
michael@0 242 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 243 return;
michael@0 244 }
michael@0 245 prev=current;
michael@0 246 }
michael@0 247 }
michael@0 248 // Create and byte-serialize the trie for the elements.
michael@0 249 bytesLength=0;
michael@0 250 int32_t capacity=strings->length();
michael@0 251 if(capacity<1024) {
michael@0 252 capacity=1024;
michael@0 253 }
michael@0 254 if(bytesCapacity<capacity) {
michael@0 255 uprv_free(bytes);
michael@0 256 bytes=static_cast<char *>(uprv_malloc(capacity));
michael@0 257 if(bytes==NULL) {
michael@0 258 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 259 bytesCapacity=0;
michael@0 260 return;
michael@0 261 }
michael@0 262 bytesCapacity=capacity;
michael@0 263 }
michael@0 264 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
michael@0 265 if(bytes==NULL) {
michael@0 266 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 267 }
michael@0 268 }
michael@0 269
michael@0 270 BytesTrieBuilder &
michael@0 271 BytesTrieBuilder::clear() {
michael@0 272 strings->clear();
michael@0 273 elementsLength=0;
michael@0 274 bytesLength=0;
michael@0 275 return *this;
michael@0 276 }
michael@0 277
michael@0 278 int32_t
michael@0 279 BytesTrieBuilder::getElementStringLength(int32_t i) const {
michael@0 280 return elements[i].getStringLength(*strings);
michael@0 281 }
michael@0 282
michael@0 283 UChar
michael@0 284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const {
michael@0 285 return (uint8_t)elements[i].charAt(byteIndex, *strings);
michael@0 286 }
michael@0 287
michael@0 288 int32_t
michael@0 289 BytesTrieBuilder::getElementValue(int32_t i) const {
michael@0 290 return elements[i].getValue();
michael@0 291 }
michael@0 292
michael@0 293 int32_t
michael@0 294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const {
michael@0 295 const BytesTrieElement &firstElement=elements[first];
michael@0 296 const BytesTrieElement &lastElement=elements[last];
michael@0 297 int32_t minStringLength=firstElement.getStringLength(*strings);
michael@0 298 while(++byteIndex<minStringLength &&
michael@0 299 firstElement.charAt(byteIndex, *strings)==
michael@0 300 lastElement.charAt(byteIndex, *strings)) {}
michael@0 301 return byteIndex;
michael@0 302 }
michael@0 303
michael@0 304 int32_t
michael@0 305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const {
michael@0 306 int32_t length=0; // Number of different bytes at byteIndex.
michael@0 307 int32_t i=start;
michael@0 308 do {
michael@0 309 char byte=elements[i++].charAt(byteIndex, *strings);
michael@0 310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) {
michael@0 311 ++i;
michael@0 312 }
michael@0 313 ++length;
michael@0 314 } while(i<limit);
michael@0 315 return length;
michael@0 316 }
michael@0 317
michael@0 318 int32_t
michael@0 319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const {
michael@0 320 do {
michael@0 321 char byte=elements[i++].charAt(byteIndex, *strings);
michael@0 322 while(byte==elements[i].charAt(byteIndex, *strings)) {
michael@0 323 ++i;
michael@0 324 }
michael@0 325 } while(--count>0);
michael@0 326 return i;
michael@0 327 }
michael@0 328
michael@0 329 int32_t
michael@0 330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const {
michael@0 331 char b=(char)byte;
michael@0 332 while(b==elements[i].charAt(byteIndex, *strings)) {
michael@0 333 ++i;
michael@0 334 }
michael@0 335 return i;
michael@0 336 }
michael@0 337
michael@0 338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode)
michael@0 339 : LinearMatchNode(len, nextNode), s(bytes) {
michael@0 340 hash=hash*37+ustr_hashCharsN(bytes, len);
michael@0 341 }
michael@0 342
michael@0 343 UBool
michael@0 344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const {
michael@0 345 if(this==&other) {
michael@0 346 return TRUE;
michael@0 347 }
michael@0 348 if(!LinearMatchNode::operator==(other)) {
michael@0 349 return FALSE;
michael@0 350 }
michael@0 351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other;
michael@0 352 return 0==uprv_memcmp(s, o.s, length);
michael@0 353 }
michael@0 354
michael@0 355 void
michael@0 356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) {
michael@0 357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder;
michael@0 358 next->write(builder);
michael@0 359 b.write(s, length);
michael@0 360 offset=b.write(b.getMinLinearMatch()+length-1);
michael@0 361 }
michael@0 362
michael@0 363 StringTrieBuilder::Node *
michael@0 364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length,
michael@0 365 Node *nextNode) const {
michael@0 366 return new BTLinearMatchNode(
michael@0 367 elements[i].getString(*strings).data()+byteIndex,
michael@0 368 length,
michael@0 369 nextNode);
michael@0 370 }
michael@0 371
michael@0 372 UBool
michael@0 373 BytesTrieBuilder::ensureCapacity(int32_t length) {
michael@0 374 if(bytes==NULL) {
michael@0 375 return FALSE; // previous memory allocation had failed
michael@0 376 }
michael@0 377 if(length>bytesCapacity) {
michael@0 378 int32_t newCapacity=bytesCapacity;
michael@0 379 do {
michael@0 380 newCapacity*=2;
michael@0 381 } while(newCapacity<=length);
michael@0 382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity));
michael@0 383 if(newBytes==NULL) {
michael@0 384 // unable to allocate memory
michael@0 385 uprv_free(bytes);
michael@0 386 bytes=NULL;
michael@0 387 bytesCapacity=0;
michael@0 388 return FALSE;
michael@0 389 }
michael@0 390 uprv_memcpy(newBytes+(newCapacity-bytesLength),
michael@0 391 bytes+(bytesCapacity-bytesLength), bytesLength);
michael@0 392 uprv_free(bytes);
michael@0 393 bytes=newBytes;
michael@0 394 bytesCapacity=newCapacity;
michael@0 395 }
michael@0 396 return TRUE;
michael@0 397 }
michael@0 398
michael@0 399 int32_t
michael@0 400 BytesTrieBuilder::write(int32_t byte) {
michael@0 401 int32_t newLength=bytesLength+1;
michael@0 402 if(ensureCapacity(newLength)) {
michael@0 403 bytesLength=newLength;
michael@0 404 bytes[bytesCapacity-bytesLength]=(char)byte;
michael@0 405 }
michael@0 406 return bytesLength;
michael@0 407 }
michael@0 408
michael@0 409 int32_t
michael@0 410 BytesTrieBuilder::write(const char *b, int32_t length) {
michael@0 411 int32_t newLength=bytesLength+length;
michael@0 412 if(ensureCapacity(newLength)) {
michael@0 413 bytesLength=newLength;
michael@0 414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length);
michael@0 415 }
michael@0 416 return bytesLength;
michael@0 417 }
michael@0 418
michael@0 419 int32_t
michael@0 420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) {
michael@0 421 return write(elements[i].getString(*strings).data()+byteIndex, length);
michael@0 422 }
michael@0 423
michael@0 424 int32_t
michael@0 425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
michael@0 426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) {
michael@0 427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal);
michael@0 428 }
michael@0 429 char intBytes[5];
michael@0 430 int32_t length=1;
michael@0 431 if(i<0 || i>0xffffff) {
michael@0 432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead;
michael@0 433 intBytes[1]=(char)((uint32_t)i>>24);
michael@0 434 intBytes[2]=(char)((uint32_t)i>>16);
michael@0 435 intBytes[3]=(char)((uint32_t)i>>8);
michael@0 436 intBytes[4]=(char)i;
michael@0 437 length=5;
michael@0 438 // } else if(i<=BytesTrie::kMaxOneByteValue) {
michael@0 439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i);
michael@0 440 } else {
michael@0 441 if(i<=BytesTrie::kMaxTwoByteValue) {
michael@0 442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8));
michael@0 443 } else {
michael@0 444 if(i<=BytesTrie::kMaxThreeByteValue) {
michael@0 445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16));
michael@0 446 } else {
michael@0 447 intBytes[0]=(char)BytesTrie::kFourByteValueLead;
michael@0 448 intBytes[1]=(char)(i>>16);
michael@0 449 length=2;
michael@0 450 }
michael@0 451 intBytes[length++]=(char)(i>>8);
michael@0 452 }
michael@0 453 intBytes[length++]=(char)i;
michael@0 454 }
michael@0 455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal);
michael@0 456 return write(intBytes, length);
michael@0 457 }
michael@0 458
michael@0 459 int32_t
michael@0 460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
michael@0 461 int32_t offset=write(node);
michael@0 462 if(hasValue) {
michael@0 463 offset=writeValueAndFinal(value, FALSE);
michael@0 464 }
michael@0 465 return offset;
michael@0 466 }
michael@0 467
michael@0 468 int32_t
michael@0 469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
michael@0 470 int32_t i=bytesLength-jumpTarget;
michael@0 471 U_ASSERT(i>=0);
michael@0 472 if(i<=BytesTrie::kMaxOneByteDelta) {
michael@0 473 return write(i);
michael@0 474 }
michael@0 475 char intBytes[5];
michael@0 476 int32_t length;
michael@0 477 if(i<=BytesTrie::kMaxTwoByteDelta) {
michael@0 478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8));
michael@0 479 length=1;
michael@0 480 } else {
michael@0 481 if(i<=BytesTrie::kMaxThreeByteDelta) {
michael@0 482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16));
michael@0 483 length=2;
michael@0 484 } else {
michael@0 485 if(i<=0xffffff) {
michael@0 486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead;
michael@0 487 length=3;
michael@0 488 } else {
michael@0 489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead;
michael@0 490 intBytes[1]=(char)(i>>24);
michael@0 491 length=4;
michael@0 492 }
michael@0 493 intBytes[1]=(char)(i>>16);
michael@0 494 }
michael@0 495 intBytes[1]=(char)(i>>8);
michael@0 496 }
michael@0 497 intBytes[length++]=(char)i;
michael@0 498 return write(intBytes, length);
michael@0 499 }
michael@0 500
michael@0 501 U_NAMESPACE_END

mercurial