intl/icu/source/common/ucharstriebuilder.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2010-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: ucharstriebuilder.h
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2010nov14
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16 #include "unicode/ucharstrie.h"
michael@0 17 #include "unicode/ucharstriebuilder.h"
michael@0 18 #include "unicode/unistr.h"
michael@0 19 #include "unicode/ustring.h"
michael@0 20 #include "cmemory.h"
michael@0 21 #include "uarrsort.h"
michael@0 22 #include "uassert.h"
michael@0 23 #include "uhash.h"
michael@0 24 #include "ustr_imp.h"
michael@0 25
michael@0 26 U_NAMESPACE_BEGIN
michael@0 27
michael@0 28 /*
michael@0 29 * Note: This builder implementation stores (string, value) pairs with full copies
michael@0 30 * of the 16-bit-unit sequences, until the UCharsTrie is built.
michael@0 31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie.
michael@0 32 */
michael@0 33
michael@0 34 class UCharsTrieElement : public UMemory {
michael@0 35 public:
michael@0 36 // Use compiler's default constructor, initializes nothing.
michael@0 37
michael@0 38 void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode);
michael@0 39
michael@0 40 UnicodeString getString(const UnicodeString &strings) const {
michael@0 41 int32_t length=strings[stringOffset];
michael@0 42 return strings.tempSubString(stringOffset+1, length);
michael@0 43 }
michael@0 44 int32_t getStringLength(const UnicodeString &strings) const {
michael@0 45 return strings[stringOffset];
michael@0 46 }
michael@0 47
michael@0 48 UChar charAt(int32_t index, const UnicodeString &strings) const {
michael@0 49 return strings[stringOffset+1+index];
michael@0 50 }
michael@0 51
michael@0 52 int32_t getValue() const { return value; }
michael@0 53
michael@0 54 int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const;
michael@0 55
michael@0 56 private:
michael@0 57 // The first strings unit contains the string length.
michael@0 58 // (Compared with a stringLength field here, this saves 2 bytes per string.)
michael@0 59 int32_t stringOffset;
michael@0 60 int32_t value;
michael@0 61 };
michael@0 62
michael@0 63 void
michael@0 64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val,
michael@0 65 UnicodeString &strings, UErrorCode &errorCode) {
michael@0 66 if(U_FAILURE(errorCode)) {
michael@0 67 return;
michael@0 68 }
michael@0 69 int32_t length=s.length();
michael@0 70 if(length>0xffff) {
michael@0 71 // Too long: We store the length in 1 unit.
michael@0 72 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 73 return;
michael@0 74 }
michael@0 75 stringOffset=strings.length();
michael@0 76 strings.append((UChar)length);
michael@0 77 value=val;
michael@0 78 strings.append(s);
michael@0 79 }
michael@0 80
michael@0 81 int32_t
michael@0 82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const {
michael@0 83 return getString(strings).compare(other.getString(strings));
michael@0 84 }
michael@0 85
michael@0 86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/)
michael@0 87 : elements(NULL), elementsCapacity(0), elementsLength(0),
michael@0 88 uchars(NULL), ucharsCapacity(0), ucharsLength(0) {}
michael@0 89
michael@0 90 UCharsTrieBuilder::~UCharsTrieBuilder() {
michael@0 91 delete[] elements;
michael@0 92 uprv_free(uchars);
michael@0 93 }
michael@0 94
michael@0 95 UCharsTrieBuilder &
michael@0 96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) {
michael@0 97 if(U_FAILURE(errorCode)) {
michael@0 98 return *this;
michael@0 99 }
michael@0 100 if(ucharsLength>0) {
michael@0 101 // Cannot add elements after building.
michael@0 102 errorCode=U_NO_WRITE_PERMISSION;
michael@0 103 return *this;
michael@0 104 }
michael@0 105 if(elementsLength==elementsCapacity) {
michael@0 106 int32_t newCapacity;
michael@0 107 if(elementsCapacity==0) {
michael@0 108 newCapacity=1024;
michael@0 109 } else {
michael@0 110 newCapacity=4*elementsCapacity;
michael@0 111 }
michael@0 112 UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity];
michael@0 113 if(newElements==NULL) {
michael@0 114 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 115 return *this;
michael@0 116 }
michael@0 117 if(elementsLength>0) {
michael@0 118 uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement));
michael@0 119 }
michael@0 120 delete[] elements;
michael@0 121 elements=newElements;
michael@0 122 elementsCapacity=newCapacity;
michael@0 123 }
michael@0 124 elements[elementsLength++].setTo(s, value, strings, errorCode);
michael@0 125 if(U_SUCCESS(errorCode) && strings.isBogus()) {
michael@0 126 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 127 }
michael@0 128 return *this;
michael@0 129 }
michael@0 130
michael@0 131 U_CDECL_BEGIN
michael@0 132
michael@0 133 static int32_t U_CALLCONV
michael@0 134 compareElementStrings(const void *context, const void *left, const void *right) {
michael@0 135 const UnicodeString *strings=static_cast<const UnicodeString *>(context);
michael@0 136 const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left);
michael@0 137 const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right);
michael@0 138 return leftElement->compareStringTo(*rightElement, *strings);
michael@0 139 }
michael@0 140
michael@0 141 U_CDECL_END
michael@0 142
michael@0 143 UCharsTrie *
michael@0 144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
michael@0 145 buildUChars(buildOption, errorCode);
michael@0 146 UCharsTrie *newTrie=NULL;
michael@0 147 if(U_SUCCESS(errorCode)) {
michael@0 148 newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength));
michael@0 149 if(newTrie==NULL) {
michael@0 150 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 151 } else {
michael@0 152 uchars=NULL; // The new trie now owns the array.
michael@0 153 ucharsCapacity=0;
michael@0 154 }
michael@0 155 }
michael@0 156 return newTrie;
michael@0 157 }
michael@0 158
michael@0 159 UnicodeString &
michael@0 160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result,
michael@0 161 UErrorCode &errorCode) {
michael@0 162 buildUChars(buildOption, errorCode);
michael@0 163 if(U_SUCCESS(errorCode)) {
michael@0 164 result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength);
michael@0 165 }
michael@0 166 return result;
michael@0 167 }
michael@0 168
michael@0 169 void
michael@0 170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) {
michael@0 171 if(U_FAILURE(errorCode)) {
michael@0 172 return;
michael@0 173 }
michael@0 174 if(uchars!=NULL && ucharsLength>0) {
michael@0 175 // Already built.
michael@0 176 return;
michael@0 177 }
michael@0 178 if(ucharsLength==0) {
michael@0 179 if(elementsLength==0) {
michael@0 180 errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 181 return;
michael@0 182 }
michael@0 183 if(strings.isBogus()) {
michael@0 184 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 185 return;
michael@0 186 }
michael@0 187 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement),
michael@0 188 compareElementStrings, &strings,
michael@0 189 FALSE, // need not be a stable sort
michael@0 190 &errorCode);
michael@0 191 if(U_FAILURE(errorCode)) {
michael@0 192 return;
michael@0 193 }
michael@0 194 // Duplicate strings are not allowed.
michael@0 195 UnicodeString prev=elements[0].getString(strings);
michael@0 196 for(int32_t i=1; i<elementsLength; ++i) {
michael@0 197 UnicodeString current=elements[i].getString(strings);
michael@0 198 if(prev==current) {
michael@0 199 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 200 return;
michael@0 201 }
michael@0 202 prev.fastCopyFrom(current);
michael@0 203 }
michael@0 204 }
michael@0 205 // Create and UChar-serialize the trie for the elements.
michael@0 206 ucharsLength=0;
michael@0 207 int32_t capacity=strings.length();
michael@0 208 if(capacity<1024) {
michael@0 209 capacity=1024;
michael@0 210 }
michael@0 211 if(ucharsCapacity<capacity) {
michael@0 212 uprv_free(uchars);
michael@0 213 uchars=static_cast<UChar *>(uprv_malloc(capacity*2));
michael@0 214 if(uchars==NULL) {
michael@0 215 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 216 ucharsCapacity=0;
michael@0 217 return;
michael@0 218 }
michael@0 219 ucharsCapacity=capacity;
michael@0 220 }
michael@0 221 StringTrieBuilder::build(buildOption, elementsLength, errorCode);
michael@0 222 if(uchars==NULL) {
michael@0 223 errorCode=U_MEMORY_ALLOCATION_ERROR;
michael@0 224 }
michael@0 225 }
michael@0 226
michael@0 227 int32_t
michael@0 228 UCharsTrieBuilder::getElementStringLength(int32_t i) const {
michael@0 229 return elements[i].getStringLength(strings);
michael@0 230 }
michael@0 231
michael@0 232 UChar
michael@0 233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const {
michael@0 234 return elements[i].charAt(unitIndex, strings);
michael@0 235 }
michael@0 236
michael@0 237 int32_t
michael@0 238 UCharsTrieBuilder::getElementValue(int32_t i) const {
michael@0 239 return elements[i].getValue();
michael@0 240 }
michael@0 241
michael@0 242 int32_t
michael@0 243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const {
michael@0 244 const UCharsTrieElement &firstElement=elements[first];
michael@0 245 const UCharsTrieElement &lastElement=elements[last];
michael@0 246 int32_t minStringLength=firstElement.getStringLength(strings);
michael@0 247 while(++unitIndex<minStringLength &&
michael@0 248 firstElement.charAt(unitIndex, strings)==
michael@0 249 lastElement.charAt(unitIndex, strings)) {}
michael@0 250 return unitIndex;
michael@0 251 }
michael@0 252
michael@0 253 int32_t
michael@0 254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const {
michael@0 255 int32_t length=0; // Number of different units at unitIndex.
michael@0 256 int32_t i=start;
michael@0 257 do {
michael@0 258 UChar unit=elements[i++].charAt(unitIndex, strings);
michael@0 259 while(i<limit && unit==elements[i].charAt(unitIndex, strings)) {
michael@0 260 ++i;
michael@0 261 }
michael@0 262 ++length;
michael@0 263 } while(i<limit);
michael@0 264 return length;
michael@0 265 }
michael@0 266
michael@0 267 int32_t
michael@0 268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const {
michael@0 269 do {
michael@0 270 UChar unit=elements[i++].charAt(unitIndex, strings);
michael@0 271 while(unit==elements[i].charAt(unitIndex, strings)) {
michael@0 272 ++i;
michael@0 273 }
michael@0 274 } while(--count>0);
michael@0 275 return i;
michael@0 276 }
michael@0 277
michael@0 278 int32_t
michael@0 279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const {
michael@0 280 while(unit==elements[i].charAt(unitIndex, strings)) {
michael@0 281 ++i;
michael@0 282 }
michael@0 283 return i;
michael@0 284 }
michael@0 285
michael@0 286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode)
michael@0 287 : LinearMatchNode(len, nextNode), s(units) {
michael@0 288 hash=hash*37+ustr_hashUCharsN(units, len);
michael@0 289 }
michael@0 290
michael@0 291 UBool
michael@0 292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const {
michael@0 293 if(this==&other) {
michael@0 294 return TRUE;
michael@0 295 }
michael@0 296 if(!LinearMatchNode::operator==(other)) {
michael@0 297 return FALSE;
michael@0 298 }
michael@0 299 const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other;
michael@0 300 return 0==u_memcmp(s, o.s, length);
michael@0 301 }
michael@0 302
michael@0 303 void
michael@0 304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) {
michael@0 305 UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder;
michael@0 306 next->write(builder);
michael@0 307 b.write(s, length);
michael@0 308 offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1);
michael@0 309 }
michael@0 310
michael@0 311 StringTrieBuilder::Node *
michael@0 312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
michael@0 313 Node *nextNode) const {
michael@0 314 return new UCTLinearMatchNode(
michael@0 315 elements[i].getString(strings).getBuffer()+unitIndex,
michael@0 316 length,
michael@0 317 nextNode);
michael@0 318 }
michael@0 319
michael@0 320 UBool
michael@0 321 UCharsTrieBuilder::ensureCapacity(int32_t length) {
michael@0 322 if(uchars==NULL) {
michael@0 323 return FALSE; // previous memory allocation had failed
michael@0 324 }
michael@0 325 if(length>ucharsCapacity) {
michael@0 326 int32_t newCapacity=ucharsCapacity;
michael@0 327 do {
michael@0 328 newCapacity*=2;
michael@0 329 } while(newCapacity<=length);
michael@0 330 UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2));
michael@0 331 if(newUChars==NULL) {
michael@0 332 // unable to allocate memory
michael@0 333 uprv_free(uchars);
michael@0 334 uchars=NULL;
michael@0 335 ucharsCapacity=0;
michael@0 336 return FALSE;
michael@0 337 }
michael@0 338 u_memcpy(newUChars+(newCapacity-ucharsLength),
michael@0 339 uchars+(ucharsCapacity-ucharsLength), ucharsLength);
michael@0 340 uprv_free(uchars);
michael@0 341 uchars=newUChars;
michael@0 342 ucharsCapacity=newCapacity;
michael@0 343 }
michael@0 344 return TRUE;
michael@0 345 }
michael@0 346
michael@0 347 int32_t
michael@0 348 UCharsTrieBuilder::write(int32_t unit) {
michael@0 349 int32_t newLength=ucharsLength+1;
michael@0 350 if(ensureCapacity(newLength)) {
michael@0 351 ucharsLength=newLength;
michael@0 352 uchars[ucharsCapacity-ucharsLength]=(UChar)unit;
michael@0 353 }
michael@0 354 return ucharsLength;
michael@0 355 }
michael@0 356
michael@0 357 int32_t
michael@0 358 UCharsTrieBuilder::write(const UChar *s, int32_t length) {
michael@0 359 int32_t newLength=ucharsLength+length;
michael@0 360 if(ensureCapacity(newLength)) {
michael@0 361 ucharsLength=newLength;
michael@0 362 u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length);
michael@0 363 }
michael@0 364 return ucharsLength;
michael@0 365 }
michael@0 366
michael@0 367 int32_t
michael@0 368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) {
michael@0 369 return write(elements[i].getString(strings).getBuffer()+unitIndex, length);
michael@0 370 }
michael@0 371
michael@0 372 int32_t
michael@0 373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) {
michael@0 374 if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) {
michael@0 375 return write(i|(isFinal<<15));
michael@0 376 }
michael@0 377 UChar intUnits[3];
michael@0 378 int32_t length;
michael@0 379 if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) {
michael@0 380 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead);
michael@0 381 intUnits[1]=(UChar)((uint32_t)i>>16);
michael@0 382 intUnits[2]=(UChar)i;
michael@0 383 length=3;
michael@0 384 // } else if(i<=UCharsTrie::kMaxOneUnitValue) {
michael@0 385 // intUnits[0]=(UChar)(i);
michael@0 386 // length=1;
michael@0 387 } else {
michael@0 388 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16));
michael@0 389 intUnits[1]=(UChar)i;
michael@0 390 length=2;
michael@0 391 }
michael@0 392 intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15));
michael@0 393 return write(intUnits, length);
michael@0 394 }
michael@0 395
michael@0 396 int32_t
michael@0 397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) {
michael@0 398 if(!hasValue) {
michael@0 399 return write(node);
michael@0 400 }
michael@0 401 UChar intUnits[3];
michael@0 402 int32_t length;
michael@0 403 if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) {
michael@0 404 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead);
michael@0 405 intUnits[1]=(UChar)((uint32_t)value>>16);
michael@0 406 intUnits[2]=(UChar)value;
michael@0 407 length=3;
michael@0 408 } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) {
michael@0 409 intUnits[0]=(UChar)((value+1)<<6);
michael@0 410 length=1;
michael@0 411 } else {
michael@0 412 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0));
michael@0 413 intUnits[1]=(UChar)value;
michael@0 414 length=2;
michael@0 415 }
michael@0 416 intUnits[0]|=(UChar)node;
michael@0 417 return write(intUnits, length);
michael@0 418 }
michael@0 419
michael@0 420 int32_t
michael@0 421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) {
michael@0 422 int32_t i=ucharsLength-jumpTarget;
michael@0 423 U_ASSERT(i>=0);
michael@0 424 if(i<=UCharsTrie::kMaxOneUnitDelta) {
michael@0 425 return write(i);
michael@0 426 }
michael@0 427 UChar intUnits[3];
michael@0 428 int32_t length;
michael@0 429 if(i<=UCharsTrie::kMaxTwoUnitDelta) {
michael@0 430 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16));
michael@0 431 length=1;
michael@0 432 } else {
michael@0 433 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead);
michael@0 434 intUnits[1]=(UChar)(i>>16);
michael@0 435 length=2;
michael@0 436 }
michael@0 437 intUnits[length++]=(UChar)i;
michael@0 438 return write(intUnits, length);
michael@0 439 }
michael@0 440
michael@0 441 U_NAMESPACE_END

mercurial