1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucharstriebuilder.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,441 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: ucharstriebuilder.h 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010nov14 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 +#include "unicode/ucharstrie.h" 1.20 +#include "unicode/ucharstriebuilder.h" 1.21 +#include "unicode/unistr.h" 1.22 +#include "unicode/ustring.h" 1.23 +#include "cmemory.h" 1.24 +#include "uarrsort.h" 1.25 +#include "uassert.h" 1.26 +#include "uhash.h" 1.27 +#include "ustr_imp.h" 1.28 + 1.29 +U_NAMESPACE_BEGIN 1.30 + 1.31 +/* 1.32 + * Note: This builder implementation stores (string, value) pairs with full copies 1.33 + * of the 16-bit-unit sequences, until the UCharsTrie is built. 1.34 + * It might(!) take less memory if we collected the data in a temporary, dynamic trie. 1.35 + */ 1.36 + 1.37 +class UCharsTrieElement : public UMemory { 1.38 +public: 1.39 + // Use compiler's default constructor, initializes nothing. 1.40 + 1.41 + void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); 1.42 + 1.43 + UnicodeString getString(const UnicodeString &strings) const { 1.44 + int32_t length=strings[stringOffset]; 1.45 + return strings.tempSubString(stringOffset+1, length); 1.46 + } 1.47 + int32_t getStringLength(const UnicodeString &strings) const { 1.48 + return strings[stringOffset]; 1.49 + } 1.50 + 1.51 + UChar charAt(int32_t index, const UnicodeString &strings) const { 1.52 + return strings[stringOffset+1+index]; 1.53 + } 1.54 + 1.55 + int32_t getValue() const { return value; } 1.56 + 1.57 + int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; 1.58 + 1.59 +private: 1.60 + // The first strings unit contains the string length. 1.61 + // (Compared with a stringLength field here, this saves 2 bytes per string.) 1.62 + int32_t stringOffset; 1.63 + int32_t value; 1.64 +}; 1.65 + 1.66 +void 1.67 +UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, 1.68 + UnicodeString &strings, UErrorCode &errorCode) { 1.69 + if(U_FAILURE(errorCode)) { 1.70 + return; 1.71 + } 1.72 + int32_t length=s.length(); 1.73 + if(length>0xffff) { 1.74 + // Too long: We store the length in 1 unit. 1.75 + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.76 + return; 1.77 + } 1.78 + stringOffset=strings.length(); 1.79 + strings.append((UChar)length); 1.80 + value=val; 1.81 + strings.append(s); 1.82 +} 1.83 + 1.84 +int32_t 1.85 +UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { 1.86 + return getString(strings).compare(other.getString(strings)); 1.87 +} 1.88 + 1.89 +UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) 1.90 + : elements(NULL), elementsCapacity(0), elementsLength(0), 1.91 + uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} 1.92 + 1.93 +UCharsTrieBuilder::~UCharsTrieBuilder() { 1.94 + delete[] elements; 1.95 + uprv_free(uchars); 1.96 +} 1.97 + 1.98 +UCharsTrieBuilder & 1.99 +UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { 1.100 + if(U_FAILURE(errorCode)) { 1.101 + return *this; 1.102 + } 1.103 + if(ucharsLength>0) { 1.104 + // Cannot add elements after building. 1.105 + errorCode=U_NO_WRITE_PERMISSION; 1.106 + return *this; 1.107 + } 1.108 + if(elementsLength==elementsCapacity) { 1.109 + int32_t newCapacity; 1.110 + if(elementsCapacity==0) { 1.111 + newCapacity=1024; 1.112 + } else { 1.113 + newCapacity=4*elementsCapacity; 1.114 + } 1.115 + UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; 1.116 + if(newElements==NULL) { 1.117 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.118 + return *this; 1.119 + } 1.120 + if(elementsLength>0) { 1.121 + uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); 1.122 + } 1.123 + delete[] elements; 1.124 + elements=newElements; 1.125 + elementsCapacity=newCapacity; 1.126 + } 1.127 + elements[elementsLength++].setTo(s, value, strings, errorCode); 1.128 + if(U_SUCCESS(errorCode) && strings.isBogus()) { 1.129 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.130 + } 1.131 + return *this; 1.132 +} 1.133 + 1.134 +U_CDECL_BEGIN 1.135 + 1.136 +static int32_t U_CALLCONV 1.137 +compareElementStrings(const void *context, const void *left, const void *right) { 1.138 + const UnicodeString *strings=static_cast<const UnicodeString *>(context); 1.139 + const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); 1.140 + const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); 1.141 + return leftElement->compareStringTo(*rightElement, *strings); 1.142 +} 1.143 + 1.144 +U_CDECL_END 1.145 + 1.146 +UCharsTrie * 1.147 +UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 1.148 + buildUChars(buildOption, errorCode); 1.149 + UCharsTrie *newTrie=NULL; 1.150 + if(U_SUCCESS(errorCode)) { 1.151 + newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); 1.152 + if(newTrie==NULL) { 1.153 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.154 + } else { 1.155 + uchars=NULL; // The new trie now owns the array. 1.156 + ucharsCapacity=0; 1.157 + } 1.158 + } 1.159 + return newTrie; 1.160 +} 1.161 + 1.162 +UnicodeString & 1.163 +UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, 1.164 + UErrorCode &errorCode) { 1.165 + buildUChars(buildOption, errorCode); 1.166 + if(U_SUCCESS(errorCode)) { 1.167 + result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); 1.168 + } 1.169 + return result; 1.170 +} 1.171 + 1.172 +void 1.173 +UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { 1.174 + if(U_FAILURE(errorCode)) { 1.175 + return; 1.176 + } 1.177 + if(uchars!=NULL && ucharsLength>0) { 1.178 + // Already built. 1.179 + return; 1.180 + } 1.181 + if(ucharsLength==0) { 1.182 + if(elementsLength==0) { 1.183 + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.184 + return; 1.185 + } 1.186 + if(strings.isBogus()) { 1.187 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.188 + return; 1.189 + } 1.190 + uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), 1.191 + compareElementStrings, &strings, 1.192 + FALSE, // need not be a stable sort 1.193 + &errorCode); 1.194 + if(U_FAILURE(errorCode)) { 1.195 + return; 1.196 + } 1.197 + // Duplicate strings are not allowed. 1.198 + UnicodeString prev=elements[0].getString(strings); 1.199 + for(int32_t i=1; i<elementsLength; ++i) { 1.200 + UnicodeString current=elements[i].getString(strings); 1.201 + if(prev==current) { 1.202 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.203 + return; 1.204 + } 1.205 + prev.fastCopyFrom(current); 1.206 + } 1.207 + } 1.208 + // Create and UChar-serialize the trie for the elements. 1.209 + ucharsLength=0; 1.210 + int32_t capacity=strings.length(); 1.211 + if(capacity<1024) { 1.212 + capacity=1024; 1.213 + } 1.214 + if(ucharsCapacity<capacity) { 1.215 + uprv_free(uchars); 1.216 + uchars=static_cast<UChar *>(uprv_malloc(capacity*2)); 1.217 + if(uchars==NULL) { 1.218 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.219 + ucharsCapacity=0; 1.220 + return; 1.221 + } 1.222 + ucharsCapacity=capacity; 1.223 + } 1.224 + StringTrieBuilder::build(buildOption, elementsLength, errorCode); 1.225 + if(uchars==NULL) { 1.226 + errorCode=U_MEMORY_ALLOCATION_ERROR; 1.227 + } 1.228 +} 1.229 + 1.230 +int32_t 1.231 +UCharsTrieBuilder::getElementStringLength(int32_t i) const { 1.232 + return elements[i].getStringLength(strings); 1.233 +} 1.234 + 1.235 +UChar 1.236 +UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { 1.237 + return elements[i].charAt(unitIndex, strings); 1.238 +} 1.239 + 1.240 +int32_t 1.241 +UCharsTrieBuilder::getElementValue(int32_t i) const { 1.242 + return elements[i].getValue(); 1.243 +} 1.244 + 1.245 +int32_t 1.246 +UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { 1.247 + const UCharsTrieElement &firstElement=elements[first]; 1.248 + const UCharsTrieElement &lastElement=elements[last]; 1.249 + int32_t minStringLength=firstElement.getStringLength(strings); 1.250 + while(++unitIndex<minStringLength && 1.251 + firstElement.charAt(unitIndex, strings)== 1.252 + lastElement.charAt(unitIndex, strings)) {} 1.253 + return unitIndex; 1.254 +} 1.255 + 1.256 +int32_t 1.257 +UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { 1.258 + int32_t length=0; // Number of different units at unitIndex. 1.259 + int32_t i=start; 1.260 + do { 1.261 + UChar unit=elements[i++].charAt(unitIndex, strings); 1.262 + while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { 1.263 + ++i; 1.264 + } 1.265 + ++length; 1.266 + } while(i<limit); 1.267 + return length; 1.268 +} 1.269 + 1.270 +int32_t 1.271 +UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { 1.272 + do { 1.273 + UChar unit=elements[i++].charAt(unitIndex, strings); 1.274 + while(unit==elements[i].charAt(unitIndex, strings)) { 1.275 + ++i; 1.276 + } 1.277 + } while(--count>0); 1.278 + return i; 1.279 +} 1.280 + 1.281 +int32_t 1.282 +UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { 1.283 + while(unit==elements[i].charAt(unitIndex, strings)) { 1.284 + ++i; 1.285 + } 1.286 + return i; 1.287 +} 1.288 + 1.289 +UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) 1.290 + : LinearMatchNode(len, nextNode), s(units) { 1.291 + hash=hash*37+ustr_hashUCharsN(units, len); 1.292 +} 1.293 + 1.294 +UBool 1.295 +UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { 1.296 + if(this==&other) { 1.297 + return TRUE; 1.298 + } 1.299 + if(!LinearMatchNode::operator==(other)) { 1.300 + return FALSE; 1.301 + } 1.302 + const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; 1.303 + return 0==u_memcmp(s, o.s, length); 1.304 +} 1.305 + 1.306 +void 1.307 +UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { 1.308 + UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; 1.309 + next->write(builder); 1.310 + b.write(s, length); 1.311 + offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); 1.312 +} 1.313 + 1.314 +StringTrieBuilder::Node * 1.315 +UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 1.316 + Node *nextNode) const { 1.317 + return new UCTLinearMatchNode( 1.318 + elements[i].getString(strings).getBuffer()+unitIndex, 1.319 + length, 1.320 + nextNode); 1.321 +} 1.322 + 1.323 +UBool 1.324 +UCharsTrieBuilder::ensureCapacity(int32_t length) { 1.325 + if(uchars==NULL) { 1.326 + return FALSE; // previous memory allocation had failed 1.327 + } 1.328 + if(length>ucharsCapacity) { 1.329 + int32_t newCapacity=ucharsCapacity; 1.330 + do { 1.331 + newCapacity*=2; 1.332 + } while(newCapacity<=length); 1.333 + UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2)); 1.334 + if(newUChars==NULL) { 1.335 + // unable to allocate memory 1.336 + uprv_free(uchars); 1.337 + uchars=NULL; 1.338 + ucharsCapacity=0; 1.339 + return FALSE; 1.340 + } 1.341 + u_memcpy(newUChars+(newCapacity-ucharsLength), 1.342 + uchars+(ucharsCapacity-ucharsLength), ucharsLength); 1.343 + uprv_free(uchars); 1.344 + uchars=newUChars; 1.345 + ucharsCapacity=newCapacity; 1.346 + } 1.347 + return TRUE; 1.348 +} 1.349 + 1.350 +int32_t 1.351 +UCharsTrieBuilder::write(int32_t unit) { 1.352 + int32_t newLength=ucharsLength+1; 1.353 + if(ensureCapacity(newLength)) { 1.354 + ucharsLength=newLength; 1.355 + uchars[ucharsCapacity-ucharsLength]=(UChar)unit; 1.356 + } 1.357 + return ucharsLength; 1.358 +} 1.359 + 1.360 +int32_t 1.361 +UCharsTrieBuilder::write(const UChar *s, int32_t length) { 1.362 + int32_t newLength=ucharsLength+length; 1.363 + if(ensureCapacity(newLength)) { 1.364 + ucharsLength=newLength; 1.365 + u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); 1.366 + } 1.367 + return ucharsLength; 1.368 +} 1.369 + 1.370 +int32_t 1.371 +UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { 1.372 + return write(elements[i].getString(strings).getBuffer()+unitIndex, length); 1.373 +} 1.374 + 1.375 +int32_t 1.376 +UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { 1.377 + if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { 1.378 + return write(i|(isFinal<<15)); 1.379 + } 1.380 + UChar intUnits[3]; 1.381 + int32_t length; 1.382 + if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { 1.383 + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); 1.384 + intUnits[1]=(UChar)((uint32_t)i>>16); 1.385 + intUnits[2]=(UChar)i; 1.386 + length=3; 1.387 + // } else if(i<=UCharsTrie::kMaxOneUnitValue) { 1.388 + // intUnits[0]=(UChar)(i); 1.389 + // length=1; 1.390 + } else { 1.391 + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); 1.392 + intUnits[1]=(UChar)i; 1.393 + length=2; 1.394 + } 1.395 + intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); 1.396 + return write(intUnits, length); 1.397 +} 1.398 + 1.399 +int32_t 1.400 +UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { 1.401 + if(!hasValue) { 1.402 + return write(node); 1.403 + } 1.404 + UChar intUnits[3]; 1.405 + int32_t length; 1.406 + if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { 1.407 + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); 1.408 + intUnits[1]=(UChar)((uint32_t)value>>16); 1.409 + intUnits[2]=(UChar)value; 1.410 + length=3; 1.411 + } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { 1.412 + intUnits[0]=(UChar)((value+1)<<6); 1.413 + length=1; 1.414 + } else { 1.415 + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); 1.416 + intUnits[1]=(UChar)value; 1.417 + length=2; 1.418 + } 1.419 + intUnits[0]|=(UChar)node; 1.420 + return write(intUnits, length); 1.421 +} 1.422 + 1.423 +int32_t 1.424 +UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { 1.425 + int32_t i=ucharsLength-jumpTarget; 1.426 + U_ASSERT(i>=0); 1.427 + if(i<=UCharsTrie::kMaxOneUnitDelta) { 1.428 + return write(i); 1.429 + } 1.430 + UChar intUnits[3]; 1.431 + int32_t length; 1.432 + if(i<=UCharsTrie::kMaxTwoUnitDelta) { 1.433 + intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); 1.434 + length=1; 1.435 + } else { 1.436 + intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); 1.437 + intUnits[1]=(UChar)(i>>16); 1.438 + length=2; 1.439 + } 1.440 + intUnits[length++]=(UChar)i; 1.441 + return write(intUnits, length); 1.442 +} 1.443 + 1.444 +U_NAMESPACE_END