1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucharstrie.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,411 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: ucharstrie.h 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010nov14 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 +#include "unicode/appendable.h" 1.20 +#include "unicode/ucharstrie.h" 1.21 +#include "unicode/uobject.h" 1.22 +#include "unicode/utf16.h" 1.23 +#include "cmemory.h" 1.24 +#include "uassert.h" 1.25 + 1.26 +U_NAMESPACE_BEGIN 1.27 + 1.28 +UCharsTrie::~UCharsTrie() { 1.29 + uprv_free(ownedArray_); 1.30 +} 1.31 + 1.32 +UStringTrieResult 1.33 +UCharsTrie::current() const { 1.34 + const UChar *pos=pos_; 1.35 + if(pos==NULL) { 1.36 + return USTRINGTRIE_NO_MATCH; 1.37 + } else { 1.38 + int32_t node; 1.39 + return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 1.40 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.41 + } 1.42 +} 1.43 + 1.44 +UStringTrieResult 1.45 +UCharsTrie::firstForCodePoint(UChar32 cp) { 1.46 + return cp<=0xffff ? 1.47 + first(cp) : 1.48 + (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? 1.49 + next(U16_TRAIL(cp)) : 1.50 + USTRINGTRIE_NO_MATCH); 1.51 +} 1.52 + 1.53 +UStringTrieResult 1.54 +UCharsTrie::nextForCodePoint(UChar32 cp) { 1.55 + return cp<=0xffff ? 1.56 + next(cp) : 1.57 + (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? 1.58 + next(U16_TRAIL(cp)) : 1.59 + USTRINGTRIE_NO_MATCH); 1.60 +} 1.61 + 1.62 +UStringTrieResult 1.63 +UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { 1.64 + // Branch according to the current unit. 1.65 + if(length==0) { 1.66 + length=*pos++; 1.67 + } 1.68 + ++length; 1.69 + // The length of the branch is the number of units to select from. 1.70 + // The data structure encodes a binary search. 1.71 + while(length>kMaxBranchLinearSubNodeLength) { 1.72 + if(uchar<*pos++) { 1.73 + length>>=1; 1.74 + pos=jumpByDelta(pos); 1.75 + } else { 1.76 + length=length-(length>>1); 1.77 + pos=skipDelta(pos); 1.78 + } 1.79 + } 1.80 + // Drop down to linear search for the last few units. 1.81 + // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 1.82 + // and divides length by 2. 1.83 + do { 1.84 + if(uchar==*pos++) { 1.85 + UStringTrieResult result; 1.86 + int32_t node=*pos; 1.87 + if(node&kValueIsFinal) { 1.88 + // Leave the final value for getValue() to read. 1.89 + result=USTRINGTRIE_FINAL_VALUE; 1.90 + } else { 1.91 + // Use the non-final value as the jump delta. 1.92 + ++pos; 1.93 + // int32_t delta=readValue(pos, node); 1.94 + int32_t delta; 1.95 + if(node<kMinTwoUnitValueLead) { 1.96 + delta=node; 1.97 + } else if(node<kThreeUnitValueLead) { 1.98 + delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; 1.99 + } else { 1.100 + delta=(pos[0]<<16)|pos[1]; 1.101 + pos+=2; 1.102 + } 1.103 + // end readValue() 1.104 + pos+=delta; 1.105 + node=*pos; 1.106 + result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 1.107 + } 1.108 + pos_=pos; 1.109 + return result; 1.110 + } 1.111 + --length; 1.112 + pos=skipValue(pos); 1.113 + } while(length>1); 1.114 + if(uchar==*pos++) { 1.115 + pos_=pos; 1.116 + int32_t node=*pos; 1.117 + return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 1.118 + } else { 1.119 + stop(); 1.120 + return USTRINGTRIE_NO_MATCH; 1.121 + } 1.122 +} 1.123 + 1.124 +UStringTrieResult 1.125 +UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { 1.126 + int32_t node=*pos++; 1.127 + for(;;) { 1.128 + if(node<kMinLinearMatch) { 1.129 + return branchNext(pos, node, uchar); 1.130 + } else if(node<kMinValueLead) { 1.131 + // Match the first of length+1 units. 1.132 + int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 1.133 + if(uchar==*pos++) { 1.134 + remainingMatchLength_=--length; 1.135 + pos_=pos; 1.136 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.137 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.138 + } else { 1.139 + // No match. 1.140 + break; 1.141 + } 1.142 + } else if(node&kValueIsFinal) { 1.143 + // No further matching units. 1.144 + break; 1.145 + } else { 1.146 + // Skip intermediate value. 1.147 + pos=skipNodeValue(pos, node); 1.148 + node&=kNodeTypeMask; 1.149 + } 1.150 + } 1.151 + stop(); 1.152 + return USTRINGTRIE_NO_MATCH; 1.153 +} 1.154 + 1.155 +UStringTrieResult 1.156 +UCharsTrie::next(int32_t uchar) { 1.157 + const UChar *pos=pos_; 1.158 + if(pos==NULL) { 1.159 + return USTRINGTRIE_NO_MATCH; 1.160 + } 1.161 + int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 1.162 + if(length>=0) { 1.163 + // Remaining part of a linear-match node. 1.164 + if(uchar==*pos++) { 1.165 + remainingMatchLength_=--length; 1.166 + pos_=pos; 1.167 + int32_t node; 1.168 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.169 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.170 + } else { 1.171 + stop(); 1.172 + return USTRINGTRIE_NO_MATCH; 1.173 + } 1.174 + } 1.175 + return nextImpl(pos, uchar); 1.176 +} 1.177 + 1.178 +UStringTrieResult 1.179 +UCharsTrie::next(const UChar *s, int32_t sLength) { 1.180 + if(sLength<0 ? *s==0 : sLength==0) { 1.181 + // Empty input. 1.182 + return current(); 1.183 + } 1.184 + const UChar *pos=pos_; 1.185 + if(pos==NULL) { 1.186 + return USTRINGTRIE_NO_MATCH; 1.187 + } 1.188 + int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 1.189 + for(;;) { 1.190 + // Fetch the next input unit, if there is one. 1.191 + // Continue a linear-match node without rechecking sLength<0. 1.192 + int32_t uchar; 1.193 + if(sLength<0) { 1.194 + for(;;) { 1.195 + if((uchar=*s++)==0) { 1.196 + remainingMatchLength_=length; 1.197 + pos_=pos; 1.198 + int32_t node; 1.199 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.200 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.201 + } 1.202 + if(length<0) { 1.203 + remainingMatchLength_=length; 1.204 + break; 1.205 + } 1.206 + if(uchar!=*pos) { 1.207 + stop(); 1.208 + return USTRINGTRIE_NO_MATCH; 1.209 + } 1.210 + ++pos; 1.211 + --length; 1.212 + } 1.213 + } else { 1.214 + for(;;) { 1.215 + if(sLength==0) { 1.216 + remainingMatchLength_=length; 1.217 + pos_=pos; 1.218 + int32_t node; 1.219 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.220 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.221 + } 1.222 + uchar=*s++; 1.223 + --sLength; 1.224 + if(length<0) { 1.225 + remainingMatchLength_=length; 1.226 + break; 1.227 + } 1.228 + if(uchar!=*pos) { 1.229 + stop(); 1.230 + return USTRINGTRIE_NO_MATCH; 1.231 + } 1.232 + ++pos; 1.233 + --length; 1.234 + } 1.235 + } 1.236 + int32_t node=*pos++; 1.237 + for(;;) { 1.238 + if(node<kMinLinearMatch) { 1.239 + UStringTrieResult result=branchNext(pos, node, uchar); 1.240 + if(result==USTRINGTRIE_NO_MATCH) { 1.241 + return USTRINGTRIE_NO_MATCH; 1.242 + } 1.243 + // Fetch the next input unit, if there is one. 1.244 + if(sLength<0) { 1.245 + if((uchar=*s++)==0) { 1.246 + return result; 1.247 + } 1.248 + } else { 1.249 + if(sLength==0) { 1.250 + return result; 1.251 + } 1.252 + uchar=*s++; 1.253 + --sLength; 1.254 + } 1.255 + if(result==USTRINGTRIE_FINAL_VALUE) { 1.256 + // No further matching units. 1.257 + stop(); 1.258 + return USTRINGTRIE_NO_MATCH; 1.259 + } 1.260 + pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 1.261 + node=*pos++; 1.262 + } else if(node<kMinValueLead) { 1.263 + // Match length+1 units. 1.264 + length=node-kMinLinearMatch; // Actual match length minus 1. 1.265 + if(uchar!=*pos) { 1.266 + stop(); 1.267 + return USTRINGTRIE_NO_MATCH; 1.268 + } 1.269 + ++pos; 1.270 + --length; 1.271 + break; 1.272 + } else if(node&kValueIsFinal) { 1.273 + // No further matching units. 1.274 + stop(); 1.275 + return USTRINGTRIE_NO_MATCH; 1.276 + } else { 1.277 + // Skip intermediate value. 1.278 + pos=skipNodeValue(pos, node); 1.279 + node&=kNodeTypeMask; 1.280 + } 1.281 + } 1.282 + } 1.283 +} 1.284 + 1.285 +const UChar * 1.286 +UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, 1.287 + UBool haveUniqueValue, int32_t &uniqueValue) { 1.288 + while(length>kMaxBranchLinearSubNodeLength) { 1.289 + ++pos; // ignore the comparison unit 1.290 + if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 1.291 + return NULL; 1.292 + } 1.293 + length=length-(length>>1); 1.294 + pos=skipDelta(pos); 1.295 + } 1.296 + do { 1.297 + ++pos; // ignore a comparison unit 1.298 + // handle its value 1.299 + int32_t node=*pos++; 1.300 + UBool isFinal=(UBool)(node>>15); 1.301 + node&=0x7fff; 1.302 + int32_t value=readValue(pos, node); 1.303 + pos=skipValue(pos, node); 1.304 + if(isFinal) { 1.305 + if(haveUniqueValue) { 1.306 + if(value!=uniqueValue) { 1.307 + return NULL; 1.308 + } 1.309 + } else { 1.310 + uniqueValue=value; 1.311 + haveUniqueValue=TRUE; 1.312 + } 1.313 + } else { 1.314 + if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 1.315 + return NULL; 1.316 + } 1.317 + haveUniqueValue=TRUE; 1.318 + } 1.319 + } while(--length>1); 1.320 + return pos+1; // ignore the last comparison unit 1.321 +} 1.322 + 1.323 +UBool 1.324 +UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 1.325 + int32_t node=*pos++; 1.326 + for(;;) { 1.327 + if(node<kMinLinearMatch) { 1.328 + if(node==0) { 1.329 + node=*pos++; 1.330 + } 1.331 + pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 1.332 + if(pos==NULL) { 1.333 + return FALSE; 1.334 + } 1.335 + haveUniqueValue=TRUE; 1.336 + node=*pos++; 1.337 + } else if(node<kMinValueLead) { 1.338 + // linear-match node 1.339 + pos+=node-kMinLinearMatch+1; // Ignore the match units. 1.340 + node=*pos++; 1.341 + } else { 1.342 + UBool isFinal=(UBool)(node>>15); 1.343 + int32_t value; 1.344 + if(isFinal) { 1.345 + value=readValue(pos, node&0x7fff); 1.346 + } else { 1.347 + value=readNodeValue(pos, node); 1.348 + } 1.349 + if(haveUniqueValue) { 1.350 + if(value!=uniqueValue) { 1.351 + return FALSE; 1.352 + } 1.353 + } else { 1.354 + uniqueValue=value; 1.355 + haveUniqueValue=TRUE; 1.356 + } 1.357 + if(isFinal) { 1.358 + return TRUE; 1.359 + } 1.360 + pos=skipNodeValue(pos, node); 1.361 + node&=kNodeTypeMask; 1.362 + } 1.363 + } 1.364 +} 1.365 + 1.366 +int32_t 1.367 +UCharsTrie::getNextUChars(Appendable &out) const { 1.368 + const UChar *pos=pos_; 1.369 + if(pos==NULL) { 1.370 + return 0; 1.371 + } 1.372 + if(remainingMatchLength_>=0) { 1.373 + out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. 1.374 + return 1; 1.375 + } 1.376 + int32_t node=*pos++; 1.377 + if(node>=kMinValueLead) { 1.378 + if(node&kValueIsFinal) { 1.379 + return 0; 1.380 + } else { 1.381 + pos=skipNodeValue(pos, node); 1.382 + node&=kNodeTypeMask; 1.383 + } 1.384 + } 1.385 + if(node<kMinLinearMatch) { 1.386 + if(node==0) { 1.387 + node=*pos++; 1.388 + } 1.389 + out.reserveAppendCapacity(++node); 1.390 + getNextBranchUChars(pos, node, out); 1.391 + return node; 1.392 + } else { 1.393 + // First unit of the linear-match node. 1.394 + out.appendCodeUnit(*pos); 1.395 + return 1; 1.396 + } 1.397 +} 1.398 + 1.399 +void 1.400 +UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { 1.401 + while(length>kMaxBranchLinearSubNodeLength) { 1.402 + ++pos; // ignore the comparison unit 1.403 + getNextBranchUChars(jumpByDelta(pos), length>>1, out); 1.404 + length=length-(length>>1); 1.405 + pos=skipDelta(pos); 1.406 + } 1.407 + do { 1.408 + out.appendCodeUnit(*pos++); 1.409 + pos=skipValue(pos); 1.410 + } while(--length>1); 1.411 + out.appendCodeUnit(*pos); 1.412 +} 1.413 + 1.414 +U_NAMESPACE_END