1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/bytestrie.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,439 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2011, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: bytestrie.cpp 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010sep25 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#include "unicode/utypes.h" 1.19 +#include "unicode/bytestream.h" 1.20 +#include "unicode/bytestrie.h" 1.21 +#include "unicode/uobject.h" 1.22 +#include "cmemory.h" 1.23 +#include "uassert.h" 1.24 + 1.25 +U_NAMESPACE_BEGIN 1.26 + 1.27 +BytesTrie::~BytesTrie() { 1.28 + uprv_free(ownedArray_); 1.29 +} 1.30 + 1.31 +// lead byte already shifted right by 1. 1.32 +int32_t 1.33 +BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) { 1.34 + int32_t value; 1.35 + if(leadByte<kMinTwoByteValueLead) { 1.36 + value=leadByte-kMinOneByteValueLead; 1.37 + } else if(leadByte<kMinThreeByteValueLead) { 1.38 + value=((leadByte-kMinTwoByteValueLead)<<8)|*pos; 1.39 + } else if(leadByte<kFourByteValueLead) { 1.40 + value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 1.41 + } else if(leadByte==kFourByteValueLead) { 1.42 + value=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 1.43 + } else { 1.44 + value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 1.45 + } 1.46 + return value; 1.47 +} 1.48 + 1.49 +const uint8_t * 1.50 +BytesTrie::jumpByDelta(const uint8_t *pos) { 1.51 + int32_t delta=*pos++; 1.52 + if(delta<kMinTwoByteDeltaLead) { 1.53 + // nothing to do 1.54 + } else if(delta<kMinThreeByteDeltaLead) { 1.55 + delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++; 1.56 + } else if(delta<kFourByteDeltaLead) { 1.57 + delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1]; 1.58 + pos+=2; 1.59 + } else if(delta==kFourByteDeltaLead) { 1.60 + delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 1.61 + pos+=3; 1.62 + } else { 1.63 + delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 1.64 + pos+=4; 1.65 + } 1.66 + return pos+delta; 1.67 +} 1.68 + 1.69 +UStringTrieResult 1.70 +BytesTrie::current() const { 1.71 + const uint8_t *pos=pos_; 1.72 + if(pos==NULL) { 1.73 + return USTRINGTRIE_NO_MATCH; 1.74 + } else { 1.75 + int32_t node; 1.76 + return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? 1.77 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.78 + } 1.79 +} 1.80 + 1.81 +UStringTrieResult 1.82 +BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) { 1.83 + // Branch according to the current byte. 1.84 + if(length==0) { 1.85 + length=*pos++; 1.86 + } 1.87 + ++length; 1.88 + // The length of the branch is the number of bytes to select from. 1.89 + // The data structure encodes a binary search. 1.90 + while(length>kMaxBranchLinearSubNodeLength) { 1.91 + if(inByte<*pos++) { 1.92 + length>>=1; 1.93 + pos=jumpByDelta(pos); 1.94 + } else { 1.95 + length=length-(length>>1); 1.96 + pos=skipDelta(pos); 1.97 + } 1.98 + } 1.99 + // Drop down to linear search for the last few bytes. 1.100 + // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 1.101 + // and divides length by 2. 1.102 + do { 1.103 + if(inByte==*pos++) { 1.104 + UStringTrieResult result; 1.105 + int32_t node=*pos; 1.106 + U_ASSERT(node>=kMinValueLead); 1.107 + if(node&kValueIsFinal) { 1.108 + // Leave the final value for getValue() to read. 1.109 + result=USTRINGTRIE_FINAL_VALUE; 1.110 + } else { 1.111 + // Use the non-final value as the jump delta. 1.112 + ++pos; 1.113 + // int32_t delta=readValue(pos, node>>1); 1.114 + node>>=1; 1.115 + int32_t delta; 1.116 + if(node<kMinTwoByteValueLead) { 1.117 + delta=node-kMinOneByteValueLead; 1.118 + } else if(node<kMinThreeByteValueLead) { 1.119 + delta=((node-kMinTwoByteValueLead)<<8)|*pos++; 1.120 + } else if(node<kFourByteValueLead) { 1.121 + delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; 1.122 + pos+=2; 1.123 + } else if(node==kFourByteValueLead) { 1.124 + delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; 1.125 + pos+=3; 1.126 + } else { 1.127 + delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; 1.128 + pos+=4; 1.129 + } 1.130 + // end readValue() 1.131 + pos+=delta; 1.132 + node=*pos; 1.133 + result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 1.134 + } 1.135 + pos_=pos; 1.136 + return result; 1.137 + } 1.138 + --length; 1.139 + pos=skipValue(pos); 1.140 + } while(length>1); 1.141 + if(inByte==*pos++) { 1.142 + pos_=pos; 1.143 + int32_t node=*pos; 1.144 + return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; 1.145 + } else { 1.146 + stop(); 1.147 + return USTRINGTRIE_NO_MATCH; 1.148 + } 1.149 +} 1.150 + 1.151 +UStringTrieResult 1.152 +BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) { 1.153 + for(;;) { 1.154 + int32_t node=*pos++; 1.155 + if(node<kMinLinearMatch) { 1.156 + return branchNext(pos, node, inByte); 1.157 + } else if(node<kMinValueLead) { 1.158 + // Match the first of length+1 bytes. 1.159 + int32_t length=node-kMinLinearMatch; // Actual match length minus 1. 1.160 + if(inByte==*pos++) { 1.161 + remainingMatchLength_=--length; 1.162 + pos_=pos; 1.163 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.164 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.165 + } else { 1.166 + // No match. 1.167 + break; 1.168 + } 1.169 + } else if(node&kValueIsFinal) { 1.170 + // No further matching bytes. 1.171 + break; 1.172 + } else { 1.173 + // Skip intermediate value. 1.174 + pos=skipValue(pos, node); 1.175 + // The next node must not also be a value node. 1.176 + U_ASSERT(*pos<kMinValueLead); 1.177 + } 1.178 + } 1.179 + stop(); 1.180 + return USTRINGTRIE_NO_MATCH; 1.181 +} 1.182 + 1.183 +UStringTrieResult 1.184 +BytesTrie::next(int32_t inByte) { 1.185 + const uint8_t *pos=pos_; 1.186 + if(pos==NULL) { 1.187 + return USTRINGTRIE_NO_MATCH; 1.188 + } 1.189 + if(inByte<0) { 1.190 + inByte+=0x100; 1.191 + } 1.192 + int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 1.193 + if(length>=0) { 1.194 + // Remaining part of a linear-match node. 1.195 + if(inByte==*pos++) { 1.196 + remainingMatchLength_=--length; 1.197 + pos_=pos; 1.198 + int32_t node; 1.199 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.200 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.201 + } else { 1.202 + stop(); 1.203 + return USTRINGTRIE_NO_MATCH; 1.204 + } 1.205 + } 1.206 + return nextImpl(pos, inByte); 1.207 +} 1.208 + 1.209 +UStringTrieResult 1.210 +BytesTrie::next(const char *s, int32_t sLength) { 1.211 + if(sLength<0 ? *s==0 : sLength==0) { 1.212 + // Empty input. 1.213 + return current(); 1.214 + } 1.215 + const uint8_t *pos=pos_; 1.216 + if(pos==NULL) { 1.217 + return USTRINGTRIE_NO_MATCH; 1.218 + } 1.219 + int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. 1.220 + for(;;) { 1.221 + // Fetch the next input byte, if there is one. 1.222 + // Continue a linear-match node without rechecking sLength<0. 1.223 + int32_t inByte; 1.224 + if(sLength<0) { 1.225 + for(;;) { 1.226 + if((inByte=*s++)==0) { 1.227 + remainingMatchLength_=length; 1.228 + pos_=pos; 1.229 + int32_t node; 1.230 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.231 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.232 + } 1.233 + if(length<0) { 1.234 + remainingMatchLength_=length; 1.235 + break; 1.236 + } 1.237 + if(inByte!=*pos) { 1.238 + stop(); 1.239 + return USTRINGTRIE_NO_MATCH; 1.240 + } 1.241 + ++pos; 1.242 + --length; 1.243 + } 1.244 + } else { 1.245 + for(;;) { 1.246 + if(sLength==0) { 1.247 + remainingMatchLength_=length; 1.248 + pos_=pos; 1.249 + int32_t node; 1.250 + return (length<0 && (node=*pos)>=kMinValueLead) ? 1.251 + valueResult(node) : USTRINGTRIE_NO_VALUE; 1.252 + } 1.253 + inByte=*s++; 1.254 + --sLength; 1.255 + if(length<0) { 1.256 + remainingMatchLength_=length; 1.257 + break; 1.258 + } 1.259 + if(inByte!=*pos) { 1.260 + stop(); 1.261 + return USTRINGTRIE_NO_MATCH; 1.262 + } 1.263 + ++pos; 1.264 + --length; 1.265 + } 1.266 + } 1.267 + for(;;) { 1.268 + int32_t node=*pos++; 1.269 + if(node<kMinLinearMatch) { 1.270 + UStringTrieResult result=branchNext(pos, node, inByte); 1.271 + if(result==USTRINGTRIE_NO_MATCH) { 1.272 + return USTRINGTRIE_NO_MATCH; 1.273 + } 1.274 + // Fetch the next input byte, if there is one. 1.275 + if(sLength<0) { 1.276 + if((inByte=*s++)==0) { 1.277 + return result; 1.278 + } 1.279 + } else { 1.280 + if(sLength==0) { 1.281 + return result; 1.282 + } 1.283 + inByte=*s++; 1.284 + --sLength; 1.285 + } 1.286 + if(result==USTRINGTRIE_FINAL_VALUE) { 1.287 + // No further matching bytes. 1.288 + stop(); 1.289 + return USTRINGTRIE_NO_MATCH; 1.290 + } 1.291 + pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 1.292 + } else if(node<kMinValueLead) { 1.293 + // Match length+1 bytes. 1.294 + length=node-kMinLinearMatch; // Actual match length minus 1. 1.295 + if(inByte!=*pos) { 1.296 + stop(); 1.297 + return USTRINGTRIE_NO_MATCH; 1.298 + } 1.299 + ++pos; 1.300 + --length; 1.301 + break; 1.302 + } else if(node&kValueIsFinal) { 1.303 + // No further matching bytes. 1.304 + stop(); 1.305 + return USTRINGTRIE_NO_MATCH; 1.306 + } else { 1.307 + // Skip intermediate value. 1.308 + pos=skipValue(pos, node); 1.309 + // The next node must not also be a value node. 1.310 + U_ASSERT(*pos<kMinValueLead); 1.311 + } 1.312 + } 1.313 + } 1.314 +} 1.315 + 1.316 +const uint8_t * 1.317 +BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 1.318 + UBool haveUniqueValue, int32_t &uniqueValue) { 1.319 + while(length>kMaxBranchLinearSubNodeLength) { 1.320 + ++pos; // ignore the comparison byte 1.321 + if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { 1.322 + return NULL; 1.323 + } 1.324 + length=length-(length>>1); 1.325 + pos=skipDelta(pos); 1.326 + } 1.327 + do { 1.328 + ++pos; // ignore a comparison byte 1.329 + // handle its value 1.330 + int32_t node=*pos++; 1.331 + UBool isFinal=(UBool)(node&kValueIsFinal); 1.332 + int32_t value=readValue(pos, node>>1); 1.333 + pos=skipValue(pos, node); 1.334 + if(isFinal) { 1.335 + if(haveUniqueValue) { 1.336 + if(value!=uniqueValue) { 1.337 + return NULL; 1.338 + } 1.339 + } else { 1.340 + uniqueValue=value; 1.341 + haveUniqueValue=TRUE; 1.342 + } 1.343 + } else { 1.344 + if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { 1.345 + return NULL; 1.346 + } 1.347 + haveUniqueValue=TRUE; 1.348 + } 1.349 + } while(--length>1); 1.350 + return pos+1; // ignore the last comparison byte 1.351 +} 1.352 + 1.353 +UBool 1.354 +BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) { 1.355 + for(;;) { 1.356 + int32_t node=*pos++; 1.357 + if(node<kMinLinearMatch) { 1.358 + if(node==0) { 1.359 + node=*pos++; 1.360 + } 1.361 + pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); 1.362 + if(pos==NULL) { 1.363 + return FALSE; 1.364 + } 1.365 + haveUniqueValue=TRUE; 1.366 + } else if(node<kMinValueLead) { 1.367 + // linear-match node 1.368 + pos+=node-kMinLinearMatch+1; // Ignore the match bytes. 1.369 + } else { 1.370 + UBool isFinal=(UBool)(node&kValueIsFinal); 1.371 + int32_t value=readValue(pos, node>>1); 1.372 + if(haveUniqueValue) { 1.373 + if(value!=uniqueValue) { 1.374 + return FALSE; 1.375 + } 1.376 + } else { 1.377 + uniqueValue=value; 1.378 + haveUniqueValue=TRUE; 1.379 + } 1.380 + if(isFinal) { 1.381 + return TRUE; 1.382 + } 1.383 + pos=skipValue(pos, node); 1.384 + } 1.385 + } 1.386 +} 1.387 + 1.388 +int32_t 1.389 +BytesTrie::getNextBytes(ByteSink &out) const { 1.390 + const uint8_t *pos=pos_; 1.391 + if(pos==NULL) { 1.392 + return 0; 1.393 + } 1.394 + if(remainingMatchLength_>=0) { 1.395 + append(out, *pos); // Next byte of a pending linear-match node. 1.396 + return 1; 1.397 + } 1.398 + int32_t node=*pos++; 1.399 + if(node>=kMinValueLead) { 1.400 + if(node&kValueIsFinal) { 1.401 + return 0; 1.402 + } else { 1.403 + pos=skipValue(pos, node); 1.404 + node=*pos++; 1.405 + U_ASSERT(node<kMinValueLead); 1.406 + } 1.407 + } 1.408 + if(node<kMinLinearMatch) { 1.409 + if(node==0) { 1.410 + node=*pos++; 1.411 + } 1.412 + getNextBranchBytes(pos, ++node, out); 1.413 + return node; 1.414 + } else { 1.415 + // First byte of the linear-match node. 1.416 + append(out, *pos); 1.417 + return 1; 1.418 + } 1.419 +} 1.420 + 1.421 +void 1.422 +BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) { 1.423 + while(length>kMaxBranchLinearSubNodeLength) { 1.424 + ++pos; // ignore the comparison byte 1.425 + getNextBranchBytes(jumpByDelta(pos), length>>1, out); 1.426 + length=length-(length>>1); 1.427 + pos=skipDelta(pos); 1.428 + } 1.429 + do { 1.430 + append(out, *pos++); 1.431 + pos=skipValue(pos); 1.432 + } while(--length>1); 1.433 + append(out, *pos); 1.434 +} 1.435 + 1.436 +void 1.437 +BytesTrie::append(ByteSink &out, int c) { 1.438 + char ch=(char)c; 1.439 + out.Append(&ch, 1); 1.440 +} 1.441 + 1.442 +U_NAMESPACE_END