intl/icu/source/common/bytestrie.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2010-2011, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: bytestrie.cpp
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2010sep25
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #include "unicode/utypes.h"
michael@0 16 #include "unicode/bytestream.h"
michael@0 17 #include "unicode/bytestrie.h"
michael@0 18 #include "unicode/uobject.h"
michael@0 19 #include "cmemory.h"
michael@0 20 #include "uassert.h"
michael@0 21
michael@0 22 U_NAMESPACE_BEGIN
michael@0 23
michael@0 24 BytesTrie::~BytesTrie() {
michael@0 25 uprv_free(ownedArray_);
michael@0 26 }
michael@0 27
michael@0 28 // lead byte already shifted right by 1.
michael@0 29 int32_t
michael@0 30 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) {
michael@0 31 int32_t value;
michael@0 32 if(leadByte<kMinTwoByteValueLead) {
michael@0 33 value=leadByte-kMinOneByteValueLead;
michael@0 34 } else if(leadByte<kMinThreeByteValueLead) {
michael@0 35 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos;
michael@0 36 } else if(leadByte<kFourByteValueLead) {
michael@0 37 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
michael@0 38 } else if(leadByte==kFourByteValueLead) {
michael@0 39 value=(pos[0]<<16)|(pos[1]<<8)|pos[2];
michael@0 40 } else {
michael@0 41 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
michael@0 42 }
michael@0 43 return value;
michael@0 44 }
michael@0 45
michael@0 46 const uint8_t *
michael@0 47 BytesTrie::jumpByDelta(const uint8_t *pos) {
michael@0 48 int32_t delta=*pos++;
michael@0 49 if(delta<kMinTwoByteDeltaLead) {
michael@0 50 // nothing to do
michael@0 51 } else if(delta<kMinThreeByteDeltaLead) {
michael@0 52 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++;
michael@0 53 } else if(delta<kFourByteDeltaLead) {
michael@0 54 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1];
michael@0 55 pos+=2;
michael@0 56 } else if(delta==kFourByteDeltaLead) {
michael@0 57 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
michael@0 58 pos+=3;
michael@0 59 } else {
michael@0 60 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
michael@0 61 pos+=4;
michael@0 62 }
michael@0 63 return pos+delta;
michael@0 64 }
michael@0 65
michael@0 66 UStringTrieResult
michael@0 67 BytesTrie::current() const {
michael@0 68 const uint8_t *pos=pos_;
michael@0 69 if(pos==NULL) {
michael@0 70 return USTRINGTRIE_NO_MATCH;
michael@0 71 } else {
michael@0 72 int32_t node;
michael@0 73 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ?
michael@0 74 valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 75 }
michael@0 76 }
michael@0 77
michael@0 78 UStringTrieResult
michael@0 79 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) {
michael@0 80 // Branch according to the current byte.
michael@0 81 if(length==0) {
michael@0 82 length=*pos++;
michael@0 83 }
michael@0 84 ++length;
michael@0 85 // The length of the branch is the number of bytes to select from.
michael@0 86 // The data structure encodes a binary search.
michael@0 87 while(length>kMaxBranchLinearSubNodeLength) {
michael@0 88 if(inByte<*pos++) {
michael@0 89 length>>=1;
michael@0 90 pos=jumpByDelta(pos);
michael@0 91 } else {
michael@0 92 length=length-(length>>1);
michael@0 93 pos=skipDelta(pos);
michael@0 94 }
michael@0 95 }
michael@0 96 // Drop down to linear search for the last few bytes.
michael@0 97 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
michael@0 98 // and divides length by 2.
michael@0 99 do {
michael@0 100 if(inByte==*pos++) {
michael@0 101 UStringTrieResult result;
michael@0 102 int32_t node=*pos;
michael@0 103 U_ASSERT(node>=kMinValueLead);
michael@0 104 if(node&kValueIsFinal) {
michael@0 105 // Leave the final value for getValue() to read.
michael@0 106 result=USTRINGTRIE_FINAL_VALUE;
michael@0 107 } else {
michael@0 108 // Use the non-final value as the jump delta.
michael@0 109 ++pos;
michael@0 110 // int32_t delta=readValue(pos, node>>1);
michael@0 111 node>>=1;
michael@0 112 int32_t delta;
michael@0 113 if(node<kMinTwoByteValueLead) {
michael@0 114 delta=node-kMinOneByteValueLead;
michael@0 115 } else if(node<kMinThreeByteValueLead) {
michael@0 116 delta=((node-kMinTwoByteValueLead)<<8)|*pos++;
michael@0 117 } else if(node<kFourByteValueLead) {
michael@0 118 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1];
michael@0 119 pos+=2;
michael@0 120 } else if(node==kFourByteValueLead) {
michael@0 121 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2];
michael@0 122 pos+=3;
michael@0 123 } else {
michael@0 124 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3];
michael@0 125 pos+=4;
michael@0 126 }
michael@0 127 // end readValue()
michael@0 128 pos+=delta;
michael@0 129 node=*pos;
michael@0 130 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 131 }
michael@0 132 pos_=pos;
michael@0 133 return result;
michael@0 134 }
michael@0 135 --length;
michael@0 136 pos=skipValue(pos);
michael@0 137 } while(length>1);
michael@0 138 if(inByte==*pos++) {
michael@0 139 pos_=pos;
michael@0 140 int32_t node=*pos;
michael@0 141 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 142 } else {
michael@0 143 stop();
michael@0 144 return USTRINGTRIE_NO_MATCH;
michael@0 145 }
michael@0 146 }
michael@0 147
michael@0 148 UStringTrieResult
michael@0 149 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) {
michael@0 150 for(;;) {
michael@0 151 int32_t node=*pos++;
michael@0 152 if(node<kMinLinearMatch) {
michael@0 153 return branchNext(pos, node, inByte);
michael@0 154 } else if(node<kMinValueLead) {
michael@0 155 // Match the first of length+1 bytes.
michael@0 156 int32_t length=node-kMinLinearMatch; // Actual match length minus 1.
michael@0 157 if(inByte==*pos++) {
michael@0 158 remainingMatchLength_=--length;
michael@0 159 pos_=pos;
michael@0 160 return (length<0 && (node=*pos)>=kMinValueLead) ?
michael@0 161 valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 162 } else {
michael@0 163 // No match.
michael@0 164 break;
michael@0 165 }
michael@0 166 } else if(node&kValueIsFinal) {
michael@0 167 // No further matching bytes.
michael@0 168 break;
michael@0 169 } else {
michael@0 170 // Skip intermediate value.
michael@0 171 pos=skipValue(pos, node);
michael@0 172 // The next node must not also be a value node.
michael@0 173 U_ASSERT(*pos<kMinValueLead);
michael@0 174 }
michael@0 175 }
michael@0 176 stop();
michael@0 177 return USTRINGTRIE_NO_MATCH;
michael@0 178 }
michael@0 179
michael@0 180 UStringTrieResult
michael@0 181 BytesTrie::next(int32_t inByte) {
michael@0 182 const uint8_t *pos=pos_;
michael@0 183 if(pos==NULL) {
michael@0 184 return USTRINGTRIE_NO_MATCH;
michael@0 185 }
michael@0 186 if(inByte<0) {
michael@0 187 inByte+=0x100;
michael@0 188 }
michael@0 189 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
michael@0 190 if(length>=0) {
michael@0 191 // Remaining part of a linear-match node.
michael@0 192 if(inByte==*pos++) {
michael@0 193 remainingMatchLength_=--length;
michael@0 194 pos_=pos;
michael@0 195 int32_t node;
michael@0 196 return (length<0 && (node=*pos)>=kMinValueLead) ?
michael@0 197 valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 198 } else {
michael@0 199 stop();
michael@0 200 return USTRINGTRIE_NO_MATCH;
michael@0 201 }
michael@0 202 }
michael@0 203 return nextImpl(pos, inByte);
michael@0 204 }
michael@0 205
michael@0 206 UStringTrieResult
michael@0 207 BytesTrie::next(const char *s, int32_t sLength) {
michael@0 208 if(sLength<0 ? *s==0 : sLength==0) {
michael@0 209 // Empty input.
michael@0 210 return current();
michael@0 211 }
michael@0 212 const uint8_t *pos=pos_;
michael@0 213 if(pos==NULL) {
michael@0 214 return USTRINGTRIE_NO_MATCH;
michael@0 215 }
michael@0 216 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1.
michael@0 217 for(;;) {
michael@0 218 // Fetch the next input byte, if there is one.
michael@0 219 // Continue a linear-match node without rechecking sLength<0.
michael@0 220 int32_t inByte;
michael@0 221 if(sLength<0) {
michael@0 222 for(;;) {
michael@0 223 if((inByte=*s++)==0) {
michael@0 224 remainingMatchLength_=length;
michael@0 225 pos_=pos;
michael@0 226 int32_t node;
michael@0 227 return (length<0 && (node=*pos)>=kMinValueLead) ?
michael@0 228 valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 229 }
michael@0 230 if(length<0) {
michael@0 231 remainingMatchLength_=length;
michael@0 232 break;
michael@0 233 }
michael@0 234 if(inByte!=*pos) {
michael@0 235 stop();
michael@0 236 return USTRINGTRIE_NO_MATCH;
michael@0 237 }
michael@0 238 ++pos;
michael@0 239 --length;
michael@0 240 }
michael@0 241 } else {
michael@0 242 for(;;) {
michael@0 243 if(sLength==0) {
michael@0 244 remainingMatchLength_=length;
michael@0 245 pos_=pos;
michael@0 246 int32_t node;
michael@0 247 return (length<0 && (node=*pos)>=kMinValueLead) ?
michael@0 248 valueResult(node) : USTRINGTRIE_NO_VALUE;
michael@0 249 }
michael@0 250 inByte=*s++;
michael@0 251 --sLength;
michael@0 252 if(length<0) {
michael@0 253 remainingMatchLength_=length;
michael@0 254 break;
michael@0 255 }
michael@0 256 if(inByte!=*pos) {
michael@0 257 stop();
michael@0 258 return USTRINGTRIE_NO_MATCH;
michael@0 259 }
michael@0 260 ++pos;
michael@0 261 --length;
michael@0 262 }
michael@0 263 }
michael@0 264 for(;;) {
michael@0 265 int32_t node=*pos++;
michael@0 266 if(node<kMinLinearMatch) {
michael@0 267 UStringTrieResult result=branchNext(pos, node, inByte);
michael@0 268 if(result==USTRINGTRIE_NO_MATCH) {
michael@0 269 return USTRINGTRIE_NO_MATCH;
michael@0 270 }
michael@0 271 // Fetch the next input byte, if there is one.
michael@0 272 if(sLength<0) {
michael@0 273 if((inByte=*s++)==0) {
michael@0 274 return result;
michael@0 275 }
michael@0 276 } else {
michael@0 277 if(sLength==0) {
michael@0 278 return result;
michael@0 279 }
michael@0 280 inByte=*s++;
michael@0 281 --sLength;
michael@0 282 }
michael@0 283 if(result==USTRINGTRIE_FINAL_VALUE) {
michael@0 284 // No further matching bytes.
michael@0 285 stop();
michael@0 286 return USTRINGTRIE_NO_MATCH;
michael@0 287 }
michael@0 288 pos=pos_; // branchNext() advanced pos and wrote it to pos_ .
michael@0 289 } else if(node<kMinValueLead) {
michael@0 290 // Match length+1 bytes.
michael@0 291 length=node-kMinLinearMatch; // Actual match length minus 1.
michael@0 292 if(inByte!=*pos) {
michael@0 293 stop();
michael@0 294 return USTRINGTRIE_NO_MATCH;
michael@0 295 }
michael@0 296 ++pos;
michael@0 297 --length;
michael@0 298 break;
michael@0 299 } else if(node&kValueIsFinal) {
michael@0 300 // No further matching bytes.
michael@0 301 stop();
michael@0 302 return USTRINGTRIE_NO_MATCH;
michael@0 303 } else {
michael@0 304 // Skip intermediate value.
michael@0 305 pos=skipValue(pos, node);
michael@0 306 // The next node must not also be a value node.
michael@0 307 U_ASSERT(*pos<kMinValueLead);
michael@0 308 }
michael@0 309 }
michael@0 310 }
michael@0 311 }
michael@0 312
michael@0 313 const uint8_t *
michael@0 314 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
michael@0 315 UBool haveUniqueValue, int32_t &uniqueValue) {
michael@0 316 while(length>kMaxBranchLinearSubNodeLength) {
michael@0 317 ++pos; // ignore the comparison byte
michael@0 318 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) {
michael@0 319 return NULL;
michael@0 320 }
michael@0 321 length=length-(length>>1);
michael@0 322 pos=skipDelta(pos);
michael@0 323 }
michael@0 324 do {
michael@0 325 ++pos; // ignore a comparison byte
michael@0 326 // handle its value
michael@0 327 int32_t node=*pos++;
michael@0 328 UBool isFinal=(UBool)(node&kValueIsFinal);
michael@0 329 int32_t value=readValue(pos, node>>1);
michael@0 330 pos=skipValue(pos, node);
michael@0 331 if(isFinal) {
michael@0 332 if(haveUniqueValue) {
michael@0 333 if(value!=uniqueValue) {
michael@0 334 return NULL;
michael@0 335 }
michael@0 336 } else {
michael@0 337 uniqueValue=value;
michael@0 338 haveUniqueValue=TRUE;
michael@0 339 }
michael@0 340 } else {
michael@0 341 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) {
michael@0 342 return NULL;
michael@0 343 }
michael@0 344 haveUniqueValue=TRUE;
michael@0 345 }
michael@0 346 } while(--length>1);
michael@0 347 return pos+1; // ignore the last comparison byte
michael@0 348 }
michael@0 349
michael@0 350 UBool
michael@0 351 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) {
michael@0 352 for(;;) {
michael@0 353 int32_t node=*pos++;
michael@0 354 if(node<kMinLinearMatch) {
michael@0 355 if(node==0) {
michael@0 356 node=*pos++;
michael@0 357 }
michael@0 358 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue);
michael@0 359 if(pos==NULL) {
michael@0 360 return FALSE;
michael@0 361 }
michael@0 362 haveUniqueValue=TRUE;
michael@0 363 } else if(node<kMinValueLead) {
michael@0 364 // linear-match node
michael@0 365 pos+=node-kMinLinearMatch+1; // Ignore the match bytes.
michael@0 366 } else {
michael@0 367 UBool isFinal=(UBool)(node&kValueIsFinal);
michael@0 368 int32_t value=readValue(pos, node>>1);
michael@0 369 if(haveUniqueValue) {
michael@0 370 if(value!=uniqueValue) {
michael@0 371 return FALSE;
michael@0 372 }
michael@0 373 } else {
michael@0 374 uniqueValue=value;
michael@0 375 haveUniqueValue=TRUE;
michael@0 376 }
michael@0 377 if(isFinal) {
michael@0 378 return TRUE;
michael@0 379 }
michael@0 380 pos=skipValue(pos, node);
michael@0 381 }
michael@0 382 }
michael@0 383 }
michael@0 384
michael@0 385 int32_t
michael@0 386 BytesTrie::getNextBytes(ByteSink &out) const {
michael@0 387 const uint8_t *pos=pos_;
michael@0 388 if(pos==NULL) {
michael@0 389 return 0;
michael@0 390 }
michael@0 391 if(remainingMatchLength_>=0) {
michael@0 392 append(out, *pos); // Next byte of a pending linear-match node.
michael@0 393 return 1;
michael@0 394 }
michael@0 395 int32_t node=*pos++;
michael@0 396 if(node>=kMinValueLead) {
michael@0 397 if(node&kValueIsFinal) {
michael@0 398 return 0;
michael@0 399 } else {
michael@0 400 pos=skipValue(pos, node);
michael@0 401 node=*pos++;
michael@0 402 U_ASSERT(node<kMinValueLead);
michael@0 403 }
michael@0 404 }
michael@0 405 if(node<kMinLinearMatch) {
michael@0 406 if(node==0) {
michael@0 407 node=*pos++;
michael@0 408 }
michael@0 409 getNextBranchBytes(pos, ++node, out);
michael@0 410 return node;
michael@0 411 } else {
michael@0 412 // First byte of the linear-match node.
michael@0 413 append(out, *pos);
michael@0 414 return 1;
michael@0 415 }
michael@0 416 }
michael@0 417
michael@0 418 void
michael@0 419 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) {
michael@0 420 while(length>kMaxBranchLinearSubNodeLength) {
michael@0 421 ++pos; // ignore the comparison byte
michael@0 422 getNextBranchBytes(jumpByDelta(pos), length>>1, out);
michael@0 423 length=length-(length>>1);
michael@0 424 pos=skipDelta(pos);
michael@0 425 }
michael@0 426 do {
michael@0 427 append(out, *pos++);
michael@0 428 pos=skipValue(pos);
michael@0 429 } while(--length>1);
michael@0 430 append(out, *pos);
michael@0 431 }
michael@0 432
michael@0 433 void
michael@0 434 BytesTrie::append(ByteSink &out, int c) {
michael@0 435 char ch=(char)c;
michael@0 436 out.Append(&ch, 1);
michael@0 437 }
michael@0 438
michael@0 439 U_NAMESPACE_END

mercurial