1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unicode/bytestrie.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,519 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: bytestrie.h 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010sep25 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#ifndef __BYTESTRIE_H__ 1.19 +#define __BYTESTRIE_H__ 1.20 + 1.21 +/** 1.22 + * \file 1.23 + * \brief C++ API: Trie for mapping byte sequences to integer values. 1.24 + */ 1.25 + 1.26 +#include "unicode/utypes.h" 1.27 +#include "unicode/stringpiece.h" 1.28 +#include "unicode/uobject.h" 1.29 +#include "unicode/ustringtrie.h" 1.30 + 1.31 +U_NAMESPACE_BEGIN 1.32 + 1.33 +class ByteSink; 1.34 +class BytesTrieBuilder; 1.35 +class CharString; 1.36 +class UVector32; 1.37 + 1.38 +/** 1.39 + * Light-weight, non-const reader class for a BytesTrie. 1.40 + * Traverses a byte-serialized data structure with minimal state, 1.41 + * for mapping byte sequences to non-negative integer values. 1.42 + * 1.43 + * This class owns the serialized trie data only if it was constructed by 1.44 + * the builder's build() method. 1.45 + * The public constructor and the copy constructor only alias the data (only copy the pointer). 1.46 + * There is no assignment operator. 1.47 + * 1.48 + * This class is not intended for public subclassing. 1.49 + * @stable ICU 4.8 1.50 + */ 1.51 +class U_COMMON_API BytesTrie : public UMemory { 1.52 +public: 1.53 + /** 1.54 + * Constructs a BytesTrie reader instance. 1.55 + * 1.56 + * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, 1.57 + * starting with the first byte of that sequence. 1.58 + * The BytesTrie object will not read more bytes than 1.59 + * the BytesTrieBuilder generated in the corresponding build() call. 1.60 + * 1.61 + * The array is not copied/cloned and must not be modified while 1.62 + * the BytesTrie object is in use. 1.63 + * 1.64 + * @param trieBytes The byte array that contains the serialized trie. 1.65 + * @stable ICU 4.8 1.66 + */ 1.67 + BytesTrie(const void *trieBytes) 1.68 + : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)), 1.69 + pos_(bytes_), remainingMatchLength_(-1) {} 1.70 + 1.71 + /** 1.72 + * Destructor. 1.73 + * @stable ICU 4.8 1.74 + */ 1.75 + ~BytesTrie(); 1.76 + 1.77 + /** 1.78 + * Copy constructor, copies the other trie reader object and its state, 1.79 + * but not the byte array which will be shared. (Shallow copy.) 1.80 + * @param other Another BytesTrie object. 1.81 + * @stable ICU 4.8 1.82 + */ 1.83 + BytesTrie(const BytesTrie &other) 1.84 + : ownedArray_(NULL), bytes_(other.bytes_), 1.85 + pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 1.86 + 1.87 + /** 1.88 + * Resets this trie to its initial state. 1.89 + * @return *this 1.90 + * @stable ICU 4.8 1.91 + */ 1.92 + BytesTrie &reset() { 1.93 + pos_=bytes_; 1.94 + remainingMatchLength_=-1; 1.95 + return *this; 1.96 + } 1.97 + 1.98 + /** 1.99 + * BytesTrie state object, for saving a trie's current state 1.100 + * and resetting the trie back to this state later. 1.101 + * @stable ICU 4.8 1.102 + */ 1.103 + class State : public UMemory { 1.104 + public: 1.105 + /** 1.106 + * Constructs an empty State. 1.107 + * @stable ICU 4.8 1.108 + */ 1.109 + State() { bytes=NULL; } 1.110 + private: 1.111 + friend class BytesTrie; 1.112 + 1.113 + const uint8_t *bytes; 1.114 + const uint8_t *pos; 1.115 + int32_t remainingMatchLength; 1.116 + }; 1.117 + 1.118 + /** 1.119 + * Saves the state of this trie. 1.120 + * @param state The State object to hold the trie's state. 1.121 + * @return *this 1.122 + * @see resetToState 1.123 + * @stable ICU 4.8 1.124 + */ 1.125 + const BytesTrie &saveState(State &state) const { 1.126 + state.bytes=bytes_; 1.127 + state.pos=pos_; 1.128 + state.remainingMatchLength=remainingMatchLength_; 1.129 + return *this; 1.130 + } 1.131 + 1.132 + /** 1.133 + * Resets this trie to the saved state. 1.134 + * If the state object contains no state, or the state of a different trie, 1.135 + * then this trie remains unchanged. 1.136 + * @param state The State object which holds a saved trie state. 1.137 + * @return *this 1.138 + * @see saveState 1.139 + * @see reset 1.140 + * @stable ICU 4.8 1.141 + */ 1.142 + BytesTrie &resetToState(const State &state) { 1.143 + if(bytes_==state.bytes && bytes_!=NULL) { 1.144 + pos_=state.pos; 1.145 + remainingMatchLength_=state.remainingMatchLength; 1.146 + } 1.147 + return *this; 1.148 + } 1.149 + 1.150 + /** 1.151 + * Determines whether the byte sequence so far matches, whether it has a value, 1.152 + * and whether another input byte can continue a matching byte sequence. 1.153 + * @return The match/value Result. 1.154 + * @stable ICU 4.8 1.155 + */ 1.156 + UStringTrieResult current() const; 1.157 + 1.158 + /** 1.159 + * Traverses the trie from the initial state for this input byte. 1.160 + * Equivalent to reset().next(inByte). 1.161 + * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 1.162 + * Values below -0x100 and above 0xff will never match. 1.163 + * @return The match/value Result. 1.164 + * @stable ICU 4.8 1.165 + */ 1.166 + inline UStringTrieResult first(int32_t inByte) { 1.167 + remainingMatchLength_=-1; 1.168 + if(inByte<0) { 1.169 + inByte+=0x100; 1.170 + } 1.171 + return nextImpl(bytes_, inByte); 1.172 + } 1.173 + 1.174 + /** 1.175 + * Traverses the trie from the current state for this input byte. 1.176 + * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 1.177 + * Values below -0x100 and above 0xff will never match. 1.178 + * @return The match/value Result. 1.179 + * @stable ICU 4.8 1.180 + */ 1.181 + UStringTrieResult next(int32_t inByte); 1.182 + 1.183 + /** 1.184 + * Traverses the trie from the current state for this byte sequence. 1.185 + * Equivalent to 1.186 + * \code 1.187 + * Result result=current(); 1.188 + * for(each c in s) 1.189 + * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 1.190 + * result=next(c); 1.191 + * return result; 1.192 + * \endcode 1.193 + * @param s A string or byte sequence. Can be NULL if length is 0. 1.194 + * @param length The length of the byte sequence. Can be -1 if NUL-terminated. 1.195 + * @return The match/value Result. 1.196 + * @stable ICU 4.8 1.197 + */ 1.198 + UStringTrieResult next(const char *s, int32_t length); 1.199 + 1.200 + /** 1.201 + * Returns a matching byte sequence's value if called immediately after 1.202 + * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 1.203 + * getValue() can be called multiple times. 1.204 + * 1.205 + * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 1.206 + * @return The value for the byte sequence so far. 1.207 + * @stable ICU 4.8 1.208 + */ 1.209 + inline int32_t getValue() const { 1.210 + const uint8_t *pos=pos_; 1.211 + int32_t leadByte=*pos++; 1.212 + // U_ASSERT(leadByte>=kMinValueLead); 1.213 + return readValue(pos, leadByte>>1); 1.214 + } 1.215 + 1.216 + /** 1.217 + * Determines whether all byte sequences reachable from the current state 1.218 + * map to the same value. 1.219 + * @param uniqueValue Receives the unique value, if this function returns TRUE. 1.220 + * (output-only) 1.221 + * @return TRUE if all byte sequences reachable from the current state 1.222 + * map to the same value. 1.223 + * @stable ICU 4.8 1.224 + */ 1.225 + inline UBool hasUniqueValue(int32_t &uniqueValue) const { 1.226 + const uint8_t *pos=pos_; 1.227 + // Skip the rest of a pending linear-match node. 1.228 + return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 1.229 + } 1.230 + 1.231 + /** 1.232 + * Finds each byte which continues the byte sequence from the current state. 1.233 + * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. 1.234 + * @param out Each next byte is appended to this object. 1.235 + * (Only uses the out.Append(s, length) method.) 1.236 + * @return the number of bytes which continue the byte sequence from here 1.237 + * @stable ICU 4.8 1.238 + */ 1.239 + int32_t getNextBytes(ByteSink &out) const; 1.240 + 1.241 + /** 1.242 + * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 1.243 + * @stable ICU 4.8 1.244 + */ 1.245 + class U_COMMON_API Iterator : public UMemory { 1.246 + public: 1.247 + /** 1.248 + * Iterates from the root of a byte-serialized BytesTrie. 1.249 + * @param trieBytes The trie bytes. 1.250 + * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 1.251 + * Otherwise, the iterator returns strings with this maximum length. 1.252 + * @param errorCode Standard ICU error code. Its input value must 1.253 + * pass the U_SUCCESS() test, or else the function returns 1.254 + * immediately. Check for U_FAILURE() on output or use with 1.255 + * function chaining. (See User Guide for details.) 1.256 + * @stable ICU 4.8 1.257 + */ 1.258 + Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 1.259 + 1.260 + /** 1.261 + * Iterates from the current state of the specified BytesTrie. 1.262 + * @param trie The trie whose state will be copied for iteration. 1.263 + * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 1.264 + * Otherwise, the iterator returns strings with this maximum length. 1.265 + * @param errorCode Standard ICU error code. Its input value must 1.266 + * pass the U_SUCCESS() test, or else the function returns 1.267 + * immediately. Check for U_FAILURE() on output or use with 1.268 + * function chaining. (See User Guide for details.) 1.269 + * @stable ICU 4.8 1.270 + */ 1.271 + Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 1.272 + 1.273 + /** 1.274 + * Destructor. 1.275 + * @stable ICU 4.8 1.276 + */ 1.277 + ~Iterator(); 1.278 + 1.279 + /** 1.280 + * Resets this iterator to its initial state. 1.281 + * @return *this 1.282 + * @stable ICU 4.8 1.283 + */ 1.284 + Iterator &reset(); 1.285 + 1.286 + /** 1.287 + * @return TRUE if there are more elements. 1.288 + * @stable ICU 4.8 1.289 + */ 1.290 + UBool hasNext() const; 1.291 + 1.292 + /** 1.293 + * Finds the next (byte sequence, value) pair if there is one. 1.294 + * 1.295 + * If the byte sequence is truncated to the maximum length and does not 1.296 + * have a real value, then the value is set to -1. 1.297 + * In this case, this "not a real value" is indistinguishable from 1.298 + * a real value of -1. 1.299 + * @param errorCode Standard ICU error code. Its input value must 1.300 + * pass the U_SUCCESS() test, or else the function returns 1.301 + * immediately. Check for U_FAILURE() on output or use with 1.302 + * function chaining. (See User Guide for details.) 1.303 + * @return TRUE if there is another element. 1.304 + * @stable ICU 4.8 1.305 + */ 1.306 + UBool next(UErrorCode &errorCode); 1.307 + 1.308 + /** 1.309 + * @return The NUL-terminated byte sequence for the last successful next(). 1.310 + * @stable ICU 4.8 1.311 + */ 1.312 + const StringPiece &getString() const { return sp_; } 1.313 + /** 1.314 + * @return The value for the last successful next(). 1.315 + * @stable ICU 4.8 1.316 + */ 1.317 + int32_t getValue() const { return value_; } 1.318 + 1.319 + private: 1.320 + UBool truncateAndStop(); 1.321 + 1.322 + const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 1.323 + 1.324 + const uint8_t *bytes_; 1.325 + const uint8_t *pos_; 1.326 + const uint8_t *initialPos_; 1.327 + int32_t remainingMatchLength_; 1.328 + int32_t initialRemainingMatchLength_; 1.329 + 1.330 + CharString *str_; 1.331 + StringPiece sp_; 1.332 + int32_t maxLength_; 1.333 + int32_t value_; 1.334 + 1.335 + // The stack stores pairs of integers for backtracking to another 1.336 + // outbound edge of a branch node. 1.337 + // The first integer is an offset from bytes_. 1.338 + // The second integer has the str_->length() from before the node in bits 15..0, 1.339 + // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 1.340 + // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 1.341 + // but the code looks more confusing that way.) 1.342 + UVector32 *stack_; 1.343 + }; 1.344 + 1.345 +private: 1.346 + friend class BytesTrieBuilder; 1.347 + 1.348 + /** 1.349 + * Constructs a BytesTrie reader instance. 1.350 + * Unlike the public constructor which just aliases an array, 1.351 + * this constructor adopts the builder's array. 1.352 + * This constructor is only called by the builder. 1.353 + */ 1.354 + BytesTrie(void *adoptBytes, const void *trieBytes) 1.355 + : ownedArray_(static_cast<uint8_t *>(adoptBytes)), 1.356 + bytes_(static_cast<const uint8_t *>(trieBytes)), 1.357 + pos_(bytes_), remainingMatchLength_(-1) {} 1.358 + 1.359 + // No assignment operator. 1.360 + BytesTrie &operator=(const BytesTrie &other); 1.361 + 1.362 + inline void stop() { 1.363 + pos_=NULL; 1.364 + } 1.365 + 1.366 + // Reads a compact 32-bit integer. 1.367 + // pos is already after the leadByte, and the lead byte is already shifted right by 1. 1.368 + static int32_t readValue(const uint8_t *pos, int32_t leadByte); 1.369 + static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 1.370 + // U_ASSERT(leadByte>=kMinValueLead); 1.371 + if(leadByte>=(kMinTwoByteValueLead<<1)) { 1.372 + if(leadByte<(kMinThreeByteValueLead<<1)) { 1.373 + ++pos; 1.374 + } else if(leadByte<(kFourByteValueLead<<1)) { 1.375 + pos+=2; 1.376 + } else { 1.377 + pos+=3+((leadByte>>1)&1); 1.378 + } 1.379 + } 1.380 + return pos; 1.381 + } 1.382 + static inline const uint8_t *skipValue(const uint8_t *pos) { 1.383 + int32_t leadByte=*pos++; 1.384 + return skipValue(pos, leadByte); 1.385 + } 1.386 + 1.387 + // Reads a jump delta and jumps. 1.388 + static const uint8_t *jumpByDelta(const uint8_t *pos); 1.389 + 1.390 + static inline const uint8_t *skipDelta(const uint8_t *pos) { 1.391 + int32_t delta=*pos++; 1.392 + if(delta>=kMinTwoByteDeltaLead) { 1.393 + if(delta<kMinThreeByteDeltaLead) { 1.394 + ++pos; 1.395 + } else if(delta<kFourByteDeltaLead) { 1.396 + pos+=2; 1.397 + } else { 1.398 + pos+=3+(delta&1); 1.399 + } 1.400 + } 1.401 + return pos; 1.402 + } 1.403 + 1.404 + static inline UStringTrieResult valueResult(int32_t node) { 1.405 + return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 1.406 + } 1.407 + 1.408 + // Handles a branch node for both next(byte) and next(string). 1.409 + UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 1.410 + 1.411 + // Requires remainingLength_<0. 1.412 + UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 1.413 + 1.414 + // Helper functions for hasUniqueValue(). 1.415 + // Recursively finds a unique value (or whether there is not a unique one) 1.416 + // from a branch. 1.417 + static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 1.418 + UBool haveUniqueValue, int32_t &uniqueValue); 1.419 + // Recursively finds a unique value (or whether there is not a unique one) 1.420 + // starting from a position on a node lead byte. 1.421 + static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 1.422 + 1.423 + // Helper functions for getNextBytes(). 1.424 + // getNextBytes() when pos is on a branch node. 1.425 + static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 1.426 + static void append(ByteSink &out, int c); 1.427 + 1.428 + // BytesTrie data structure 1.429 + // 1.430 + // The trie consists of a series of byte-serialized nodes for incremental 1.431 + // string/byte sequence matching. The root node is at the beginning of the trie data. 1.432 + // 1.433 + // Types of nodes are distinguished by their node lead byte ranges. 1.434 + // After each node, except a final-value node, another node follows to 1.435 + // encode match values or continue matching further bytes. 1.436 + // 1.437 + // Node types: 1.438 + // - Value node: Stores a 32-bit integer in a compact, variable-length format. 1.439 + // The value is for the string/byte sequence so far. 1.440 + // One node bit indicates whether the value is final or whether 1.441 + // matching continues with the next node. 1.442 + // - Linear-match node: Matches a number of bytes. 1.443 + // - Branch node: Branches to other nodes according to the current input byte. 1.444 + // The node byte is the length of the branch (number of bytes to select from) 1.445 + // minus 1. It is followed by a sub-node: 1.446 + // - If the length is at most kMaxBranchLinearSubNodeLength, then 1.447 + // there are length-1 (key, value) pairs and then one more comparison byte. 1.448 + // If one of the key bytes matches, then the value is either a final value for 1.449 + // the string/byte sequence so far, or a "jump" delta to the next node. 1.450 + // If the last byte matches, then matching continues with the next node. 1.451 + // (Values have the same encoding as value nodes.) 1.452 + // - If the length is greater than kMaxBranchLinearSubNodeLength, then 1.453 + // there is one byte and one "jump" delta. 1.454 + // If the input byte is less than the sub-node byte, then "jump" by delta to 1.455 + // the next sub-node which will have a length of length/2. 1.456 + // (The delta has its own compact encoding.) 1.457 + // Otherwise, skip the "jump" delta to the next sub-node 1.458 + // which will have a length of length-length/2. 1.459 + 1.460 + // Node lead byte values. 1.461 + 1.462 + // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 1.463 + // the length is one more than the next byte. 1.464 + 1.465 + // For a branch sub-node with at most this many entries, we drop down 1.466 + // to a linear search. 1.467 + static const int32_t kMaxBranchLinearSubNodeLength=5; 1.468 + 1.469 + // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 1.470 + static const int32_t kMinLinearMatch=0x10; 1.471 + static const int32_t kMaxLinearMatchLength=0x10; 1.472 + 1.473 + // 20..ff: Variable-length value node. 1.474 + // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 1.475 + // Then shift-right by 1 bit. 1.476 + // The remaining lead byte value indicates the number of following bytes (0..4) 1.477 + // and contains the value's top bits. 1.478 + static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 1.479 + // It is a final value if bit 0 is set. 1.480 + static const int32_t kValueIsFinal=1; 1.481 + 1.482 + // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 1.483 + static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 1.484 + static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 1.485 + 1.486 + static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 1.487 + static const int32_t kMaxTwoByteValue=0x1aff; 1.488 + 1.489 + static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 1.490 + static const int32_t kFourByteValueLead=0x7e; 1.491 + 1.492 + // A little more than Unicode code points. (0x11ffff) 1.493 + static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 1.494 + 1.495 + static const int32_t kFiveByteValueLead=0x7f; 1.496 + 1.497 + // Compact delta integers. 1.498 + static const int32_t kMaxOneByteDelta=0xbf; 1.499 + static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 1.500 + static const int32_t kMinThreeByteDeltaLead=0xf0; 1.501 + static const int32_t kFourByteDeltaLead=0xfe; 1.502 + static const int32_t kFiveByteDeltaLead=0xff; 1.503 + 1.504 + static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 1.505 + static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 1.506 + 1.507 + uint8_t *ownedArray_; 1.508 + 1.509 + // Fixed value referencing the BytesTrie bytes. 1.510 + const uint8_t *bytes_; 1.511 + 1.512 + // Iterator variables. 1.513 + 1.514 + // Pointer to next trie byte to read. NULL if no more matches. 1.515 + const uint8_t *pos_; 1.516 + // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1.517 + int32_t remainingMatchLength_; 1.518 +}; 1.519 + 1.520 +U_NAMESPACE_END 1.521 + 1.522 +#endif // __BYTESTRIE_H__