intl/icu/source/common/unicode/bytestrie.h

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2010-2012, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  bytestrie.h
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2010sep25
    12 *   created by: Markus W. Scherer
    13 */
    15 #ifndef __BYTESTRIE_H__
    16 #define __BYTESTRIE_H__
    18 /**
    19  * \file
    20  * \brief C++ API: Trie for mapping byte sequences to integer values.
    21  */
    23 #include "unicode/utypes.h"
    24 #include "unicode/stringpiece.h"
    25 #include "unicode/uobject.h"
    26 #include "unicode/ustringtrie.h"
    28 U_NAMESPACE_BEGIN
    30 class ByteSink;
    31 class BytesTrieBuilder;
    32 class CharString;
    33 class UVector32;
    35 /**
    36  * Light-weight, non-const reader class for a BytesTrie.
    37  * Traverses a byte-serialized data structure with minimal state,
    38  * for mapping byte sequences to non-negative integer values.
    39  *
    40  * This class owns the serialized trie data only if it was constructed by
    41  * the builder's build() method.
    42  * The public constructor and the copy constructor only alias the data (only copy the pointer).
    43  * There is no assignment operator.
    44  *
    45  * This class is not intended for public subclassing.
    46  * @stable ICU 4.8
    47  */
    48 class U_COMMON_API BytesTrie : public UMemory {
    49 public:
    50     /**
    51      * Constructs a BytesTrie reader instance.
    52      *
    53      * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder,
    54      * starting with the first byte of that sequence.
    55      * The BytesTrie object will not read more bytes than
    56      * the BytesTrieBuilder generated in the corresponding build() call.
    57      *
    58      * The array is not copied/cloned and must not be modified while
    59      * the BytesTrie object is in use.
    60      *
    61      * @param trieBytes The byte array that contains the serialized trie.
    62      * @stable ICU 4.8
    63      */
    64     BytesTrie(const void *trieBytes)
    65             : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
    66               pos_(bytes_), remainingMatchLength_(-1) {}
    68     /**
    69      * Destructor.
    70      * @stable ICU 4.8
    71      */
    72     ~BytesTrie();
    74     /**
    75      * Copy constructor, copies the other trie reader object and its state,
    76      * but not the byte array which will be shared. (Shallow copy.)
    77      * @param other Another BytesTrie object.
    78      * @stable ICU 4.8
    79      */
    80     BytesTrie(const BytesTrie &other)
    81             : ownedArray_(NULL), bytes_(other.bytes_),
    82               pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
    84     /**
    85      * Resets this trie to its initial state.
    86      * @return *this
    87      * @stable ICU 4.8
    88      */
    89     BytesTrie &reset() {
    90         pos_=bytes_;
    91         remainingMatchLength_=-1;
    92         return *this;
    93     }
    95     /**
    96      * BytesTrie state object, for saving a trie's current state
    97      * and resetting the trie back to this state later.
    98      * @stable ICU 4.8
    99      */
   100     class State : public UMemory {
   101     public:
   102         /**
   103          * Constructs an empty State.
   104          * @stable ICU 4.8
   105          */
   106         State() { bytes=NULL; }
   107     private:
   108         friend class BytesTrie;
   110         const uint8_t *bytes;
   111         const uint8_t *pos;
   112         int32_t remainingMatchLength;
   113     };
   115     /**
   116      * Saves the state of this trie.
   117      * @param state The State object to hold the trie's state.
   118      * @return *this
   119      * @see resetToState
   120      * @stable ICU 4.8
   121      */
   122     const BytesTrie &saveState(State &state) const {
   123         state.bytes=bytes_;
   124         state.pos=pos_;
   125         state.remainingMatchLength=remainingMatchLength_;
   126         return *this;
   127     }
   129     /**
   130      * Resets this trie to the saved state.
   131      * If the state object contains no state, or the state of a different trie,
   132      * then this trie remains unchanged.
   133      * @param state The State object which holds a saved trie state.
   134      * @return *this
   135      * @see saveState
   136      * @see reset
   137      * @stable ICU 4.8
   138      */
   139     BytesTrie &resetToState(const State &state) {
   140         if(bytes_==state.bytes && bytes_!=NULL) {
   141             pos_=state.pos;
   142             remainingMatchLength_=state.remainingMatchLength;
   143         }
   144         return *this;
   145     }
   147     /**
   148      * Determines whether the byte sequence so far matches, whether it has a value,
   149      * and whether another input byte can continue a matching byte sequence.
   150      * @return The match/value Result.
   151      * @stable ICU 4.8
   152      */
   153     UStringTrieResult current() const;
   155     /**
   156      * Traverses the trie from the initial state for this input byte.
   157      * Equivalent to reset().next(inByte).
   158      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
   159      *               Values below -0x100 and above 0xff will never match.
   160      * @return The match/value Result.
   161      * @stable ICU 4.8
   162      */
   163     inline UStringTrieResult first(int32_t inByte) {
   164         remainingMatchLength_=-1;
   165         if(inByte<0) {
   166             inByte+=0x100;
   167         }
   168         return nextImpl(bytes_, inByte);
   169     }
   171     /**
   172      * Traverses the trie from the current state for this input byte.
   173      * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff.
   174      *               Values below -0x100 and above 0xff will never match.
   175      * @return The match/value Result.
   176      * @stable ICU 4.8
   177      */
   178     UStringTrieResult next(int32_t inByte);
   180     /**
   181      * Traverses the trie from the current state for this byte sequence.
   182      * Equivalent to
   183      * \code
   184      * Result result=current();
   185      * for(each c in s)
   186      *   if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
   187      *   result=next(c);
   188      * return result;
   189      * \endcode
   190      * @param s A string or byte sequence. Can be NULL if length is 0.
   191      * @param length The length of the byte sequence. Can be -1 if NUL-terminated.
   192      * @return The match/value Result.
   193      * @stable ICU 4.8
   194      */
   195     UStringTrieResult next(const char *s, int32_t length);
   197     /**
   198      * Returns a matching byte sequence's value if called immediately after
   199      * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
   200      * getValue() can be called multiple times.
   201      *
   202      * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
   203      * @return The value for the byte sequence so far.
   204      * @stable ICU 4.8
   205      */
   206     inline int32_t getValue() const {
   207         const uint8_t *pos=pos_;
   208         int32_t leadByte=*pos++;
   209         // U_ASSERT(leadByte>=kMinValueLead);
   210         return readValue(pos, leadByte>>1);
   211     }
   213     /**
   214      * Determines whether all byte sequences reachable from the current state
   215      * map to the same value.
   216      * @param uniqueValue Receives the unique value, if this function returns TRUE.
   217      *                    (output-only)
   218      * @return TRUE if all byte sequences reachable from the current state
   219      *         map to the same value.
   220      * @stable ICU 4.8
   221      */
   222     inline UBool hasUniqueValue(int32_t &uniqueValue) const {
   223         const uint8_t *pos=pos_;
   224         // Skip the rest of a pending linear-match node.
   225         return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
   226     }
   228     /**
   229      * Finds each byte which continues the byte sequence from the current state.
   230      * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now.
   231      * @param out Each next byte is appended to this object.
   232      *            (Only uses the out.Append(s, length) method.)
   233      * @return the number of bytes which continue the byte sequence from here
   234      * @stable ICU 4.8
   235      */
   236     int32_t getNextBytes(ByteSink &out) const;
   238     /**
   239      * Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
   240      * @stable ICU 4.8
   241      */
   242     class U_COMMON_API Iterator : public UMemory {
   243     public:
   244         /**
   245          * Iterates from the root of a byte-serialized BytesTrie.
   246          * @param trieBytes The trie bytes.
   247          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
   248          *                        Otherwise, the iterator returns strings with this maximum length.
   249          * @param errorCode Standard ICU error code. Its input value must
   250          *                  pass the U_SUCCESS() test, or else the function returns
   251          *                  immediately. Check for U_FAILURE() on output or use with
   252          *                  function chaining. (See User Guide for details.)
   253          * @stable ICU 4.8
   254          */
   255         Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
   257         /**
   258          * Iterates from the current state of the specified BytesTrie.
   259          * @param trie The trie whose state will be copied for iteration.
   260          * @param maxStringLength If 0, the iterator returns full strings/byte sequences.
   261          *                        Otherwise, the iterator returns strings with this maximum length.
   262          * @param errorCode Standard ICU error code. Its input value must
   263          *                  pass the U_SUCCESS() test, or else the function returns
   264          *                  immediately. Check for U_FAILURE() on output or use with
   265          *                  function chaining. (See User Guide for details.)
   266          * @stable ICU 4.8
   267          */
   268         Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
   270         /**
   271          * Destructor.
   272          * @stable ICU 4.8
   273          */
   274         ~Iterator();
   276         /**
   277          * Resets this iterator to its initial state.
   278          * @return *this
   279          * @stable ICU 4.8
   280          */
   281         Iterator &reset();
   283         /**
   284          * @return TRUE if there are more elements.
   285          * @stable ICU 4.8
   286          */
   287         UBool hasNext() const;
   289         /**
   290          * Finds the next (byte sequence, value) pair if there is one.
   291          *
   292          * If the byte sequence is truncated to the maximum length and does not
   293          * have a real value, then the value is set to -1.
   294          * In this case, this "not a real value" is indistinguishable from
   295          * a real value of -1.
   296          * @param errorCode Standard ICU error code. Its input value must
   297          *                  pass the U_SUCCESS() test, or else the function returns
   298          *                  immediately. Check for U_FAILURE() on output or use with
   299          *                  function chaining. (See User Guide for details.)
   300          * @return TRUE if there is another element.
   301          * @stable ICU 4.8
   302          */
   303         UBool next(UErrorCode &errorCode);
   305         /**
   306          * @return The NUL-terminated byte sequence for the last successful next().
   307          * @stable ICU 4.8
   308          */
   309         const StringPiece &getString() const { return sp_; }
   310         /**
   311          * @return The value for the last successful next().
   312          * @stable ICU 4.8
   313          */
   314         int32_t getValue() const { return value_; }
   316     private:
   317         UBool truncateAndStop();
   319         const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
   321         const uint8_t *bytes_;
   322         const uint8_t *pos_;
   323         const uint8_t *initialPos_;
   324         int32_t remainingMatchLength_;
   325         int32_t initialRemainingMatchLength_;
   327         CharString *str_;
   328         StringPiece sp_;
   329         int32_t maxLength_;
   330         int32_t value_;
   332         // The stack stores pairs of integers for backtracking to another
   333         // outbound edge of a branch node.
   334         // The first integer is an offset from bytes_.
   335         // The second integer has the str_->length() from before the node in bits 15..0,
   336         // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
   337         // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
   338         // but the code looks more confusing that way.)
   339         UVector32 *stack_;
   340     };
   342 private:
   343     friend class BytesTrieBuilder;
   345     /**
   346      * Constructs a BytesTrie reader instance.
   347      * Unlike the public constructor which just aliases an array,
   348      * this constructor adopts the builder's array.
   349      * This constructor is only called by the builder.
   350      */
   351     BytesTrie(void *adoptBytes, const void *trieBytes)
   352             : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
   353               bytes_(static_cast<const uint8_t *>(trieBytes)),
   354               pos_(bytes_), remainingMatchLength_(-1) {}
   356     // No assignment operator.
   357     BytesTrie &operator=(const BytesTrie &other);
   359     inline void stop() {
   360         pos_=NULL;
   361     }
   363     // Reads a compact 32-bit integer.
   364     // pos is already after the leadByte, and the lead byte is already shifted right by 1.
   365     static int32_t readValue(const uint8_t *pos, int32_t leadByte);
   366     static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
   367         // U_ASSERT(leadByte>=kMinValueLead);
   368         if(leadByte>=(kMinTwoByteValueLead<<1)) {
   369             if(leadByte<(kMinThreeByteValueLead<<1)) {
   370                 ++pos;
   371             } else if(leadByte<(kFourByteValueLead<<1)) {
   372                 pos+=2;
   373             } else {
   374                 pos+=3+((leadByte>>1)&1);
   375             }
   376         }
   377         return pos;
   378     }
   379     static inline const uint8_t *skipValue(const uint8_t *pos) {
   380         int32_t leadByte=*pos++;
   381         return skipValue(pos, leadByte);
   382     }
   384     // Reads a jump delta and jumps.
   385     static const uint8_t *jumpByDelta(const uint8_t *pos);
   387     static inline const uint8_t *skipDelta(const uint8_t *pos) {
   388         int32_t delta=*pos++;
   389         if(delta>=kMinTwoByteDeltaLead) {
   390             if(delta<kMinThreeByteDeltaLead) {
   391                 ++pos;
   392             } else if(delta<kFourByteDeltaLead) {
   393                 pos+=2;
   394             } else {
   395                 pos+=3+(delta&1);
   396             }
   397         }
   398         return pos;
   399     }
   401     static inline UStringTrieResult valueResult(int32_t node) {
   402         return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
   403     }
   405     // Handles a branch node for both next(byte) and next(string).
   406     UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
   408     // Requires remainingLength_<0.
   409     UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
   411     // Helper functions for hasUniqueValue().
   412     // Recursively finds a unique value (or whether there is not a unique one)
   413     // from a branch.
   414     static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
   415                                                     UBool haveUniqueValue, int32_t &uniqueValue);
   416     // Recursively finds a unique value (or whether there is not a unique one)
   417     // starting from a position on a node lead byte.
   418     static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
   420     // Helper functions for getNextBytes().
   421     // getNextBytes() when pos is on a branch node.
   422     static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
   423     static void append(ByteSink &out, int c);
   425     // BytesTrie data structure
   426     //
   427     // The trie consists of a series of byte-serialized nodes for incremental
   428     // string/byte sequence matching. The root node is at the beginning of the trie data.
   429     //
   430     // Types of nodes are distinguished by their node lead byte ranges.
   431     // After each node, except a final-value node, another node follows to
   432     // encode match values or continue matching further bytes.
   433     //
   434     // Node types:
   435     //  - Value node: Stores a 32-bit integer in a compact, variable-length format.
   436     //    The value is for the string/byte sequence so far.
   437     //    One node bit indicates whether the value is final or whether
   438     //    matching continues with the next node.
   439     //  - Linear-match node: Matches a number of bytes.
   440     //  - Branch node: Branches to other nodes according to the current input byte.
   441     //    The node byte is the length of the branch (number of bytes to select from)
   442     //    minus 1. It is followed by a sub-node:
   443     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
   444     //      there are length-1 (key, value) pairs and then one more comparison byte.
   445     //      If one of the key bytes matches, then the value is either a final value for
   446     //      the string/byte sequence so far, or a "jump" delta to the next node.
   447     //      If the last byte matches, then matching continues with the next node.
   448     //      (Values have the same encoding as value nodes.)
   449     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
   450     //      there is one byte and one "jump" delta.
   451     //      If the input byte is less than the sub-node byte, then "jump" by delta to
   452     //      the next sub-node which will have a length of length/2.
   453     //      (The delta has its own compact encoding.)
   454     //      Otherwise, skip the "jump" delta to the next sub-node
   455     //      which will have a length of length-length/2.
   457     // Node lead byte values.
   459     // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
   460     // the length is one more than the next byte.
   462     // For a branch sub-node with at most this many entries, we drop down
   463     // to a linear search.
   464     static const int32_t kMaxBranchLinearSubNodeLength=5;
   466     // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
   467     static const int32_t kMinLinearMatch=0x10;
   468     static const int32_t kMaxLinearMatchLength=0x10;
   470     // 20..ff: Variable-length value node.
   471     // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
   472     // Then shift-right by 1 bit.
   473     // The remaining lead byte value indicates the number of following bytes (0..4)
   474     // and contains the value's top bits.
   475     static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x20
   476     // It is a final value if bit 0 is set.
   477     static const int32_t kValueIsFinal=1;
   479     // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
   480     static const int32_t kMinOneByteValueLead=kMinValueLead/2;  // 0x10
   481     static const int32_t kMaxOneByteValue=0x40;  // At least 6 bits in the first byte.
   483     static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1;  // 0x51
   484     static const int32_t kMaxTwoByteValue=0x1aff;
   486     static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1;  // 0x6c
   487     static const int32_t kFourByteValueLead=0x7e;
   489     // A little more than Unicode code points. (0x11ffff)
   490     static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
   492     static const int32_t kFiveByteValueLead=0x7f;
   494     // Compact delta integers.
   495     static const int32_t kMaxOneByteDelta=0xbf;
   496     static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1;  // 0xc0
   497     static const int32_t kMinThreeByteDeltaLead=0xf0;
   498     static const int32_t kFourByteDeltaLead=0xfe;
   499     static const int32_t kFiveByteDeltaLead=0xff;
   501     static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1;  // 0x2fff
   502     static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1;  // 0xdffff
   504     uint8_t *ownedArray_;
   506     // Fixed value referencing the BytesTrie bytes.
   507     const uint8_t *bytes_;
   509     // Iterator variables.
   511     // Pointer to next trie byte to read. NULL if no more matches.
   512     const uint8_t *pos_;
   513     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
   514     int32_t remainingMatchLength_;
   515 };
   517 U_NAMESPACE_END
   519 #endif  // __BYTESTRIE_H__

mercurial