intl/icu/source/common/unicode/stringtriebuilder.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 *******************************************************************************
     3 *   Copyright (C) 2010-2012, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 *******************************************************************************
     6 *   file name:  stringtriebuilder.h
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2010dec24
    12 *   created by: Markus W. Scherer
    13 */
    15 #ifndef __STRINGTRIEBUILDER_H__
    16 #define __STRINGTRIEBUILDER_H__
    18 #include "unicode/utypes.h"
    19 #include "unicode/uobject.h"
    21 /**
    22  * \file
    23  * \brief C++ API: Builder API for trie builders
    24  */
    26 // Forward declaration.
    27 struct UHashtable;
    28 typedef struct UHashtable UHashtable;
    30 /**
    31  * Build options for BytesTrieBuilder and CharsTrieBuilder.
    32  * @stable ICU 4.8
    33  */
    34 enum UStringTrieBuildOption {
    35     /**
    36      * Builds a trie quickly.
    37      * @stable ICU 4.8
    38      */
    39     USTRINGTRIE_BUILD_FAST,
    40     /**
    41      * Builds a trie more slowly, attempting to generate
    42      * a shorter but equivalent serialization.
    43      * This build option also uses more memory.
    44      *
    45      * This option can be effective when many integer values are the same
    46      * and string/byte sequence suffixes can be shared.
    47      * Runtime speed is not expected to improve.
    48      * @stable ICU 4.8
    49      */
    50     USTRINGTRIE_BUILD_SMALL
    51 };
    53 U_NAMESPACE_BEGIN
    55 /**
    56  * Base class for string trie builder classes.
    57  *
    58  * This class is not intended for public subclassing.
    59  * @stable ICU 4.8
    60  */
    61 class U_COMMON_API StringTrieBuilder : public UObject {
    62 public:
    63 #ifndef U_HIDE_INTERNAL_API
    64     /** @internal */
    65     static UBool hashNode(const void *node);
    66     /** @internal */
    67     static UBool equalNodes(const void *left, const void *right);
    68 #endif  /* U_HIDE_INTERNAL_API */
    70 protected:
    71     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
    72     // or else the compiler will create a public default constructor.
    73     /** @internal */
    74     StringTrieBuilder();
    75     /** @internal */
    76     virtual ~StringTrieBuilder();
    78 #ifndef U_HIDE_INTERNAL_API
    79     /** @internal */
    80     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
    81     /** @internal */
    82     void deleteCompactBuilder();
    84     /** @internal */
    85     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
    87     /** @internal */
    88     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
    89     /** @internal */
    90     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
    91 #endif  /* U_HIDE_INTERNAL_API */
    93     class Node;
    95 #ifndef U_HIDE_INTERNAL_API
    96     /** @internal */
    97     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
    98     /** @internal */
    99     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
   100                             int32_t length, UErrorCode &errorCode);
   101 #endif  /* U_HIDE_INTERNAL_API */
   103     /** @internal */
   104     virtual int32_t getElementStringLength(int32_t i) const = 0;
   105     /** @internal */
   106     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
   107     /** @internal */
   108     virtual int32_t getElementValue(int32_t i) const = 0;
   110     // Finds the first unit index after this one where
   111     // the first and last element have different units again.
   112     /** @internal */
   113     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
   115     // Number of different units at unitIndex.
   116     /** @internal */
   117     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
   118     /** @internal */
   119     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
   120     /** @internal */
   121     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
   123     /** @internal */
   124     virtual UBool matchNodesCanHaveValues() const = 0;
   126     /** @internal */
   127     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
   128     /** @internal */
   129     virtual int32_t getMinLinearMatch() const = 0;
   130     /** @internal */
   131     virtual int32_t getMaxLinearMatchLength() const = 0;
   133 #ifndef U_HIDE_INTERNAL_API
   134     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
   135     /** @internal */
   136     static const int32_t kMaxBranchLinearSubNodeLength=5;
   138     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
   139     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
   140     /** @internal */
   141     static const int32_t kMaxSplitBranchLevels=14;
   143     /**
   144      * Makes sure that there is only one unique node registered that is
   145      * equivalent to newNode.
   146      * @param newNode Input node. The builder takes ownership.
   147      * @param errorCode ICU in/out UErrorCode.
   148                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
   149      * @return newNode if it is the first of its kind, or
   150      *         an equivalent node if newNode is a duplicate.
   151      * @internal
   152      */
   153     Node *registerNode(Node *newNode, UErrorCode &errorCode);
   154     /**
   155      * Makes sure that there is only one unique FinalValueNode registered
   156      * with this value.
   157      * Avoids creating a node if the value is a duplicate.
   158      * @param value A final value.
   159      * @param errorCode ICU in/out UErrorCode.
   160                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
   161      * @return A FinalValueNode with the given value.
   162      * @internal
   163      */
   164     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
   166     /*
   167      * C++ note:
   168      * registerNode() and registerFinalValue() take ownership of their input nodes,
   169      * and only return owned nodes.
   170      * If they see a failure UErrorCode, they will delete the input node.
   171      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
   172      * If there is a failure, they return NULL.
   173      *
   174      * NULL Node pointers can be safely passed into other Nodes because
   175      * they call the static Node::hashCode() which checks for a NULL pointer first.
   176      *
   177      * Therefore, as long as builder functions register a new node,
   178      * they need to check for failures only before explicitly dereferencing
   179      * a Node pointer, or before setting a new UErrorCode.
   180      */
   182     // Hash set of nodes, maps from nodes to integer 1.
   183     /** @internal */
   184     UHashtable *nodes;
   186     /** @internal */
   187     class Node : public UObject {
   188     public:
   189         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
   190         inline int32_t hashCode() const { return hash; }
   191         // Handles node==NULL.
   192         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
   193         // Base class operator==() compares the actual class types.
   194         virtual UBool operator==(const Node &other) const;
   195         inline UBool operator!=(const Node &other) const { return !operator==(other); }
   196         /**
   197          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
   198          * This is to avoid writing a duplicate node twice.
   199          *
   200          * Branch nodes in this trie data structure are not symmetric.
   201          * Most branch edges "jump" to other nodes but the rightmost branch edges
   202          * just continue without a jump.
   203          * Therefore, write() must write the rightmost branch edge last
   204          * (trie units are written backwards), and must write it at that point even if
   205          * it is a duplicate of a node previously written elsewhere.
   206          *
   207          * This function visits and marks right branch edges first.
   208          * Edges are numbered with increasingly negative values because we share the
   209          * offset field which gets positive values when nodes are written.
   210          * A branch edge also remembers the first number for any of its edges.
   211          *
   212          * When a further-left branch edge has a number in the range of the rightmost
   213          * edge's numbers, then it will be written as part of the required right edge
   214          * and we can avoid writing it first.
   215          *
   216          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
   217          * edge numbers.
   218          *
   219          * @param edgeNumber The first edge number for this node and its sub-nodes.
   220          * @return An edge number that is at least the maximum-negative
   221          *         of the input edge number and the numbers of this node and all of its sub-nodes.
   222          */
   223         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   224         // write() must set the offset to a positive value.
   225         virtual void write(StringTrieBuilder &builder) = 0;
   226         // See markRightEdgesFirst.
   227         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
   228                                                StringTrieBuilder &builder) {
   229             // Note: Edge numbers are negative, lastRight<=firstRight.
   230             // If offset>0 then this node and its sub-nodes have been written already
   231             // and we need not write them again.
   232             // If this node is part of the unwritten right branch edge,
   233             // then we wait until that is written.
   234             if(offset<0 && (offset<lastRight || firstRight<offset)) {
   235                 write(builder);
   236             }
   237         }
   238         inline int32_t getOffset() const { return offset; }
   239     protected:
   240         int32_t hash;
   241         int32_t offset;
   242     };
   244     // This class should not be overridden because
   245     // registerFinalValue() compares a stack-allocated FinalValueNode
   246     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
   247     // with the input node, and the
   248     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
   249     // will be false if the typeid's are different.
   250     /** @internal */
   251     class FinalValueNode : public Node {
   252     public:
   253         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
   254         virtual UBool operator==(const Node &other) const;
   255         virtual void write(StringTrieBuilder &builder);
   256     protected:
   257         int32_t value;
   258     };
   260     /**
   261      * @internal 
   262      */
   263     class ValueNode : public Node {
   264     public:
   265         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
   266         virtual UBool operator==(const Node &other) const;
   267         void setValue(int32_t v) {
   268             hasValue=TRUE;
   269             value=v;
   270             hash=hash*37+v;
   271         }
   272     protected:
   273         UBool hasValue;
   274         int32_t value;
   275     };
   277     /** 
   278      * @internal 
   279      */
   280     class IntermediateValueNode : public ValueNode {
   281     public:
   282         IntermediateValueNode(int32_t v, Node *nextNode)
   283                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
   284         virtual UBool operator==(const Node &other) const;
   285         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   286         virtual void write(StringTrieBuilder &builder);
   287     protected:
   288         Node *next;
   289     };
   291     /**
   292      * @internal 
   293      */
   294     class LinearMatchNode : public ValueNode {
   295     public:
   296         LinearMatchNode(int32_t len, Node *nextNode)
   297                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
   298                   length(len), next(nextNode) {}
   299         virtual UBool operator==(const Node &other) const;
   300         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   301     protected:
   302         int32_t length;
   303         Node *next;
   304     };
   306     /**
   307      * @internal 
   308      */
   309     class BranchNode : public Node {
   310     public:
   311         BranchNode(int32_t initialHash) : Node(initialHash) {}
   312     protected:
   313         int32_t firstEdgeNumber;
   314     };
   316     /**
   317      * @internal 
   318      */
   319     class ListBranchNode : public BranchNode {
   320     public:
   321         ListBranchNode() : BranchNode(0x444444), length(0) {}
   322         virtual UBool operator==(const Node &other) const;
   323         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   324         virtual void write(StringTrieBuilder &builder);
   325         // Adds a unit with a final value.
   326         void add(int32_t c, int32_t value) {
   327             units[length]=(UChar)c;
   328             equal[length]=NULL;
   329             values[length]=value;
   330             ++length;
   331             hash=(hash*37+c)*37+value;
   332         }
   333         // Adds a unit which leads to another match node.
   334         void add(int32_t c, Node *node) {
   335             units[length]=(UChar)c;
   336             equal[length]=node;
   337             values[length]=0;
   338             ++length;
   339             hash=(hash*37+c)*37+hashCode(node);
   340         }
   341     protected:
   342         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
   343         int32_t length;
   344         int32_t values[kMaxBranchLinearSubNodeLength];
   345         UChar units[kMaxBranchLinearSubNodeLength];
   346     };
   348     /**
   349      * @internal 
   350      */
   351     class SplitBranchNode : public BranchNode {
   352     public:
   353         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
   354                 : BranchNode(((0x555555*37+middleUnit)*37+
   355                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
   356                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
   357         virtual UBool operator==(const Node &other) const;
   358         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   359         virtual void write(StringTrieBuilder &builder);
   360     protected:
   361         UChar unit;
   362         Node *lessThan;
   363         Node *greaterOrEqual;
   364     };
   366     // Branch head node, for writing the actual node lead unit.
   367     /** @internal */
   368     class BranchHeadNode : public ValueNode {
   369     public:
   370         BranchHeadNode(int32_t len, Node *subNode)
   371                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
   372                   length(len), next(subNode) {}
   373         virtual UBool operator==(const Node &other) const;
   374         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   375         virtual void write(StringTrieBuilder &builder);
   376     protected:
   377         int32_t length;
   378         Node *next;  // A branch sub-node.
   379     };
   380 #endif  /* U_HIDE_INTERNAL_API */
   382     /** @internal */
   383     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
   384                                         Node *nextNode) const = 0;
   386     /** @internal */
   387     virtual int32_t write(int32_t unit) = 0;
   388     /** @internal */
   389     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
   390     /** @internal */
   391     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
   392     /** @internal */
   393     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
   394     /** @internal */
   395     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
   396 };
   398 U_NAMESPACE_END
   400 #endif  // __STRINGTRIEBUILDER_H__

mercurial