intl/icu/source/common/unicode/stringtriebuilder.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/unicode/stringtriebuilder.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,400 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*   Copyright (C) 2010-2012, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +*******************************************************************************
     1.9 +*   file name:  stringtriebuilder.h
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2010dec24
    1.15 +*   created by: Markus W. Scherer
    1.16 +*/
    1.17 +
    1.18 +#ifndef __STRINGTRIEBUILDER_H__
    1.19 +#define __STRINGTRIEBUILDER_H__
    1.20 +
    1.21 +#include "unicode/utypes.h"
    1.22 +#include "unicode/uobject.h"
    1.23 +
    1.24 +/**
    1.25 + * \file
    1.26 + * \brief C++ API: Builder API for trie builders
    1.27 + */
    1.28 +
    1.29 +// Forward declaration.
    1.30 +struct UHashtable;
    1.31 +typedef struct UHashtable UHashtable;
    1.32 +
    1.33 +/**
    1.34 + * Build options for BytesTrieBuilder and CharsTrieBuilder.
    1.35 + * @stable ICU 4.8
    1.36 + */
    1.37 +enum UStringTrieBuildOption {
    1.38 +    /**
    1.39 +     * Builds a trie quickly.
    1.40 +     * @stable ICU 4.8
    1.41 +     */
    1.42 +    USTRINGTRIE_BUILD_FAST,
    1.43 +    /**
    1.44 +     * Builds a trie more slowly, attempting to generate
    1.45 +     * a shorter but equivalent serialization.
    1.46 +     * This build option also uses more memory.
    1.47 +     *
    1.48 +     * This option can be effective when many integer values are the same
    1.49 +     * and string/byte sequence suffixes can be shared.
    1.50 +     * Runtime speed is not expected to improve.
    1.51 +     * @stable ICU 4.8
    1.52 +     */
    1.53 +    USTRINGTRIE_BUILD_SMALL
    1.54 +};
    1.55 +
    1.56 +U_NAMESPACE_BEGIN
    1.57 +
    1.58 +/**
    1.59 + * Base class for string trie builder classes.
    1.60 + *
    1.61 + * This class is not intended for public subclassing.
    1.62 + * @stable ICU 4.8
    1.63 + */
    1.64 +class U_COMMON_API StringTrieBuilder : public UObject {
    1.65 +public:
    1.66 +#ifndef U_HIDE_INTERNAL_API
    1.67 +    /** @internal */
    1.68 +    static UBool hashNode(const void *node);
    1.69 +    /** @internal */
    1.70 +    static UBool equalNodes(const void *left, const void *right);
    1.71 +#endif  /* U_HIDE_INTERNAL_API */
    1.72 +
    1.73 +protected:
    1.74 +    // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
    1.75 +    // or else the compiler will create a public default constructor.
    1.76 +    /** @internal */
    1.77 +    StringTrieBuilder();
    1.78 +    /** @internal */
    1.79 +    virtual ~StringTrieBuilder();
    1.80 +
    1.81 +#ifndef U_HIDE_INTERNAL_API
    1.82 +    /** @internal */
    1.83 +    void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
    1.84 +    /** @internal */
    1.85 +    void deleteCompactBuilder();
    1.86 +
    1.87 +    /** @internal */
    1.88 +    void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
    1.89 +
    1.90 +    /** @internal */
    1.91 +    int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
    1.92 +    /** @internal */
    1.93 +    int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
    1.94 +#endif  /* U_HIDE_INTERNAL_API */
    1.95 +
    1.96 +    class Node;
    1.97 +
    1.98 +#ifndef U_HIDE_INTERNAL_API
    1.99 +    /** @internal */
   1.100 +    Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
   1.101 +    /** @internal */
   1.102 +    Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
   1.103 +                            int32_t length, UErrorCode &errorCode);
   1.104 +#endif  /* U_HIDE_INTERNAL_API */
   1.105 +
   1.106 +    /** @internal */
   1.107 +    virtual int32_t getElementStringLength(int32_t i) const = 0;
   1.108 +    /** @internal */
   1.109 +    virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
   1.110 +    /** @internal */
   1.111 +    virtual int32_t getElementValue(int32_t i) const = 0;
   1.112 +
   1.113 +    // Finds the first unit index after this one where
   1.114 +    // the first and last element have different units again.
   1.115 +    /** @internal */
   1.116 +    virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
   1.117 +
   1.118 +    // Number of different units at unitIndex.
   1.119 +    /** @internal */
   1.120 +    virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
   1.121 +    /** @internal */
   1.122 +    virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
   1.123 +    /** @internal */
   1.124 +    virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
   1.125 +
   1.126 +    /** @internal */
   1.127 +    virtual UBool matchNodesCanHaveValues() const = 0;
   1.128 +
   1.129 +    /** @internal */
   1.130 +    virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
   1.131 +    /** @internal */
   1.132 +    virtual int32_t getMinLinearMatch() const = 0;
   1.133 +    /** @internal */
   1.134 +    virtual int32_t getMaxLinearMatchLength() const = 0;
   1.135 +
   1.136 +#ifndef U_HIDE_INTERNAL_API
   1.137 +    // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
   1.138 +    /** @internal */
   1.139 +    static const int32_t kMaxBranchLinearSubNodeLength=5;
   1.140 +
   1.141 +    // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
   1.142 +    // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
   1.143 +    /** @internal */
   1.144 +    static const int32_t kMaxSplitBranchLevels=14;
   1.145 +
   1.146 +    /**
   1.147 +     * Makes sure that there is only one unique node registered that is
   1.148 +     * equivalent to newNode.
   1.149 +     * @param newNode Input node. The builder takes ownership.
   1.150 +     * @param errorCode ICU in/out UErrorCode.
   1.151 +                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
   1.152 +     * @return newNode if it is the first of its kind, or
   1.153 +     *         an equivalent node if newNode is a duplicate.
   1.154 +     * @internal
   1.155 +     */
   1.156 +    Node *registerNode(Node *newNode, UErrorCode &errorCode);
   1.157 +    /**
   1.158 +     * Makes sure that there is only one unique FinalValueNode registered
   1.159 +     * with this value.
   1.160 +     * Avoids creating a node if the value is a duplicate.
   1.161 +     * @param value A final value.
   1.162 +     * @param errorCode ICU in/out UErrorCode.
   1.163 +                        Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
   1.164 +     * @return A FinalValueNode with the given value.
   1.165 +     * @internal
   1.166 +     */
   1.167 +    Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
   1.168 +
   1.169 +    /*
   1.170 +     * C++ note:
   1.171 +     * registerNode() and registerFinalValue() take ownership of their input nodes,
   1.172 +     * and only return owned nodes.
   1.173 +     * If they see a failure UErrorCode, they will delete the input node.
   1.174 +     * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
   1.175 +     * If there is a failure, they return NULL.
   1.176 +     *
   1.177 +     * NULL Node pointers can be safely passed into other Nodes because
   1.178 +     * they call the static Node::hashCode() which checks for a NULL pointer first.
   1.179 +     *
   1.180 +     * Therefore, as long as builder functions register a new node,
   1.181 +     * they need to check for failures only before explicitly dereferencing
   1.182 +     * a Node pointer, or before setting a new UErrorCode.
   1.183 +     */
   1.184 +
   1.185 +    // Hash set of nodes, maps from nodes to integer 1.
   1.186 +    /** @internal */
   1.187 +    UHashtable *nodes;
   1.188 +
   1.189 +    /** @internal */
   1.190 +    class Node : public UObject {
   1.191 +    public:
   1.192 +        Node(int32_t initialHash) : hash(initialHash), offset(0) {}
   1.193 +        inline int32_t hashCode() const { return hash; }
   1.194 +        // Handles node==NULL.
   1.195 +        static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
   1.196 +        // Base class operator==() compares the actual class types.
   1.197 +        virtual UBool operator==(const Node &other) const;
   1.198 +        inline UBool operator!=(const Node &other) const { return !operator==(other); }
   1.199 +        /**
   1.200 +         * Traverses the Node graph and numbers branch edges, with rightmost edges first.
   1.201 +         * This is to avoid writing a duplicate node twice.
   1.202 +         *
   1.203 +         * Branch nodes in this trie data structure are not symmetric.
   1.204 +         * Most branch edges "jump" to other nodes but the rightmost branch edges
   1.205 +         * just continue without a jump.
   1.206 +         * Therefore, write() must write the rightmost branch edge last
   1.207 +         * (trie units are written backwards), and must write it at that point even if
   1.208 +         * it is a duplicate of a node previously written elsewhere.
   1.209 +         *
   1.210 +         * This function visits and marks right branch edges first.
   1.211 +         * Edges are numbered with increasingly negative values because we share the
   1.212 +         * offset field which gets positive values when nodes are written.
   1.213 +         * A branch edge also remembers the first number for any of its edges.
   1.214 +         *
   1.215 +         * When a further-left branch edge has a number in the range of the rightmost
   1.216 +         * edge's numbers, then it will be written as part of the required right edge
   1.217 +         * and we can avoid writing it first.
   1.218 +         *
   1.219 +         * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
   1.220 +         * edge numbers.
   1.221 +         *
   1.222 +         * @param edgeNumber The first edge number for this node and its sub-nodes.
   1.223 +         * @return An edge number that is at least the maximum-negative
   1.224 +         *         of the input edge number and the numbers of this node and all of its sub-nodes.
   1.225 +         */
   1.226 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.227 +        // write() must set the offset to a positive value.
   1.228 +        virtual void write(StringTrieBuilder &builder) = 0;
   1.229 +        // See markRightEdgesFirst.
   1.230 +        inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
   1.231 +                                               StringTrieBuilder &builder) {
   1.232 +            // Note: Edge numbers are negative, lastRight<=firstRight.
   1.233 +            // If offset>0 then this node and its sub-nodes have been written already
   1.234 +            // and we need not write them again.
   1.235 +            // If this node is part of the unwritten right branch edge,
   1.236 +            // then we wait until that is written.
   1.237 +            if(offset<0 && (offset<lastRight || firstRight<offset)) {
   1.238 +                write(builder);
   1.239 +            }
   1.240 +        }
   1.241 +        inline int32_t getOffset() const { return offset; }
   1.242 +    protected:
   1.243 +        int32_t hash;
   1.244 +        int32_t offset;
   1.245 +    };
   1.246 +
   1.247 +    // This class should not be overridden because
   1.248 +    // registerFinalValue() compares a stack-allocated FinalValueNode
   1.249 +    // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
   1.250 +    // with the input node, and the
   1.251 +    // !Node::operator==(other) used inside FinalValueNode::operator==(other)
   1.252 +    // will be false if the typeid's are different.
   1.253 +    /** @internal */
   1.254 +    class FinalValueNode : public Node {
   1.255 +    public:
   1.256 +        FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
   1.257 +        virtual UBool operator==(const Node &other) const;
   1.258 +        virtual void write(StringTrieBuilder &builder);
   1.259 +    protected:
   1.260 +        int32_t value;
   1.261 +    };
   1.262 +
   1.263 +    /**
   1.264 +     * @internal 
   1.265 +     */
   1.266 +    class ValueNode : public Node {
   1.267 +    public:
   1.268 +        ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
   1.269 +        virtual UBool operator==(const Node &other) const;
   1.270 +        void setValue(int32_t v) {
   1.271 +            hasValue=TRUE;
   1.272 +            value=v;
   1.273 +            hash=hash*37+v;
   1.274 +        }
   1.275 +    protected:
   1.276 +        UBool hasValue;
   1.277 +        int32_t value;
   1.278 +    };
   1.279 +
   1.280 +    /** 
   1.281 +     * @internal 
   1.282 +     */
   1.283 +    class IntermediateValueNode : public ValueNode {
   1.284 +    public:
   1.285 +        IntermediateValueNode(int32_t v, Node *nextNode)
   1.286 +                : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
   1.287 +        virtual UBool operator==(const Node &other) const;
   1.288 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.289 +        virtual void write(StringTrieBuilder &builder);
   1.290 +    protected:
   1.291 +        Node *next;
   1.292 +    };
   1.293 +
   1.294 +    /**
   1.295 +     * @internal 
   1.296 +     */
   1.297 +    class LinearMatchNode : public ValueNode {
   1.298 +    public:
   1.299 +        LinearMatchNode(int32_t len, Node *nextNode)
   1.300 +                : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
   1.301 +                  length(len), next(nextNode) {}
   1.302 +        virtual UBool operator==(const Node &other) const;
   1.303 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.304 +    protected:
   1.305 +        int32_t length;
   1.306 +        Node *next;
   1.307 +    };
   1.308 +
   1.309 +    /**
   1.310 +     * @internal 
   1.311 +     */
   1.312 +    class BranchNode : public Node {
   1.313 +    public:
   1.314 +        BranchNode(int32_t initialHash) : Node(initialHash) {}
   1.315 +    protected:
   1.316 +        int32_t firstEdgeNumber;
   1.317 +    };
   1.318 +
   1.319 +    /**
   1.320 +     * @internal 
   1.321 +     */
   1.322 +    class ListBranchNode : public BranchNode {
   1.323 +    public:
   1.324 +        ListBranchNode() : BranchNode(0x444444), length(0) {}
   1.325 +        virtual UBool operator==(const Node &other) const;
   1.326 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.327 +        virtual void write(StringTrieBuilder &builder);
   1.328 +        // Adds a unit with a final value.
   1.329 +        void add(int32_t c, int32_t value) {
   1.330 +            units[length]=(UChar)c;
   1.331 +            equal[length]=NULL;
   1.332 +            values[length]=value;
   1.333 +            ++length;
   1.334 +            hash=(hash*37+c)*37+value;
   1.335 +        }
   1.336 +        // Adds a unit which leads to another match node.
   1.337 +        void add(int32_t c, Node *node) {
   1.338 +            units[length]=(UChar)c;
   1.339 +            equal[length]=node;
   1.340 +            values[length]=0;
   1.341 +            ++length;
   1.342 +            hash=(hash*37+c)*37+hashCode(node);
   1.343 +        }
   1.344 +    protected:
   1.345 +        Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
   1.346 +        int32_t length;
   1.347 +        int32_t values[kMaxBranchLinearSubNodeLength];
   1.348 +        UChar units[kMaxBranchLinearSubNodeLength];
   1.349 +    };
   1.350 +
   1.351 +    /**
   1.352 +     * @internal 
   1.353 +     */
   1.354 +    class SplitBranchNode : public BranchNode {
   1.355 +    public:
   1.356 +        SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
   1.357 +                : BranchNode(((0x555555*37+middleUnit)*37+
   1.358 +                              hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
   1.359 +                  unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
   1.360 +        virtual UBool operator==(const Node &other) const;
   1.361 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.362 +        virtual void write(StringTrieBuilder &builder);
   1.363 +    protected:
   1.364 +        UChar unit;
   1.365 +        Node *lessThan;
   1.366 +        Node *greaterOrEqual;
   1.367 +    };
   1.368 +
   1.369 +    // Branch head node, for writing the actual node lead unit.
   1.370 +    /** @internal */
   1.371 +    class BranchHeadNode : public ValueNode {
   1.372 +    public:
   1.373 +        BranchHeadNode(int32_t len, Node *subNode)
   1.374 +                : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
   1.375 +                  length(len), next(subNode) {}
   1.376 +        virtual UBool operator==(const Node &other) const;
   1.377 +        virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
   1.378 +        virtual void write(StringTrieBuilder &builder);
   1.379 +    protected:
   1.380 +        int32_t length;
   1.381 +        Node *next;  // A branch sub-node.
   1.382 +    };
   1.383 +#endif  /* U_HIDE_INTERNAL_API */
   1.384 +
   1.385 +    /** @internal */
   1.386 +    virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
   1.387 +                                        Node *nextNode) const = 0;
   1.388 +
   1.389 +    /** @internal */
   1.390 +    virtual int32_t write(int32_t unit) = 0;
   1.391 +    /** @internal */
   1.392 +    virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
   1.393 +    /** @internal */
   1.394 +    virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
   1.395 +    /** @internal */
   1.396 +    virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
   1.397 +    /** @internal */
   1.398 +    virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
   1.399 +};
   1.400 +
   1.401 +U_NAMESPACE_END
   1.402 +
   1.403 +#endif  // __STRINGTRIEBUILDER_H__

mercurial