michael@0: /* michael@0: ******************************************************************************* michael@0: * Copyright (C) 2010-2012, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ******************************************************************************* michael@0: * file name: stringtriebuilder.h michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2010dec24 michael@0: * created by: Markus W. Scherer michael@0: */ michael@0: michael@0: #ifndef __STRINGTRIEBUILDER_H__ michael@0: #define __STRINGTRIEBUILDER_H__ michael@0: michael@0: #include "unicode/utypes.h" michael@0: #include "unicode/uobject.h" michael@0: michael@0: /** michael@0: * \file michael@0: * \brief C++ API: Builder API for trie builders michael@0: */ michael@0: michael@0: // Forward declaration. michael@0: struct UHashtable; michael@0: typedef struct UHashtable UHashtable; michael@0: michael@0: /** michael@0: * Build options for BytesTrieBuilder and UCharsTrieBuilder, michael@0: * passed as the buildOption argument of StringTrieBuilder::build(). michael@0: * @stable ICU 4.8 michael@0: */ michael@0: enum UStringTrieBuildOption { michael@0: /** michael@0: * Builds a trie quickly. michael@0: * @stable ICU 4.8 michael@0: */ michael@0: USTRINGTRIE_BUILD_FAST, michael@0: /** michael@0: * Builds a trie more slowly, attempting to generate michael@0: * a shorter but equivalent serialization. michael@0: * This build option also uses more memory. michael@0: * michael@0: * This option can be effective when many integer values are the same michael@0: * and string/byte sequence suffixes can be shared. michael@0: * Runtime speed is not expected to improve. michael@0: * @stable ICU 4.8 michael@0: */ michael@0: USTRINGTRIE_BUILD_SMALL michael@0: }; michael@0: michael@0: U_NAMESPACE_BEGIN michael@0: michael@0: /** michael@0: * Base class for string trie builder classes. michael@0: * michael@0: * This class is not intended for public subclassing. 
michael@0: * @stable ICU 4.8 michael@0: */ michael@0: class U_COMMON_API StringTrieBuilder : public UObject { michael@0: public: michael@0: #ifndef U_HIDE_INTERNAL_API michael@0: /** @internal NOTE(review): declared to return UBool, whereas Node::hashCode() in this class returns int32_t; if this function forwards a node's hash code, a boolean return type would narrow it -- confirm against the implementation. */ michael@0: static UBool hashNode(const void *node); michael@0: /** @internal */ michael@0: static UBool equalNodes(const void *left, const void *right); michael@0: #endif /* U_HIDE_INTERNAL_API */ michael@0: michael@0: protected: michael@0: // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API michael@0: // or else the compiler will create a public default constructor. michael@0: /** @internal */ michael@0: StringTrieBuilder(); michael@0: /** @internal */ michael@0: virtual ~StringTrieBuilder(); michael@0: michael@0: #ifndef U_HIDE_INTERNAL_API michael@0: /** @internal */ michael@0: void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); michael@0: /** @internal */ michael@0: void deleteCompactBuilder(); michael@0: michael@0: /** @internal */ michael@0: void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); michael@0: michael@0: /** @internal */ michael@0: int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); michael@0: /** @internal */ michael@0: int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); michael@0: #endif /* U_HIDE_INTERNAL_API */ michael@0: michael@0: class Node; michael@0: michael@0: #ifndef U_HIDE_INTERNAL_API michael@0: /** @internal */ michael@0: Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); michael@0: /** @internal */ michael@0: Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, michael@0: int32_t length, UErrorCode &errorCode); michael@0: #endif /* U_HIDE_INTERNAL_API */ michael@0: michael@0: /** @internal */ michael@0: virtual int32_t getElementStringLength(int32_t i) const = 0; michael@0: /** @internal */ michael@0: virtual UChar getElementUnit(int32_t 
i, int32_t unitIndex) const = 0; michael@0: /** @internal */ michael@0: virtual int32_t getElementValue(int32_t i) const = 0; michael@0: michael@0: // Finds the first unit index after this one where michael@0: // the first and last element have different units again. michael@0: /** @internal */ michael@0: virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; michael@0: michael@0: // Number of different units at unitIndex. michael@0: /** @internal */ michael@0: virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; michael@0: /** @internal */ michael@0: virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; michael@0: /** @internal */ michael@0: virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; michael@0: michael@0: /** @internal */ michael@0: virtual UBool matchNodesCanHaveValues() const = 0; michael@0: michael@0: /** @internal */ michael@0: virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; michael@0: /** @internal */ michael@0: virtual int32_t getMinLinearMatch() const = 0; michael@0: /** @internal */ michael@0: virtual int32_t getMaxLinearMatchLength() const = 0; michael@0: michael@0: #ifndef U_HIDE_INTERNAL_API michael@0: // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). michael@0: /** @internal */ michael@0: static const int32_t kMaxBranchLinearSubNodeLength=5; michael@0: michael@0: // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. michael@0: // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. michael@0: /** @internal */ michael@0: static const int32_t kMaxSplitBranchLevels=14; michael@0: michael@0: /** michael@0: * Makes sure that there is only one unique node registered that is michael@0: * equivalent to newNode. michael@0: * @param newNode Input node. The builder takes ownership. 
michael@0: * @param errorCode ICU in/out UErrorCode. michael@0: Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. michael@0: * @return newNode if it is the first of its kind, or michael@0: * an equivalent node if newNode is a duplicate. michael@0: * @internal michael@0: */ michael@0: Node *registerNode(Node *newNode, UErrorCode &errorCode); michael@0: /** michael@0: * Makes sure that there is only one unique FinalValueNode registered michael@0: * with this value. michael@0: * Avoids creating a node if the value is a duplicate. michael@0: * @param value A final value. michael@0: * @param errorCode ICU in/out UErrorCode. michael@0: Set to U_MEMORY_ALLOCATION_ERROR if it was success but a newly michael@0: needed FinalValueNode was NULL (this function has no newNode michael@0: parameter; presumably the internally created node is meant -- michael@0: confirm against the implementation). michael@0: * @return A FinalValueNode with the given value, or NULL if there is a failure. michael@0: * @internal michael@0: */ michael@0: Node *registerFinalValue(int32_t value, UErrorCode &errorCode); michael@0: michael@0: /* michael@0: * C++ note: michael@0: * registerNode() and registerFinalValue() take ownership of their input nodes, michael@0: * and only return owned nodes. michael@0: * If they see a failure UErrorCode, they will delete the input node. michael@0: * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. michael@0: * If there is a failure, they return NULL. michael@0: * michael@0: * NULL Node pointers can be safely passed into other Nodes because michael@0: * they call the static Node::hashCode() which checks for a NULL pointer first. michael@0: * michael@0: * Therefore, as long as builder functions register a new node, michael@0: * they need to check for failures only before explicitly dereferencing michael@0: * a Node pointer, or before setting a new UErrorCode. michael@0: */ michael@0: michael@0: // Hash set of nodes, maps from nodes to integer 1. 
michael@0: /** @internal */ michael@0: UHashtable *nodes; michael@0: michael@0: /** @internal */ michael@0: class Node : public UObject { michael@0: public: michael@0: Node(int32_t initialHash) : hash(initialHash), offset(0) {} michael@0: inline int32_t hashCode() const { return hash; } michael@0: // Handles node==NULL. michael@0: static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } michael@0: // Base class operator==() compares the actual class types. michael@0: virtual UBool operator==(const Node &other) const; michael@0: inline UBool operator!=(const Node &other) const { return !operator==(other); } michael@0: /** michael@0: * Traverses the Node graph and numbers branch edges, with rightmost edges first. michael@0: * This is to avoid writing a duplicate node twice. michael@0: * michael@0: * Branch nodes in this trie data structure are not symmetric. michael@0: * Most branch edges "jump" to other nodes but the rightmost branch edges michael@0: * just continue without a jump. michael@0: * Therefore, write() must write the rightmost branch edge last michael@0: * (trie units are written backwards), and must write it at that point even if michael@0: * it is a duplicate of a node previously written elsewhere. michael@0: * michael@0: * This function visits and marks right branch edges first. michael@0: * Edges are numbered with increasingly negative values because we share the michael@0: * offset field which gets positive values when nodes are written. michael@0: * A branch edge also remembers the first number for any of its edges. michael@0: * michael@0: * When a further-left branch edge has a number in the range of the rightmost michael@0: * edge's numbers, then it will be written as part of the required right edge michael@0: * and we can avoid writing it first. michael@0: * michael@0: * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative michael@0: * edge numbers. 
michael@0: * michael@0: * @param edgeNumber The first edge number for this node and its sub-nodes. michael@0: * @return An edge number that is at least the maximum-negative michael@0: * of the input edge number and the numbers of this node and all of its sub-nodes. michael@0: */ michael@0: virtual int32_t markRightEdgesFirst(int32_t edgeNumber); michael@0: // write() must set the offset to a positive value. michael@0: virtual void write(StringTrieBuilder &builder) = 0; michael@0: // See markRightEdgesFirst. michael@0: inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, michael@0: StringTrieBuilder &builder) { michael@0: // Note: Edge numbers are negative, lastRight<=firstRight. michael@0: // If offset>0 then this node and its sub-nodes have been written already michael@0: // and we need not write them again. michael@0: // If this node is part of the unwritten right branch edge, michael@0: // then we wait until that is written. michael@0: if(offset<0 && (offset