1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unicode/stringtriebuilder.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,400 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* Copyright (C) 2010-2012, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +******************************************************************************* 1.9 +* file name: stringtriebuilder.h 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2010dec24 1.15 +* created by: Markus W. Scherer 1.16 +*/ 1.17 + 1.18 +#ifndef __STRINGTRIEBUILDER_H__ 1.19 +#define __STRINGTRIEBUILDER_H__ 1.20 + 1.21 +#include "unicode/utypes.h" 1.22 +#include "unicode/uobject.h" 1.23 + 1.24 +/** 1.25 + * \file 1.26 + * \brief C++ API: Builder API for trie builders 1.27 + */ 1.28 + 1.29 +// Forward declaration. 1.30 +struct UHashtable; 1.31 +typedef struct UHashtable UHashtable; 1.32 + 1.33 +/** 1.34 + * Build options for BytesTrieBuilder and CharsTrieBuilder. 1.35 + * @stable ICU 4.8 1.36 + */ 1.37 +enum UStringTrieBuildOption { 1.38 + /** 1.39 + * Builds a trie quickly. 1.40 + * @stable ICU 4.8 1.41 + */ 1.42 + USTRINGTRIE_BUILD_FAST, 1.43 + /** 1.44 + * Builds a trie more slowly, attempting to generate 1.45 + * a shorter but equivalent serialization. 1.46 + * This build option also uses more memory. 1.47 + * 1.48 + * This option can be effective when many integer values are the same 1.49 + * and string/byte sequence suffixes can be shared. 1.50 + * Runtime speed is not expected to improve. 1.51 + * @stable ICU 4.8 1.52 + */ 1.53 + USTRINGTRIE_BUILD_SMALL 1.54 +}; 1.55 + 1.56 +U_NAMESPACE_BEGIN 1.57 + 1.58 +/** 1.59 + * Base class for string trie builder classes. 1.60 + * 1.61 + * This class is not intended for public subclassing. 1.62 + * @stable ICU 4.8 1.63 + */ 1.64 +class U_COMMON_API StringTrieBuilder : public UObject { 1.65 +public: 1.66 +#ifndef U_HIDE_INTERNAL_API 1.67 + /** @internal */ 1.68 + static UBool hashNode(const void *node); 1.69 + /** @internal */ 1.70 + static UBool equalNodes(const void *left, const void *right); 1.71 +#endif /* U_HIDE_INTERNAL_API */ 1.72 + 1.73 +protected: 1.74 + // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API 1.75 + // or else the compiler will create a public default constructor. 1.76 + /** @internal */ 1.77 + StringTrieBuilder(); 1.78 + /** @internal */ 1.79 + virtual ~StringTrieBuilder(); 1.80 + 1.81 +#ifndef U_HIDE_INTERNAL_API 1.82 + /** @internal */ 1.83 + void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 1.84 + /** @internal */ 1.85 + void deleteCompactBuilder(); 1.86 + 1.87 + /** @internal */ 1.88 + void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 1.89 + 1.90 + /** @internal */ 1.91 + int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 1.92 + /** @internal */ 1.93 + int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 1.94 +#endif /* U_HIDE_INTERNAL_API */ 1.95 + 1.96 + class Node; 1.97 + 1.98 +#ifndef U_HIDE_INTERNAL_API 1.99 + /** @internal */ 1.100 + Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 1.101 + /** @internal */ 1.102 + Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 1.103 + int32_t length, UErrorCode &errorCode); 1.104 +#endif /* U_HIDE_INTERNAL_API */ 1.105 + 1.106 + /** @internal */ 1.107 + virtual int32_t getElementStringLength(int32_t i) const = 0; 1.108 + /** @internal */ 1.109 + virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; 1.110 + /** @internal */ 1.111 + virtual int32_t getElementValue(int32_t i) const = 0; 1.112 + 1.113 + // Finds the first unit index after this one where 1.114 + // the first and last element have different units again. 1.115 + /** @internal */ 1.116 + virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 1.117 + 1.118 + // Number of different units at unitIndex. 1.119 + /** @internal */ 1.120 + virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 1.121 + /** @internal */ 1.122 + virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 1.123 + /** @internal */ 1.124 + virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; 1.125 + 1.126 + /** @internal */ 1.127 + virtual UBool matchNodesCanHaveValues() const = 0; 1.128 + 1.129 + /** @internal */ 1.130 + virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 1.131 + /** @internal */ 1.132 + virtual int32_t getMinLinearMatch() const = 0; 1.133 + /** @internal */ 1.134 + virtual int32_t getMaxLinearMatchLength() const = 0; 1.135 + 1.136 +#ifndef U_HIDE_INTERNAL_API 1.137 + // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 1.138 + /** @internal */ 1.139 + static const int32_t kMaxBranchLinearSubNodeLength=5; 1.140 + 1.141 + // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. 1.142 + // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 1.143 + /** @internal */ 1.144 + static const int32_t kMaxSplitBranchLevels=14; 1.145 + 1.146 + /** 1.147 + * Makes sure that there is only one unique node registered that is 1.148 + * equivalent to newNode. 1.149 + * @param newNode Input node. The builder takes ownership. 1.150 + * @param errorCode ICU in/out UErrorCode. 1.151 + Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 1.152 + * @return newNode if it is the first of its kind, or 1.153 + * an equivalent node if newNode is a duplicate. 1.154 + * @internal 1.155 + */ 1.156 + Node *registerNode(Node *newNode, UErrorCode &errorCode); 1.157 + /** 1.158 + * Makes sure that there is only one unique FinalValueNode registered 1.159 + * with this value. 1.160 + * Avoids creating a node if the value is a duplicate. 1.161 + * @param value A final value. 1.162 + * @param errorCode ICU in/out UErrorCode. 1.163 + Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 1.164 + * @return A FinalValueNode with the given value. 1.165 + * @internal 1.166 + */ 1.167 + Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 1.168 + 1.169 + /* 1.170 + * C++ note: 1.171 + * registerNode() and registerFinalValue() take ownership of their input nodes, 1.172 + * and only return owned nodes. 1.173 + * If they see a failure UErrorCode, they will delete the input node. 1.174 + * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 1.175 + * If there is a failure, they return NULL. 1.176 + * 1.177 + * NULL Node pointers can be safely passed into other Nodes because 1.178 + * they call the static Node::hashCode() which checks for a NULL pointer first. 1.179 + * 1.180 + * Therefore, as long as builder functions register a new node, 1.181 + * they need to check for failures only before explicitly dereferencing 1.182 + * a Node pointer, or before setting a new UErrorCode. 1.183 + */ 1.184 + 1.185 + // Hash set of nodes, maps from nodes to integer 1. 1.186 + /** @internal */ 1.187 + UHashtable *nodes; 1.188 + 1.189 + /** @internal */ 1.190 + class Node : public UObject { 1.191 + public: 1.192 + Node(int32_t initialHash) : hash(initialHash), offset(0) {} 1.193 + inline int32_t hashCode() const { return hash; } 1.194 + // Handles node==NULL. 1.195 + static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 1.196 + // Base class operator==() compares the actual class types. 1.197 + virtual UBool operator==(const Node &other) const; 1.198 + inline UBool operator!=(const Node &other) const { return !operator==(other); } 1.199 + /** 1.200 + * Traverses the Node graph and numbers branch edges, with rightmost edges first. 1.201 + * This is to avoid writing a duplicate node twice. 1.202 + * 1.203 + * Branch nodes in this trie data structure are not symmetric. 1.204 + * Most branch edges "jump" to other nodes but the rightmost branch edges 1.205 + * just continue without a jump. 1.206 + * Therefore, write() must write the rightmost branch edge last 1.207 + * (trie units are written backwards), and must write it at that point even if 1.208 + * it is a duplicate of a node previously written elsewhere. 1.209 + * 1.210 + * This function visits and marks right branch edges first. 1.211 + * Edges are numbered with increasingly negative values because we share the 1.212 + * offset field which gets positive values when nodes are written. 1.213 + * A branch edge also remembers the first number for any of its edges. 1.214 + * 1.215 + * When a further-left branch edge has a number in the range of the rightmost 1.216 + * edge's numbers, then it will be written as part of the required right edge 1.217 + * and we can avoid writing it first. 1.218 + * 1.219 + * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 1.220 + * edge numbers. 1.221 + * 1.222 + * @param edgeNumber The first edge number for this node and its sub-nodes. 1.223 + * @return An edge number that is at least the maximum-negative 1.224 + * of the input edge number and the numbers of this node and all of its sub-nodes. 1.225 + */ 1.226 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.227 + // write() must set the offset to a positive value. 1.228 + virtual void write(StringTrieBuilder &builder) = 0; 1.229 + // See markRightEdgesFirst. 1.230 + inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 1.231 + StringTrieBuilder &builder) { 1.232 + // Note: Edge numbers are negative, lastRight<=firstRight. 1.233 + // If offset>0 then this node and its sub-nodes have been written already 1.234 + // and we need not write them again. 1.235 + // If this node is part of the unwritten right branch edge, 1.236 + // then we wait until that is written. 1.237 + if(offset<0 && (offset<lastRight || firstRight<offset)) { 1.238 + write(builder); 1.239 + } 1.240 + } 1.241 + inline int32_t getOffset() const { return offset; } 1.242 + protected: 1.243 + int32_t hash; 1.244 + int32_t offset; 1.245 + }; 1.246 + 1.247 + // This class should not be overridden because 1.248 + // registerFinalValue() compares a stack-allocated FinalValueNode 1.249 + // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 1.250 + // with the input node, and the 1.251 + // !Node::operator==(other) used inside FinalValueNode::operator==(other) 1.252 + // will be false if the typeid's are different. 1.253 + /** @internal */ 1.254 + class FinalValueNode : public Node { 1.255 + public: 1.256 + FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} 1.257 + virtual UBool operator==(const Node &other) const; 1.258 + virtual void write(StringTrieBuilder &builder); 1.259 + protected: 1.260 + int32_t value; 1.261 + }; 1.262 + 1.263 + /** 1.264 + * @internal 1.265 + */ 1.266 + class ValueNode : public Node { 1.267 + public: 1.268 + ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 1.269 + virtual UBool operator==(const Node &other) const; 1.270 + void setValue(int32_t v) { 1.271 + hasValue=TRUE; 1.272 + value=v; 1.273 + hash=hash*37+v; 1.274 + } 1.275 + protected: 1.276 + UBool hasValue; 1.277 + int32_t value; 1.278 + }; 1.279 + 1.280 + /** 1.281 + * @internal 1.282 + */ 1.283 + class IntermediateValueNode : public ValueNode { 1.284 + public: 1.285 + IntermediateValueNode(int32_t v, Node *nextNode) 1.286 + : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } 1.287 + virtual UBool operator==(const Node &other) const; 1.288 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.289 + virtual void write(StringTrieBuilder &builder); 1.290 + protected: 1.291 + Node *next; 1.292 + }; 1.293 + 1.294 + /** 1.295 + * @internal 1.296 + */ 1.297 + class LinearMatchNode : public ValueNode { 1.298 + public: 1.299 + LinearMatchNode(int32_t len, Node *nextNode) 1.300 + : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), 1.301 + length(len), next(nextNode) {} 1.302 + virtual UBool operator==(const Node &other) const; 1.303 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.304 + protected: 1.305 + int32_t length; 1.306 + Node *next; 1.307 + }; 1.308 + 1.309 + /** 1.310 + * @internal 1.311 + */ 1.312 + class BranchNode : public Node { 1.313 + public: 1.314 + BranchNode(int32_t initialHash) : Node(initialHash) {} 1.315 + protected: 1.316 + int32_t firstEdgeNumber; 1.317 + }; 1.318 + 1.319 + /** 1.320 + * @internal 1.321 + */ 1.322 + class ListBranchNode : public BranchNode { 1.323 + public: 1.324 + ListBranchNode() : BranchNode(0x444444), length(0) {} 1.325 + virtual UBool operator==(const Node &other) const; 1.326 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.327 + virtual void write(StringTrieBuilder &builder); 1.328 + // Adds a unit with a final value. 1.329 + void add(int32_t c, int32_t value) { 1.330 + units[length]=(UChar)c; 1.331 + equal[length]=NULL; 1.332 + values[length]=value; 1.333 + ++length; 1.334 + hash=(hash*37+c)*37+value; 1.335 + } 1.336 + // Adds a unit which leads to another match node. 1.337 + void add(int32_t c, Node *node) { 1.338 + units[length]=(UChar)c; 1.339 + equal[length]=node; 1.340 + values[length]=0; 1.341 + ++length; 1.342 + hash=(hash*37+c)*37+hashCode(node); 1.343 + } 1.344 + protected: 1.345 + Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 1.346 + int32_t length; 1.347 + int32_t values[kMaxBranchLinearSubNodeLength]; 1.348 + UChar units[kMaxBranchLinearSubNodeLength]; 1.349 + }; 1.350 + 1.351 + /** 1.352 + * @internal 1.353 + */ 1.354 + class SplitBranchNode : public BranchNode { 1.355 + public: 1.356 + SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 1.357 + : BranchNode(((0x555555*37+middleUnit)*37+ 1.358 + hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), 1.359 + unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 1.360 + virtual UBool operator==(const Node &other) const; 1.361 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.362 + virtual void write(StringTrieBuilder &builder); 1.363 + protected: 1.364 + UChar unit; 1.365 + Node *lessThan; 1.366 + Node *greaterOrEqual; 1.367 + }; 1.368 + 1.369 + // Branch head node, for writing the actual node lead unit. 1.370 + /** @internal */ 1.371 + class BranchHeadNode : public ValueNode { 1.372 + public: 1.373 + BranchHeadNode(int32_t len, Node *subNode) 1.374 + : ValueNode((0x666666*37+len)*37+hashCode(subNode)), 1.375 + length(len), next(subNode) {} 1.376 + virtual UBool operator==(const Node &other) const; 1.377 + virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 1.378 + virtual void write(StringTrieBuilder &builder); 1.379 + protected: 1.380 + int32_t length; 1.381 + Node *next; // A branch sub-node. 1.382 + }; 1.383 +#endif /* U_HIDE_INTERNAL_API */ 1.384 + 1.385 + /** @internal */ 1.386 + virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 1.387 + Node *nextNode) const = 0; 1.388 + 1.389 + /** @internal */ 1.390 + virtual int32_t write(int32_t unit) = 0; 1.391 + /** @internal */ 1.392 + virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 1.393 + /** @internal */ 1.394 + virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 1.395 + /** @internal */ 1.396 + virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 1.397 + /** @internal */ 1.398 + virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 1.399 +}; 1.400 + 1.401 +U_NAMESPACE_END 1.402 + 1.403 +#endif // __STRINGTRIEBUILDER_H__