intl/icu/source/common/unicode/stringtriebuilder.h

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 * Copyright (C) 2010-2012, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 *******************************************************************************
michael@0 6 * file name: stringtriebuilder.h
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2010dec24
michael@0 12 * created by: Markus W. Scherer
michael@0 13 */
michael@0 14
michael@0 15 #ifndef __STRINGTRIEBUILDER_H__
michael@0 16 #define __STRINGTRIEBUILDER_H__
michael@0 17
michael@0 18 #include "unicode/utypes.h"
michael@0 19 #include "unicode/uobject.h"
michael@0 20
michael@0 21 /**
michael@0 22 * \file
michael@0 23 * \brief C++ API: Builder API for trie builders
michael@0 24 */
michael@0 25
michael@0 26 // Forward declaration.
michael@0 27 struct UHashtable;
michael@0 28 typedef struct UHashtable UHashtable;
michael@0 29
michael@0 30 /**
michael@0 31 * Build options for BytesTrieBuilder and CharsTrieBuilder.
michael@0 32 * @stable ICU 4.8
michael@0 33 */
michael@0 34 enum UStringTrieBuildOption {
michael@0 35 /**
michael@0 36 * Builds a trie quickly.
michael@0 37 * @stable ICU 4.8
michael@0 38 */
michael@0 39 USTRINGTRIE_BUILD_FAST,
michael@0 40 /**
michael@0 41 * Builds a trie more slowly, attempting to generate
michael@0 42 * a shorter but equivalent serialization.
michael@0 43 * This build option also uses more memory.
michael@0 44 *
michael@0 45 * This option can be effective when many integer values are the same
michael@0 46 * and string/byte sequence suffixes can be shared.
michael@0 47 * Runtime speed is not expected to improve.
michael@0 48 * @stable ICU 4.8
michael@0 49 */
michael@0 50 USTRINGTRIE_BUILD_SMALL
michael@0 51 };
michael@0 52
michael@0 53 U_NAMESPACE_BEGIN
michael@0 54
michael@0 55 /**
michael@0 56 * Base class for string trie builder classes.
michael@0 57 *
michael@0 58 * This class is not intended for public subclassing.
michael@0 59 * @stable ICU 4.8
michael@0 60 */
michael@0 61 class U_COMMON_API StringTrieBuilder : public UObject {
michael@0 62 public:
michael@0 63 #ifndef U_HIDE_INTERNAL_API
michael@0 64 /** @internal */
michael@0 65 static UBool hashNode(const void *node);
michael@0 66 /** @internal */
michael@0 67 static UBool equalNodes(const void *left, const void *right);
michael@0 68 #endif /* U_HIDE_INTERNAL_API */
michael@0 69
michael@0 70 protected:
michael@0 71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
michael@0 72 // or else the compiler will create a public default constructor.
michael@0 73 /** @internal */
michael@0 74 StringTrieBuilder();
michael@0 75 /** @internal */
michael@0 76 virtual ~StringTrieBuilder();
michael@0 77
michael@0 78 #ifndef U_HIDE_INTERNAL_API
michael@0 79 /** @internal */
michael@0 80 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
michael@0 81 /** @internal */
michael@0 82 void deleteCompactBuilder();
michael@0 83
michael@0 84 /** @internal */
michael@0 85 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
michael@0 86
michael@0 87 /** @internal */
michael@0 88 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
michael@0 89 /** @internal */
michael@0 90 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
michael@0 91 #endif /* U_HIDE_INTERNAL_API */
michael@0 92
michael@0 93 class Node;
michael@0 94
michael@0 95 #ifndef U_HIDE_INTERNAL_API
michael@0 96 /** @internal */
michael@0 97 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
michael@0 98 /** @internal */
michael@0 99 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
michael@0 100 int32_t length, UErrorCode &errorCode);
michael@0 101 #endif /* U_HIDE_INTERNAL_API */
michael@0 102
michael@0 103 /** @internal */
michael@0 104 virtual int32_t getElementStringLength(int32_t i) const = 0;
michael@0 105 /** @internal */
michael@0 106 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
michael@0 107 /** @internal */
michael@0 108 virtual int32_t getElementValue(int32_t i) const = 0;
michael@0 109
michael@0 110 // Finds the first unit index after this one where
michael@0 111 // the first and last element have different units again.
michael@0 112 /** @internal */
michael@0 113 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
michael@0 114
michael@0 115 // Number of different units at unitIndex.
michael@0 116 /** @internal */
michael@0 117 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
michael@0 118 /** @internal */
michael@0 119 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
michael@0 120 /** @internal */
michael@0 121 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
michael@0 122
michael@0 123 /** @internal */
michael@0 124 virtual UBool matchNodesCanHaveValues() const = 0;
michael@0 125
michael@0 126 /** @internal */
michael@0 127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
michael@0 128 /** @internal */
michael@0 129 virtual int32_t getMinLinearMatch() const = 0;
michael@0 130 /** @internal */
michael@0 131 virtual int32_t getMaxLinearMatchLength() const = 0;
michael@0 132
michael@0 133 #ifndef U_HIDE_INTERNAL_API
michael@0 134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
michael@0 135 /** @internal */
michael@0 136 static const int32_t kMaxBranchLinearSubNodeLength=5;
michael@0 137
michael@0 138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
michael@0 139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
michael@0 140 /** @internal */
michael@0 141 static const int32_t kMaxSplitBranchLevels=14;
michael@0 142
michael@0 143 /**
michael@0 144 * Makes sure that there is only one unique node registered that is
michael@0 145 * equivalent to newNode.
michael@0 146 * @param newNode Input node. The builder takes ownership.
michael@0 147 * @param errorCode ICU in/out UErrorCode.
michael@0 148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
michael@0 149 * @return newNode if it is the first of its kind, or
michael@0 150 * an equivalent node if newNode is a duplicate.
michael@0 151 * @internal
michael@0 152 */
michael@0 153 Node *registerNode(Node *newNode, UErrorCode &errorCode);
michael@0 154 /**
michael@0 155 * Makes sure that there is only one unique FinalValueNode registered
michael@0 156 * with this value.
michael@0 157 * Avoids creating a node if the value is a duplicate.
michael@0 158 * @param value A final value.
michael@0 159 * @param errorCode ICU in/out UErrorCode.
michael@0 160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
michael@0 161 * @return A FinalValueNode with the given value.
michael@0 162 * @internal
michael@0 163 */
michael@0 164 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
michael@0 165
michael@0 166 /*
michael@0 167 * C++ note:
michael@0 168 * registerNode() and registerFinalValue() take ownership of their input nodes,
michael@0 169 * and only return owned nodes.
michael@0 170 * If they see a failure UErrorCode, they will delete the input node.
michael@0 171 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
michael@0 172 * If there is a failure, they return NULL.
michael@0 173 *
michael@0 174 * NULL Node pointers can be safely passed into other Nodes because
michael@0 175 * they call the static Node::hashCode() which checks for a NULL pointer first.
michael@0 176 *
michael@0 177 * Therefore, as long as builder functions register a new node,
michael@0 178 * they need to check for failures only before explicitly dereferencing
michael@0 179 * a Node pointer, or before setting a new UErrorCode.
michael@0 180 */
michael@0 181
michael@0 182 // Hash set of nodes, maps from nodes to integer 1.
michael@0 183 /** @internal */
michael@0 184 UHashtable *nodes;
michael@0 185
michael@0 186 /** @internal */
michael@0 187 class Node : public UObject {
michael@0 188 public:
michael@0 189 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
michael@0 190 inline int32_t hashCode() const { return hash; }
michael@0 191 // Handles node==NULL.
michael@0 192 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
michael@0 193 // Base class operator==() compares the actual class types.
michael@0 194 virtual UBool operator==(const Node &other) const;
michael@0 195 inline UBool operator!=(const Node &other) const { return !operator==(other); }
michael@0 196 /**
michael@0 197 * Traverses the Node graph and numbers branch edges, with rightmost edges first.
michael@0 198 * This is to avoid writing a duplicate node twice.
michael@0 199 *
michael@0 200 * Branch nodes in this trie data structure are not symmetric.
michael@0 201 * Most branch edges "jump" to other nodes but the rightmost branch edges
michael@0 202 * just continue without a jump.
michael@0 203 * Therefore, write() must write the rightmost branch edge last
michael@0 204 * (trie units are written backwards), and must write it at that point even if
michael@0 205 * it is a duplicate of a node previously written elsewhere.
michael@0 206 *
michael@0 207 * This function visits and marks right branch edges first.
michael@0 208 * Edges are numbered with increasingly negative values because we share the
michael@0 209 * offset field which gets positive values when nodes are written.
michael@0 210 * A branch edge also remembers the first number for any of its edges.
michael@0 211 *
michael@0 212 * When a further-left branch edge has a number in the range of the rightmost
michael@0 213 * edge's numbers, then it will be written as part of the required right edge
michael@0 214 * and we can avoid writing it first.
michael@0 215 *
michael@0 216 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
michael@0 217 * edge numbers.
michael@0 218 *
michael@0 219 * @param edgeNumber The first edge number for this node and its sub-nodes.
michael@0 220 * @return An edge number that is at least the maximum-negative
michael@0 221 * of the input edge number and the numbers of this node and all of its sub-nodes.
michael@0 222 */
michael@0 223 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 224 // write() must set the offset to a positive value.
michael@0 225 virtual void write(StringTrieBuilder &builder) = 0;
michael@0 226 // See markRightEdgesFirst.
michael@0 227 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
michael@0 228 StringTrieBuilder &builder) {
michael@0 229 // Note: Edge numbers are negative, lastRight<=firstRight.
michael@0 230 // If offset>0 then this node and its sub-nodes have been written already
michael@0 231 // and we need not write them again.
michael@0 232 // If this node is part of the unwritten right branch edge,
michael@0 233 // then we wait until that is written.
michael@0 234 if(offset<0 && (offset<lastRight || firstRight<offset)) {
michael@0 235 write(builder);
michael@0 236 }
michael@0 237 }
michael@0 238 inline int32_t getOffset() const { return offset; }
michael@0 239 protected:
michael@0 240 int32_t hash;
michael@0 241 int32_t offset;
michael@0 242 };
michael@0 243
michael@0 244 // This class should not be overridden because
michael@0 245 // registerFinalValue() compares a stack-allocated FinalValueNode
michael@0 246 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
michael@0 247 // with the input node, and the
michael@0 248 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
michael@0 249 // will be false if the typeid's are different.
michael@0 250 /** @internal */
michael@0 251 class FinalValueNode : public Node {
michael@0 252 public:
michael@0 253 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
michael@0 254 virtual UBool operator==(const Node &other) const;
michael@0 255 virtual void write(StringTrieBuilder &builder);
michael@0 256 protected:
michael@0 257 int32_t value;
michael@0 258 };
michael@0 259
michael@0 260 /**
michael@0 261 * @internal
michael@0 262 */
michael@0 263 class ValueNode : public Node {
michael@0 264 public:
michael@0 265 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
michael@0 266 virtual UBool operator==(const Node &other) const;
michael@0 267 void setValue(int32_t v) {
michael@0 268 hasValue=TRUE;
michael@0 269 value=v;
michael@0 270 hash=hash*37+v;
michael@0 271 }
michael@0 272 protected:
michael@0 273 UBool hasValue;
michael@0 274 int32_t value;
michael@0 275 };
michael@0 276
michael@0 277 /**
michael@0 278 * @internal
michael@0 279 */
michael@0 280 class IntermediateValueNode : public ValueNode {
michael@0 281 public:
michael@0 282 IntermediateValueNode(int32_t v, Node *nextNode)
michael@0 283 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
michael@0 284 virtual UBool operator==(const Node &other) const;
michael@0 285 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 286 virtual void write(StringTrieBuilder &builder);
michael@0 287 protected:
michael@0 288 Node *next;
michael@0 289 };
michael@0 290
michael@0 291 /**
michael@0 292 * @internal
michael@0 293 */
michael@0 294 class LinearMatchNode : public ValueNode {
michael@0 295 public:
michael@0 296 LinearMatchNode(int32_t len, Node *nextNode)
michael@0 297 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
michael@0 298 length(len), next(nextNode) {}
michael@0 299 virtual UBool operator==(const Node &other) const;
michael@0 300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 301 protected:
michael@0 302 int32_t length;
michael@0 303 Node *next;
michael@0 304 };
michael@0 305
michael@0 306 /**
michael@0 307 * @internal
michael@0 308 */
michael@0 309 class BranchNode : public Node {
michael@0 310 public:
michael@0 311 BranchNode(int32_t initialHash) : Node(initialHash) {}
michael@0 312 protected:
michael@0 313 int32_t firstEdgeNumber;
michael@0 314 };
michael@0 315
michael@0 316 /**
michael@0 317 * @internal
michael@0 318 */
michael@0 319 class ListBranchNode : public BranchNode {
michael@0 320 public:
michael@0 321 ListBranchNode() : BranchNode(0x444444), length(0) {}
michael@0 322 virtual UBool operator==(const Node &other) const;
michael@0 323 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 324 virtual void write(StringTrieBuilder &builder);
michael@0 325 // Adds a unit with a final value.
michael@0 326 void add(int32_t c, int32_t value) {
michael@0 327 units[length]=(UChar)c;
michael@0 328 equal[length]=NULL;
michael@0 329 values[length]=value;
michael@0 330 ++length;
michael@0 331 hash=(hash*37+c)*37+value;
michael@0 332 }
michael@0 333 // Adds a unit which leads to another match node.
michael@0 334 void add(int32_t c, Node *node) {
michael@0 335 units[length]=(UChar)c;
michael@0 336 equal[length]=node;
michael@0 337 values[length]=0;
michael@0 338 ++length;
michael@0 339 hash=(hash*37+c)*37+hashCode(node);
michael@0 340 }
michael@0 341 protected:
michael@0 342 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
michael@0 343 int32_t length;
michael@0 344 int32_t values[kMaxBranchLinearSubNodeLength];
michael@0 345 UChar units[kMaxBranchLinearSubNodeLength];
michael@0 346 };
michael@0 347
michael@0 348 /**
michael@0 349 * @internal
michael@0 350 */
michael@0 351 class SplitBranchNode : public BranchNode {
michael@0 352 public:
michael@0 353 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
michael@0 354 : BranchNode(((0x555555*37+middleUnit)*37+
michael@0 355 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
michael@0 356 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
michael@0 357 virtual UBool operator==(const Node &other) const;
michael@0 358 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 359 virtual void write(StringTrieBuilder &builder);
michael@0 360 protected:
michael@0 361 UChar unit;
michael@0 362 Node *lessThan;
michael@0 363 Node *greaterOrEqual;
michael@0 364 };
michael@0 365
michael@0 366 // Branch head node, for writing the actual node lead unit.
michael@0 367 /** @internal */
michael@0 368 class BranchHeadNode : public ValueNode {
michael@0 369 public:
michael@0 370 BranchHeadNode(int32_t len, Node *subNode)
michael@0 371 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
michael@0 372 length(len), next(subNode) {}
michael@0 373 virtual UBool operator==(const Node &other) const;
michael@0 374 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
michael@0 375 virtual void write(StringTrieBuilder &builder);
michael@0 376 protected:
michael@0 377 int32_t length;
michael@0 378 Node *next; // A branch sub-node.
michael@0 379 };
michael@0 380 #endif /* U_HIDE_INTERNAL_API */
michael@0 381
michael@0 382 /** @internal */
michael@0 383 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
michael@0 384 Node *nextNode) const = 0;
michael@0 385
michael@0 386 /** @internal */
michael@0 387 virtual int32_t write(int32_t unit) = 0;
michael@0 388 /** @internal */
michael@0 389 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
michael@0 390 /** @internal */
michael@0 391 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
michael@0 392 /** @internal */
michael@0 393 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
michael@0 394 /** @internal */
michael@0 395 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
michael@0 396 };
michael@0 397
michael@0 398 U_NAMESPACE_END
michael@0 399
michael@0 400 #endif // __STRINGTRIEBUILDER_H__

mercurial