|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: stringtriebuilder.h |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010dec24 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #ifndef __STRINGTRIEBUILDER_H__ |
|
16 #define __STRINGTRIEBUILDER_H__ |
|
17 |
|
18 #include "unicode/utypes.h" |
|
19 #include "unicode/uobject.h" |
|
20 |
|
21 /** |
|
22 * \file |
|
23 * \brief C++ API: Builder API for trie builders |
|
24 */ |
|
25 |
|
26 // Forward declaration. |
|
27 struct UHashtable; |
|
28 typedef struct UHashtable UHashtable; |
|
29 |
|
30 /** |
|
31 * Build options for BytesTrieBuilder and CharsTrieBuilder. |
|
32 * @stable ICU 4.8 |
|
33 */ |
|
34 enum UStringTrieBuildOption { |
|
35 /** |
|
36 * Builds a trie quickly. |
|
37 * @stable ICU 4.8 |
|
38 */ |
|
39 USTRINGTRIE_BUILD_FAST, |
|
40 /** |
|
41 * Builds a trie more slowly, attempting to generate |
|
42 * a shorter but equivalent serialization. |
|
43 * This build option also uses more memory. |
|
44 * |
|
45 * This option can be effective when many integer values are the same |
|
46 * and string/byte sequence suffixes can be shared. |
|
47 * Runtime speed is not expected to improve. |
|
48 * @stable ICU 4.8 |
|
49 */ |
|
50 USTRINGTRIE_BUILD_SMALL |
|
51 }; |
|
52 |
|
53 U_NAMESPACE_BEGIN |
|
54 |
|
55 /** |
|
56 * Base class for string trie builder classes. |
|
57 * |
|
58 * This class is not intended for public subclassing. |
|
59 * @stable ICU 4.8 |
|
60 */ |
|
61 class U_COMMON_API StringTrieBuilder : public UObject { |
|
62 public: |
|
63 #ifndef U_HIDE_INTERNAL_API |
|
64 /** @internal */ |
|
65 static UBool hashNode(const void *node); |
|
66 /** @internal */ |
|
67 static UBool equalNodes(const void *left, const void *right); |
|
68 #endif /* U_HIDE_INTERNAL_API */ |
|
69 |
|
70 protected: |
|
71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API |
|
72 // or else the compiler will create a public default constructor. |
|
73 /** @internal */ |
|
74 StringTrieBuilder(); |
|
75 /** @internal */ |
|
76 virtual ~StringTrieBuilder(); |
|
77 |
|
78 #ifndef U_HIDE_INTERNAL_API |
|
79 /** @internal */ |
|
80 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); |
|
81 /** @internal */ |
|
82 void deleteCompactBuilder(); |
|
83 |
|
84 /** @internal */ |
|
85 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); |
|
86 |
|
87 /** @internal */ |
|
88 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); |
|
89 /** @internal */ |
|
90 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); |
|
91 #endif /* U_HIDE_INTERNAL_API */ |
|
92 |
|
93 class Node; |
|
94 |
|
95 #ifndef U_HIDE_INTERNAL_API |
|
96 /** @internal */ |
|
97 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); |
|
98 /** @internal */ |
|
99 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, |
|
100 int32_t length, UErrorCode &errorCode); |
|
101 #endif /* U_HIDE_INTERNAL_API */ |
|
102 |
|
103 /** @internal */ |
|
104 virtual int32_t getElementStringLength(int32_t i) const = 0; |
|
105 /** @internal */ |
|
106 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; |
|
107 /** @internal */ |
|
108 virtual int32_t getElementValue(int32_t i) const = 0; |
|
109 |
|
110 // Finds the first unit index after this one where |
|
111 // the first and last element have different units again. |
|
112 /** @internal */ |
|
113 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; |
|
114 |
|
115 // Number of different units at unitIndex. |
|
116 /** @internal */ |
|
117 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; |
|
118 /** @internal */ |
|
119 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; |
|
120 /** @internal */ |
|
121 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; |
|
122 |
|
123 /** @internal */ |
|
124 virtual UBool matchNodesCanHaveValues() const = 0; |
|
125 |
|
126 /** @internal */ |
|
127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; |
|
128 /** @internal */ |
|
129 virtual int32_t getMinLinearMatch() const = 0; |
|
130 /** @internal */ |
|
131 virtual int32_t getMaxLinearMatchLength() const = 0; |
|
132 |
|
133 #ifndef U_HIDE_INTERNAL_API |
|
134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). |
|
135 /** @internal */ |
|
136 static const int32_t kMaxBranchLinearSubNodeLength=5; |
|
137 |
|
138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. |
|
139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. |
|
140 /** @internal */ |
|
141 static const int32_t kMaxSplitBranchLevels=14; |
|
142 |
|
143 /** |
|
144 * Makes sure that there is only one unique node registered that is |
|
145 * equivalent to newNode. |
|
146 * @param newNode Input node. The builder takes ownership. |
|
147 * @param errorCode ICU in/out UErrorCode. |
|
148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. |
|
149 * @return newNode if it is the first of its kind, or |
|
150 * an equivalent node if newNode is a duplicate. |
|
151 * @internal |
|
152 */ |
|
153 Node *registerNode(Node *newNode, UErrorCode &errorCode); |
|
154 /** |
|
155 * Makes sure that there is only one unique FinalValueNode registered |
|
156 * with this value. |
|
157 * Avoids creating a node if the value is a duplicate. |
|
158 * @param value A final value. |
|
159 * @param errorCode ICU in/out UErrorCode. |
|
160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. |
|
161 * @return A FinalValueNode with the given value. |
|
162 * @internal |
|
163 */ |
|
164 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); |
|
165 |
|
166 /* |
|
167 * C++ note: |
|
168 * registerNode() and registerFinalValue() take ownership of their input nodes, |
|
169 * and only return owned nodes. |
|
170 * If they see a failure UErrorCode, they will delete the input node. |
|
171 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. |
|
172 * If there is a failure, they return NULL. |
|
173 * |
|
174 * NULL Node pointers can be safely passed into other Nodes because |
|
175 * they call the static Node::hashCode() which checks for a NULL pointer first. |
|
176 * |
|
177 * Therefore, as long as builder functions register a new node, |
|
178 * they need to check for failures only before explicitly dereferencing |
|
179 * a Node pointer, or before setting a new UErrorCode. |
|
180 */ |
|
181 |
|
182 // Hash set of nodes, maps from nodes to integer 1. |
|
183 /** @internal */ |
|
184 UHashtable *nodes; |
|
185 |
|
186 /** @internal */ |
|
187 class Node : public UObject { |
|
188 public: |
|
189 Node(int32_t initialHash) : hash(initialHash), offset(0) {} |
|
190 inline int32_t hashCode() const { return hash; } |
|
191 // Handles node==NULL. |
|
192 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } |
|
193 // Base class operator==() compares the actual class types. |
|
194 virtual UBool operator==(const Node &other) const; |
|
195 inline UBool operator!=(const Node &other) const { return !operator==(other); } |
|
196 /** |
|
197 * Traverses the Node graph and numbers branch edges, with rightmost edges first. |
|
198 * This is to avoid writing a duplicate node twice. |
|
199 * |
|
200 * Branch nodes in this trie data structure are not symmetric. |
|
201 * Most branch edges "jump" to other nodes but the rightmost branch edges |
|
202 * just continue without a jump. |
|
203 * Therefore, write() must write the rightmost branch edge last |
|
204 * (trie units are written backwards), and must write it at that point even if |
|
205 * it is a duplicate of a node previously written elsewhere. |
|
206 * |
|
207 * This function visits and marks right branch edges first. |
|
208 * Edges are numbered with increasingly negative values because we share the |
|
209 * offset field which gets positive values when nodes are written. |
|
210 * A branch edge also remembers the first number for any of its edges. |
|
211 * |
|
212 * When a further-left branch edge has a number in the range of the rightmost |
|
213 * edge's numbers, then it will be written as part of the required right edge |
|
214 * and we can avoid writing it first. |
|
215 * |
|
216 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative |
|
217 * edge numbers. |
|
218 * |
|
219 * @param edgeNumber The first edge number for this node and its sub-nodes. |
|
220 * @return An edge number that is at least the maximum-negative |
|
221 * of the input edge number and the numbers of this node and all of its sub-nodes. |
|
222 */ |
|
223 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
224 // write() must set the offset to a positive value. |
|
225 virtual void write(StringTrieBuilder &builder) = 0; |
|
226 // See markRightEdgesFirst. |
|
227 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, |
|
228 StringTrieBuilder &builder) { |
|
229 // Note: Edge numbers are negative, lastRight<=firstRight. |
|
230 // If offset>0 then this node and its sub-nodes have been written already |
|
231 // and we need not write them again. |
|
232 // If this node is part of the unwritten right branch edge, |
|
233 // then we wait until that is written. |
|
234 if(offset<0 && (offset<lastRight || firstRight<offset)) { |
|
235 write(builder); |
|
236 } |
|
237 } |
|
238 inline int32_t getOffset() const { return offset; } |
|
239 protected: |
|
240 int32_t hash; |
|
241 int32_t offset; |
|
242 }; |
|
243 |
|
244 // This class should not be overridden because |
|
245 // registerFinalValue() compares a stack-allocated FinalValueNode |
|
246 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) |
|
247 // with the input node, and the |
|
248 // !Node::operator==(other) used inside FinalValueNode::operator==(other) |
|
249 // will be false if the typeid's are different. |
|
250 /** @internal */ |
|
251 class FinalValueNode : public Node { |
|
252 public: |
|
253 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} |
|
254 virtual UBool operator==(const Node &other) const; |
|
255 virtual void write(StringTrieBuilder &builder); |
|
256 protected: |
|
257 int32_t value; |
|
258 }; |
|
259 |
|
260 /** |
|
261 * @internal |
|
262 */ |
|
263 class ValueNode : public Node { |
|
264 public: |
|
265 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} |
|
266 virtual UBool operator==(const Node &other) const; |
|
267 void setValue(int32_t v) { |
|
268 hasValue=TRUE; |
|
269 value=v; |
|
270 hash=hash*37+v; |
|
271 } |
|
272 protected: |
|
273 UBool hasValue; |
|
274 int32_t value; |
|
275 }; |
|
276 |
|
277 /** |
|
278 * @internal |
|
279 */ |
|
280 class IntermediateValueNode : public ValueNode { |
|
281 public: |
|
282 IntermediateValueNode(int32_t v, Node *nextNode) |
|
283 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } |
|
284 virtual UBool operator==(const Node &other) const; |
|
285 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
286 virtual void write(StringTrieBuilder &builder); |
|
287 protected: |
|
288 Node *next; |
|
289 }; |
|
290 |
|
291 /** |
|
292 * @internal |
|
293 */ |
|
294 class LinearMatchNode : public ValueNode { |
|
295 public: |
|
296 LinearMatchNode(int32_t len, Node *nextNode) |
|
297 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), |
|
298 length(len), next(nextNode) {} |
|
299 virtual UBool operator==(const Node &other) const; |
|
300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
301 protected: |
|
302 int32_t length; |
|
303 Node *next; |
|
304 }; |
|
305 |
|
306 /** |
|
307 * @internal |
|
308 */ |
|
309 class BranchNode : public Node { |
|
310 public: |
|
311 BranchNode(int32_t initialHash) : Node(initialHash) {} |
|
312 protected: |
|
313 int32_t firstEdgeNumber; |
|
314 }; |
|
315 |
|
316 /** |
|
317 * @internal |
|
318 */ |
|
319 class ListBranchNode : public BranchNode { |
|
320 public: |
|
321 ListBranchNode() : BranchNode(0x444444), length(0) {} |
|
322 virtual UBool operator==(const Node &other) const; |
|
323 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
324 virtual void write(StringTrieBuilder &builder); |
|
325 // Adds a unit with a final value. |
|
326 void add(int32_t c, int32_t value) { |
|
327 units[length]=(UChar)c; |
|
328 equal[length]=NULL; |
|
329 values[length]=value; |
|
330 ++length; |
|
331 hash=(hash*37+c)*37+value; |
|
332 } |
|
333 // Adds a unit which leads to another match node. |
|
334 void add(int32_t c, Node *node) { |
|
335 units[length]=(UChar)c; |
|
336 equal[length]=node; |
|
337 values[length]=0; |
|
338 ++length; |
|
339 hash=(hash*37+c)*37+hashCode(node); |
|
340 } |
|
341 protected: |
|
342 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". |
|
343 int32_t length; |
|
344 int32_t values[kMaxBranchLinearSubNodeLength]; |
|
345 UChar units[kMaxBranchLinearSubNodeLength]; |
|
346 }; |
|
347 |
|
348 /** |
|
349 * @internal |
|
350 */ |
|
351 class SplitBranchNode : public BranchNode { |
|
352 public: |
|
353 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) |
|
354 : BranchNode(((0x555555*37+middleUnit)*37+ |
|
355 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), |
|
356 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} |
|
357 virtual UBool operator==(const Node &other) const; |
|
358 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
359 virtual void write(StringTrieBuilder &builder); |
|
360 protected: |
|
361 UChar unit; |
|
362 Node *lessThan; |
|
363 Node *greaterOrEqual; |
|
364 }; |
|
365 |
|
366 // Branch head node, for writing the actual node lead unit. |
|
367 /** @internal */ |
|
368 class BranchHeadNode : public ValueNode { |
|
369 public: |
|
370 BranchHeadNode(int32_t len, Node *subNode) |
|
371 : ValueNode((0x666666*37+len)*37+hashCode(subNode)), |
|
372 length(len), next(subNode) {} |
|
373 virtual UBool operator==(const Node &other) const; |
|
374 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); |
|
375 virtual void write(StringTrieBuilder &builder); |
|
376 protected: |
|
377 int32_t length; |
|
378 Node *next; // A branch sub-node. |
|
379 }; |
|
380 #endif /* U_HIDE_INTERNAL_API */ |
|
381 |
|
382 /** @internal */ |
|
383 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, |
|
384 Node *nextNode) const = 0; |
|
385 |
|
386 /** @internal */ |
|
387 virtual int32_t write(int32_t unit) = 0; |
|
388 /** @internal */ |
|
389 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; |
|
390 /** @internal */ |
|
391 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; |
|
392 /** @internal */ |
|
393 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; |
|
394 /** @internal */ |
|
395 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; |
|
396 }; |
|
397 |
|
398 U_NAMESPACE_END |
|
399 |
|
400 #endif // __STRINGTRIEBUILDER_H__ |