|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: bytestrie.h |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010sep25 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #ifndef __BYTESTRIE_H__ |
|
16 #define __BYTESTRIE_H__ |
|
17 |
|
18 /** |
|
19 * \file |
|
20 * \brief C++ API: Trie for mapping byte sequences to integer values. |
|
21 */ |
|
22 |
|
23 #include "unicode/utypes.h" |
|
24 #include "unicode/stringpiece.h" |
|
25 #include "unicode/uobject.h" |
|
26 #include "unicode/ustringtrie.h" |
|
27 |
|
28 U_NAMESPACE_BEGIN |
|
29 |
|
30 class ByteSink; |
|
31 class BytesTrieBuilder; |
|
32 class CharString; |
|
33 class UVector32; |
|
34 |
|
35 /** |
|
36 * Light-weight, non-const reader class for a BytesTrie. |
|
37 * Traverses a byte-serialized data structure with minimal state, |
|
38 * for mapping byte sequences to non-negative integer values. |
|
39 * |
|
40 * This class owns the serialized trie data only if it was constructed by |
|
41 * the builder's build() method. |
|
42 * The public constructor and the copy constructor only alias the data (only copy the pointer). |
|
43 * There is no assignment operator. |
|
44 * |
|
45 * This class is not intended for public subclassing. |
|
46 * @stable ICU 4.8 |
|
47 */ |
|
48 class U_COMMON_API BytesTrie : public UMemory { |
|
49 public: |
|
50 /** |
|
51 * Constructs a BytesTrie reader instance. |
|
52 * |
|
53 * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, |
|
54 * starting with the first byte of that sequence. |
|
55 * The BytesTrie object will not read more bytes than |
|
56 * the BytesTrieBuilder generated in the corresponding build() call. |
|
57 * |
|
58 * The array is not copied/cloned and must not be modified while |
|
59 * the BytesTrie object is in use. |
|
60 * |
|
61 * @param trieBytes The byte array that contains the serialized trie. |
|
62 * @stable ICU 4.8 |
|
63 */ |
|
64 BytesTrie(const void *trieBytes) |
|
65 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)), |
|
66 pos_(bytes_), remainingMatchLength_(-1) {} |
|
67 |
|
68 /** |
|
69 * Destructor. |
|
70 * @stable ICU 4.8 |
|
71 */ |
|
72 ~BytesTrie(); |
|
73 |
|
74 /** |
|
75 * Copy constructor, copies the other trie reader object and its state, |
|
76 * but not the byte array which will be shared. (Shallow copy.) |
|
77 * @param other Another BytesTrie object. |
|
78 * @stable ICU 4.8 |
|
79 */ |
|
80 BytesTrie(const BytesTrie &other) |
|
81 : ownedArray_(NULL), bytes_(other.bytes_), |
|
82 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} |
|
83 |
|
84 /** |
|
85 * Resets this trie to its initial state. |
|
86 * @return *this |
|
87 * @stable ICU 4.8 |
|
88 */ |
|
89 BytesTrie &reset() { |
|
90 pos_=bytes_; |
|
91 remainingMatchLength_=-1; |
|
92 return *this; |
|
93 } |
|
94 |
|
95 /** |
|
96 * BytesTrie state object, for saving a trie's current state |
|
97 * and resetting the trie back to this state later. |
|
98 * @stable ICU 4.8 |
|
99 */ |
|
100 class State : public UMemory { |
|
101 public: |
|
102 /** |
|
103 * Constructs an empty State. |
|
104 * @stable ICU 4.8 |
|
105 */ |
|
106 State() { bytes=NULL; } |
|
107 private: |
|
108 friend class BytesTrie; |
|
109 |
|
110 const uint8_t *bytes; |
|
111 const uint8_t *pos; |
|
112 int32_t remainingMatchLength; |
|
113 }; |
|
114 |
|
115 /** |
|
116 * Saves the state of this trie. |
|
117 * @param state The State object to hold the trie's state. |
|
118 * @return *this |
|
119 * @see resetToState |
|
120 * @stable ICU 4.8 |
|
121 */ |
|
122 const BytesTrie &saveState(State &state) const { |
|
123 state.bytes=bytes_; |
|
124 state.pos=pos_; |
|
125 state.remainingMatchLength=remainingMatchLength_; |
|
126 return *this; |
|
127 } |
|
128 |
|
129 /** |
|
130 * Resets this trie to the saved state. |
|
131 * If the state object contains no state, or the state of a different trie, |
|
132 * then this trie remains unchanged. |
|
133 * @param state The State object which holds a saved trie state. |
|
134 * @return *this |
|
135 * @see saveState |
|
136 * @see reset |
|
137 * @stable ICU 4.8 |
|
138 */ |
|
139 BytesTrie &resetToState(const State &state) { |
|
140 if(bytes_==state.bytes && bytes_!=NULL) { |
|
141 pos_=state.pos; |
|
142 remainingMatchLength_=state.remainingMatchLength; |
|
143 } |
|
144 return *this; |
|
145 } |
|
146 |
|
147 /** |
|
148 * Determines whether the byte sequence so far matches, whether it has a value, |
|
149 * and whether another input byte can continue a matching byte sequence. |
|
150 * @return The match/value Result. |
|
151 * @stable ICU 4.8 |
|
152 */ |
|
153 UStringTrieResult current() const; |
|
154 |
|
155 /** |
|
156 * Traverses the trie from the initial state for this input byte. |
|
157 * Equivalent to reset().next(inByte). |
|
158 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. |
|
159 * Values below -0x100 and above 0xff will never match. |
|
160 * @return The match/value Result. |
|
161 * @stable ICU 4.8 |
|
162 */ |
|
163 inline UStringTrieResult first(int32_t inByte) { |
|
164 remainingMatchLength_=-1; |
|
165 if(inByte<0) { |
|
166 inByte+=0x100; |
|
167 } |
|
168 return nextImpl(bytes_, inByte); |
|
169 } |
|
170 |
|
171 /** |
|
172 * Traverses the trie from the current state for this input byte. |
|
173 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. |
|
174 * Values below -0x100 and above 0xff will never match. |
|
175 * @return The match/value Result. |
|
176 * @stable ICU 4.8 |
|
177 */ |
|
178 UStringTrieResult next(int32_t inByte); |
|
179 |
|
180 /** |
|
181 * Traverses the trie from the current state for this byte sequence. |
|
182 * Equivalent to |
|
183 * \code |
|
184 * Result result=current(); |
|
185 * for(each c in s) |
|
186 * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; |
|
187 * result=next(c); |
|
188 * return result; |
|
189 * \endcode |
|
190 * @param s A string or byte sequence. Can be NULL if length is 0. |
|
191 * @param length The length of the byte sequence. Can be -1 if NUL-terminated. |
|
192 * @return The match/value Result. |
|
193 * @stable ICU 4.8 |
|
194 */ |
|
195 UStringTrieResult next(const char *s, int32_t length); |
|
196 |
|
197 /** |
|
198 * Returns a matching byte sequence's value if called immediately after |
|
199 * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. |
|
200 * getValue() can be called multiple times. |
|
201 * |
|
202 * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! |
|
203 * @return The value for the byte sequence so far. |
|
204 * @stable ICU 4.8 |
|
205 */ |
|
206 inline int32_t getValue() const { |
|
207 const uint8_t *pos=pos_; |
|
208 int32_t leadByte=*pos++; |
|
209 // U_ASSERT(leadByte>=kMinValueLead); |
|
210 return readValue(pos, leadByte>>1); |
|
211 } |
|
212 |
|
213 /** |
|
214 * Determines whether all byte sequences reachable from the current state |
|
215 * map to the same value. |
|
216 * @param uniqueValue Receives the unique value, if this function returns TRUE. |
|
217 * (output-only) |
|
218 * @return TRUE if all byte sequences reachable from the current state |
|
219 * map to the same value. |
|
220 * @stable ICU 4.8 |
|
221 */ |
|
222 inline UBool hasUniqueValue(int32_t &uniqueValue) const { |
|
223 const uint8_t *pos=pos_; |
|
224 // Skip the rest of a pending linear-match node. |
|
225 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); |
|
226 } |
|
227 |
|
228 /** |
|
229 * Finds each byte which continues the byte sequence from the current state. |
|
230 * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. |
|
231 * @param out Each next byte is appended to this object. |
|
232 * (Only uses the out.Append(s, length) method.) |
|
233 * @return the number of bytes which continue the byte sequence from here |
|
234 * @stable ICU 4.8 |
|
235 */ |
|
236 int32_t getNextBytes(ByteSink &out) const; |
|
237 |
|
238 /** |
|
239 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. |
|
240 * @stable ICU 4.8 |
|
241 */ |
|
242 class U_COMMON_API Iterator : public UMemory { |
|
243 public: |
|
244 /** |
|
245 * Iterates from the root of a byte-serialized BytesTrie. |
|
246 * @param trieBytes The trie bytes. |
|
247 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. |
|
248 * Otherwise, the iterator returns strings with this maximum length. |
|
249 * @param errorCode Standard ICU error code. Its input value must |
|
250 * pass the U_SUCCESS() test, or else the function returns |
|
251 * immediately. Check for U_FAILURE() on output or use with |
|
252 * function chaining. (See User Guide for details.) |
|
253 * @stable ICU 4.8 |
|
254 */ |
|
255 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); |
|
256 |
|
257 /** |
|
258 * Iterates from the current state of the specified BytesTrie. |
|
259 * @param trie The trie whose state will be copied for iteration. |
|
260 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. |
|
261 * Otherwise, the iterator returns strings with this maximum length. |
|
262 * @param errorCode Standard ICU error code. Its input value must |
|
263 * pass the U_SUCCESS() test, or else the function returns |
|
264 * immediately. Check for U_FAILURE() on output or use with |
|
265 * function chaining. (See User Guide for details.) |
|
266 * @stable ICU 4.8 |
|
267 */ |
|
268 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); |
|
269 |
|
270 /** |
|
271 * Destructor. |
|
272 * @stable ICU 4.8 |
|
273 */ |
|
274 ~Iterator(); |
|
275 |
|
276 /** |
|
277 * Resets this iterator to its initial state. |
|
278 * @return *this |
|
279 * @stable ICU 4.8 |
|
280 */ |
|
281 Iterator &reset(); |
|
282 |
|
283 /** |
|
284 * @return TRUE if there are more elements. |
|
285 * @stable ICU 4.8 |
|
286 */ |
|
287 UBool hasNext() const; |
|
288 |
|
289 /** |
|
290 * Finds the next (byte sequence, value) pair if there is one. |
|
291 * |
|
292 * If the byte sequence is truncated to the maximum length and does not |
|
293 * have a real value, then the value is set to -1. |
|
294 * In this case, this "not a real value" is indistinguishable from |
|
295 * a real value of -1. |
|
296 * @param errorCode Standard ICU error code. Its input value must |
|
297 * pass the U_SUCCESS() test, or else the function returns |
|
298 * immediately. Check for U_FAILURE() on output or use with |
|
299 * function chaining. (See User Guide for details.) |
|
300 * @return TRUE if there is another element. |
|
301 * @stable ICU 4.8 |
|
302 */ |
|
303 UBool next(UErrorCode &errorCode); |
|
304 |
|
305 /** |
|
306 * @return The NUL-terminated byte sequence for the last successful next(). |
|
307 * @stable ICU 4.8 |
|
308 */ |
|
309 const StringPiece &getString() const { return sp_; } |
|
310 /** |
|
311 * @return The value for the last successful next(). |
|
312 * @stable ICU 4.8 |
|
313 */ |
|
314 int32_t getValue() const { return value_; } |
|
315 |
|
316 private: |
|
317 UBool truncateAndStop(); |
|
318 |
|
319 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); |
|
320 |
|
321 const uint8_t *bytes_; |
|
322 const uint8_t *pos_; |
|
323 const uint8_t *initialPos_; |
|
324 int32_t remainingMatchLength_; |
|
325 int32_t initialRemainingMatchLength_; |
|
326 |
|
327 CharString *str_; |
|
328 StringPiece sp_; |
|
329 int32_t maxLength_; |
|
330 int32_t value_; |
|
331 |
|
332 // The stack stores pairs of integers for backtracking to another |
|
333 // outbound edge of a branch node. |
|
334 // The first integer is an offset from bytes_. |
|
335 // The second integer has the str_->length() from before the node in bits 15..0, |
|
336 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) |
|
337 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, |
|
338 // but the code looks more confusing that way.) |
|
339 UVector32 *stack_; |
|
340 }; |
|
341 |
|
342 private: |
|
343 friend class BytesTrieBuilder; |
|
344 |
|
345 /** |
|
346 * Constructs a BytesTrie reader instance. |
|
347 * Unlike the public constructor which just aliases an array, |
|
348 * this constructor adopts the builder's array. |
|
349 * This constructor is only called by the builder. |
|
350 */ |
|
351 BytesTrie(void *adoptBytes, const void *trieBytes) |
|
352 : ownedArray_(static_cast<uint8_t *>(adoptBytes)), |
|
353 bytes_(static_cast<const uint8_t *>(trieBytes)), |
|
354 pos_(bytes_), remainingMatchLength_(-1) {} |
|
355 |
|
356 // No assignment operator. |
|
357 BytesTrie &operator=(const BytesTrie &other); |
|
358 |
|
359 inline void stop() { |
|
360 pos_=NULL; |
|
361 } |
|
362 |
|
363 // Reads a compact 32-bit integer. |
|
364 // pos is already after the leadByte, and the lead byte is already shifted right by 1. |
|
365 static int32_t readValue(const uint8_t *pos, int32_t leadByte); |
|
366 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { |
|
367 // U_ASSERT(leadByte>=kMinValueLead); |
|
368 if(leadByte>=(kMinTwoByteValueLead<<1)) { |
|
369 if(leadByte<(kMinThreeByteValueLead<<1)) { |
|
370 ++pos; |
|
371 } else if(leadByte<(kFourByteValueLead<<1)) { |
|
372 pos+=2; |
|
373 } else { |
|
374 pos+=3+((leadByte>>1)&1); |
|
375 } |
|
376 } |
|
377 return pos; |
|
378 } |
|
379 static inline const uint8_t *skipValue(const uint8_t *pos) { |
|
380 int32_t leadByte=*pos++; |
|
381 return skipValue(pos, leadByte); |
|
382 } |
|
383 |
|
384 // Reads a jump delta and jumps. |
|
385 static const uint8_t *jumpByDelta(const uint8_t *pos); |
|
386 |
|
387 static inline const uint8_t *skipDelta(const uint8_t *pos) { |
|
388 int32_t delta=*pos++; |
|
389 if(delta>=kMinTwoByteDeltaLead) { |
|
390 if(delta<kMinThreeByteDeltaLead) { |
|
391 ++pos; |
|
392 } else if(delta<kFourByteDeltaLead) { |
|
393 pos+=2; |
|
394 } else { |
|
395 pos+=3+(delta&1); |
|
396 } |
|
397 } |
|
398 return pos; |
|
399 } |
|
400 |
|
401 static inline UStringTrieResult valueResult(int32_t node) { |
|
402 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); |
|
403 } |
|
404 |
|
405 // Handles a branch node for both next(byte) and next(string). |
|
406 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); |
|
407 |
|
408 // Requires remainingLength_<0. |
|
409 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); |
|
410 |
|
411 // Helper functions for hasUniqueValue(). |
|
412 // Recursively finds a unique value (or whether there is not a unique one) |
|
413 // from a branch. |
|
414 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, |
|
415 UBool haveUniqueValue, int32_t &uniqueValue); |
|
416 // Recursively finds a unique value (or whether there is not a unique one) |
|
417 // starting from a position on a node lead byte. |
|
418 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); |
|
419 |
|
420 // Helper functions for getNextBytes(). |
|
421 // getNextBytes() when pos is on a branch node. |
|
422 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); |
|
423 static void append(ByteSink &out, int c); |
|
424 |
|
425 // BytesTrie data structure |
|
426 // |
|
427 // The trie consists of a series of byte-serialized nodes for incremental |
|
428 // string/byte sequence matching. The root node is at the beginning of the trie data. |
|
429 // |
|
430 // Types of nodes are distinguished by their node lead byte ranges. |
|
431 // After each node, except a final-value node, another node follows to |
|
432 // encode match values or continue matching further bytes. |
|
433 // |
|
434 // Node types: |
|
435 // - Value node: Stores a 32-bit integer in a compact, variable-length format. |
|
436 // The value is for the string/byte sequence so far. |
|
437 // One node bit indicates whether the value is final or whether |
|
438 // matching continues with the next node. |
|
439 // - Linear-match node: Matches a number of bytes. |
|
440 // - Branch node: Branches to other nodes according to the current input byte. |
|
441 // The node byte is the length of the branch (number of bytes to select from) |
|
442 // minus 1. It is followed by a sub-node: |
|
443 // - If the length is at most kMaxBranchLinearSubNodeLength, then |
|
444 // there are length-1 (key, value) pairs and then one more comparison byte. |
|
445 // If one of the key bytes matches, then the value is either a final value for |
|
446 // the string/byte sequence so far, or a "jump" delta to the next node. |
|
447 // If the last byte matches, then matching continues with the next node. |
|
448 // (Values have the same encoding as value nodes.) |
|
449 // - If the length is greater than kMaxBranchLinearSubNodeLength, then |
|
450 // there is one byte and one "jump" delta. |
|
451 // If the input byte is less than the sub-node byte, then "jump" by delta to |
|
452 // the next sub-node which will have a length of length/2. |
|
453 // (The delta has its own compact encoding.) |
|
454 // Otherwise, skip the "jump" delta to the next sub-node |
|
455 // which will have a length of length-length/2. |
|
456 |
|
457 // Node lead byte values. |
|
458 |
|
459 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise |
|
460 // the length is one more than the next byte. |
|
461 |
|
462 // For a branch sub-node with at most this many entries, we drop down |
|
463 // to a linear search. |
|
464 static const int32_t kMaxBranchLinearSubNodeLength=5; |
|
465 |
|
466 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. |
|
467 static const int32_t kMinLinearMatch=0x10; |
|
468 static const int32_t kMaxLinearMatchLength=0x10; |
|
469 |
|
470 // 20..ff: Variable-length value node. |
|
471 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) |
|
472 // Then shift-right by 1 bit. |
|
473 // The remaining lead byte value indicates the number of following bytes (0..4) |
|
474 // and contains the value's top bits. |
|
475 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 |
|
476 // It is a final value if bit 0 is set. |
|
477 static const int32_t kValueIsFinal=1; |
|
478 |
|
479 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. |
|
480 static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 |
|
481 static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. |
|
482 |
|
483 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 |
|
484 static const int32_t kMaxTwoByteValue=0x1aff; |
|
485 |
|
486 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c |
|
487 static const int32_t kFourByteValueLead=0x7e; |
|
488 |
|
489 // A little more than Unicode code points. (0x11ffff) |
|
490 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; |
|
491 |
|
492 static const int32_t kFiveByteValueLead=0x7f; |
|
493 |
|
494 // Compact delta integers. |
|
495 static const int32_t kMaxOneByteDelta=0xbf; |
|
496 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 |
|
497 static const int32_t kMinThreeByteDeltaLead=0xf0; |
|
498 static const int32_t kFourByteDeltaLead=0xfe; |
|
499 static const int32_t kFiveByteDeltaLead=0xff; |
|
500 |
|
501 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff |
|
502 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff |
|
503 |
|
504 uint8_t *ownedArray_; |
|
505 |
|
506 // Fixed value referencing the BytesTrie bytes. |
|
507 const uint8_t *bytes_; |
|
508 |
|
509 // Iterator variables. |
|
510 |
|
511 // Pointer to next trie byte to read. NULL if no more matches. |
|
512 const uint8_t *pos_; |
|
513 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. |
|
514 int32_t remainingMatchLength_; |
|
515 }; |
|
516 |
|
517 U_NAMESPACE_END |
|
518 |
|
519 #endif // __BYTESTRIE_H__ |