|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: bytestrie.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010sep25 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 #include "unicode/bytestream.h" |
|
17 #include "unicode/bytestrie.h" |
|
18 #include "unicode/uobject.h" |
|
19 #include "cmemory.h" |
|
20 #include "uassert.h" |
|
21 |
|
22 U_NAMESPACE_BEGIN |
|
23 |
|
24 BytesTrie::~BytesTrie() { |
|
25 uprv_free(ownedArray_); |
|
26 } |
|
27 |
|
28 // lead byte already shifted right by 1. |
|
29 int32_t |
|
30 BytesTrie::readValue(const uint8_t *pos, int32_t leadByte) { |
|
31 int32_t value; |
|
32 if(leadByte<kMinTwoByteValueLead) { |
|
33 value=leadByte-kMinOneByteValueLead; |
|
34 } else if(leadByte<kMinThreeByteValueLead) { |
|
35 value=((leadByte-kMinTwoByteValueLead)<<8)|*pos; |
|
36 } else if(leadByte<kFourByteValueLead) { |
|
37 value=((leadByte-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; |
|
38 } else if(leadByte==kFourByteValueLead) { |
|
39 value=(pos[0]<<16)|(pos[1]<<8)|pos[2]; |
|
40 } else { |
|
41 value=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; |
|
42 } |
|
43 return value; |
|
44 } |
|
45 |
|
46 const uint8_t * |
|
47 BytesTrie::jumpByDelta(const uint8_t *pos) { |
|
48 int32_t delta=*pos++; |
|
49 if(delta<kMinTwoByteDeltaLead) { |
|
50 // nothing to do |
|
51 } else if(delta<kMinThreeByteDeltaLead) { |
|
52 delta=((delta-kMinTwoByteDeltaLead)<<8)|*pos++; |
|
53 } else if(delta<kFourByteDeltaLead) { |
|
54 delta=((delta-kMinThreeByteDeltaLead)<<16)|(pos[0]<<8)|pos[1]; |
|
55 pos+=2; |
|
56 } else if(delta==kFourByteDeltaLead) { |
|
57 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; |
|
58 pos+=3; |
|
59 } else { |
|
60 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; |
|
61 pos+=4; |
|
62 } |
|
63 return pos+delta; |
|
64 } |
|
65 |
|
66 UStringTrieResult |
|
67 BytesTrie::current() const { |
|
68 const uint8_t *pos=pos_; |
|
69 if(pos==NULL) { |
|
70 return USTRINGTRIE_NO_MATCH; |
|
71 } else { |
|
72 int32_t node; |
|
73 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? |
|
74 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
75 } |
|
76 } |
|
77 |
|
78 UStringTrieResult |
|
79 BytesTrie::branchNext(const uint8_t *pos, int32_t length, int32_t inByte) { |
|
80 // Branch according to the current byte. |
|
81 if(length==0) { |
|
82 length=*pos++; |
|
83 } |
|
84 ++length; |
|
85 // The length of the branch is the number of bytes to select from. |
|
86 // The data structure encodes a binary search. |
|
87 while(length>kMaxBranchLinearSubNodeLength) { |
|
88 if(inByte<*pos++) { |
|
89 length>>=1; |
|
90 pos=jumpByDelta(pos); |
|
91 } else { |
|
92 length=length-(length>>1); |
|
93 pos=skipDelta(pos); |
|
94 } |
|
95 } |
|
96 // Drop down to linear search for the last few bytes. |
|
97 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 |
|
98 // and divides length by 2. |
|
99 do { |
|
100 if(inByte==*pos++) { |
|
101 UStringTrieResult result; |
|
102 int32_t node=*pos; |
|
103 U_ASSERT(node>=kMinValueLead); |
|
104 if(node&kValueIsFinal) { |
|
105 // Leave the final value for getValue() to read. |
|
106 result=USTRINGTRIE_FINAL_VALUE; |
|
107 } else { |
|
108 // Use the non-final value as the jump delta. |
|
109 ++pos; |
|
110 // int32_t delta=readValue(pos, node>>1); |
|
111 node>>=1; |
|
112 int32_t delta; |
|
113 if(node<kMinTwoByteValueLead) { |
|
114 delta=node-kMinOneByteValueLead; |
|
115 } else if(node<kMinThreeByteValueLead) { |
|
116 delta=((node-kMinTwoByteValueLead)<<8)|*pos++; |
|
117 } else if(node<kFourByteValueLead) { |
|
118 delta=((node-kMinThreeByteValueLead)<<16)|(pos[0]<<8)|pos[1]; |
|
119 pos+=2; |
|
120 } else if(node==kFourByteValueLead) { |
|
121 delta=(pos[0]<<16)|(pos[1]<<8)|pos[2]; |
|
122 pos+=3; |
|
123 } else { |
|
124 delta=(pos[0]<<24)|(pos[1]<<16)|(pos[2]<<8)|pos[3]; |
|
125 pos+=4; |
|
126 } |
|
127 // end readValue() |
|
128 pos+=delta; |
|
129 node=*pos; |
|
130 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
131 } |
|
132 pos_=pos; |
|
133 return result; |
|
134 } |
|
135 --length; |
|
136 pos=skipValue(pos); |
|
137 } while(length>1); |
|
138 if(inByte==*pos++) { |
|
139 pos_=pos; |
|
140 int32_t node=*pos; |
|
141 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
142 } else { |
|
143 stop(); |
|
144 return USTRINGTRIE_NO_MATCH; |
|
145 } |
|
146 } |
|
147 |
|
148 UStringTrieResult |
|
149 BytesTrie::nextImpl(const uint8_t *pos, int32_t inByte) { |
|
150 for(;;) { |
|
151 int32_t node=*pos++; |
|
152 if(node<kMinLinearMatch) { |
|
153 return branchNext(pos, node, inByte); |
|
154 } else if(node<kMinValueLead) { |
|
155 // Match the first of length+1 bytes. |
|
156 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. |
|
157 if(inByte==*pos++) { |
|
158 remainingMatchLength_=--length; |
|
159 pos_=pos; |
|
160 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
161 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
162 } else { |
|
163 // No match. |
|
164 break; |
|
165 } |
|
166 } else if(node&kValueIsFinal) { |
|
167 // No further matching bytes. |
|
168 break; |
|
169 } else { |
|
170 // Skip intermediate value. |
|
171 pos=skipValue(pos, node); |
|
172 // The next node must not also be a value node. |
|
173 U_ASSERT(*pos<kMinValueLead); |
|
174 } |
|
175 } |
|
176 stop(); |
|
177 return USTRINGTRIE_NO_MATCH; |
|
178 } |
|
179 |
|
180 UStringTrieResult |
|
181 BytesTrie::next(int32_t inByte) { |
|
182 const uint8_t *pos=pos_; |
|
183 if(pos==NULL) { |
|
184 return USTRINGTRIE_NO_MATCH; |
|
185 } |
|
186 if(inByte<0) { |
|
187 inByte+=0x100; |
|
188 } |
|
189 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. |
|
190 if(length>=0) { |
|
191 // Remaining part of a linear-match node. |
|
192 if(inByte==*pos++) { |
|
193 remainingMatchLength_=--length; |
|
194 pos_=pos; |
|
195 int32_t node; |
|
196 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
197 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
198 } else { |
|
199 stop(); |
|
200 return USTRINGTRIE_NO_MATCH; |
|
201 } |
|
202 } |
|
203 return nextImpl(pos, inByte); |
|
204 } |
|
205 |
|
206 UStringTrieResult |
|
207 BytesTrie::next(const char *s, int32_t sLength) { |
|
208 if(sLength<0 ? *s==0 : sLength==0) { |
|
209 // Empty input. |
|
210 return current(); |
|
211 } |
|
212 const uint8_t *pos=pos_; |
|
213 if(pos==NULL) { |
|
214 return USTRINGTRIE_NO_MATCH; |
|
215 } |
|
216 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. |
|
217 for(;;) { |
|
218 // Fetch the next input byte, if there is one. |
|
219 // Continue a linear-match node without rechecking sLength<0. |
|
220 int32_t inByte; |
|
221 if(sLength<0) { |
|
222 for(;;) { |
|
223 if((inByte=*s++)==0) { |
|
224 remainingMatchLength_=length; |
|
225 pos_=pos; |
|
226 int32_t node; |
|
227 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
228 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
229 } |
|
230 if(length<0) { |
|
231 remainingMatchLength_=length; |
|
232 break; |
|
233 } |
|
234 if(inByte!=*pos) { |
|
235 stop(); |
|
236 return USTRINGTRIE_NO_MATCH; |
|
237 } |
|
238 ++pos; |
|
239 --length; |
|
240 } |
|
241 } else { |
|
242 for(;;) { |
|
243 if(sLength==0) { |
|
244 remainingMatchLength_=length; |
|
245 pos_=pos; |
|
246 int32_t node; |
|
247 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
248 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
249 } |
|
250 inByte=*s++; |
|
251 --sLength; |
|
252 if(length<0) { |
|
253 remainingMatchLength_=length; |
|
254 break; |
|
255 } |
|
256 if(inByte!=*pos) { |
|
257 stop(); |
|
258 return USTRINGTRIE_NO_MATCH; |
|
259 } |
|
260 ++pos; |
|
261 --length; |
|
262 } |
|
263 } |
|
264 for(;;) { |
|
265 int32_t node=*pos++; |
|
266 if(node<kMinLinearMatch) { |
|
267 UStringTrieResult result=branchNext(pos, node, inByte); |
|
268 if(result==USTRINGTRIE_NO_MATCH) { |
|
269 return USTRINGTRIE_NO_MATCH; |
|
270 } |
|
271 // Fetch the next input byte, if there is one. |
|
272 if(sLength<0) { |
|
273 if((inByte=*s++)==0) { |
|
274 return result; |
|
275 } |
|
276 } else { |
|
277 if(sLength==0) { |
|
278 return result; |
|
279 } |
|
280 inByte=*s++; |
|
281 --sLength; |
|
282 } |
|
283 if(result==USTRINGTRIE_FINAL_VALUE) { |
|
284 // No further matching bytes. |
|
285 stop(); |
|
286 return USTRINGTRIE_NO_MATCH; |
|
287 } |
|
288 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . |
|
289 } else if(node<kMinValueLead) { |
|
290 // Match length+1 bytes. |
|
291 length=node-kMinLinearMatch; // Actual match length minus 1. |
|
292 if(inByte!=*pos) { |
|
293 stop(); |
|
294 return USTRINGTRIE_NO_MATCH; |
|
295 } |
|
296 ++pos; |
|
297 --length; |
|
298 break; |
|
299 } else if(node&kValueIsFinal) { |
|
300 // No further matching bytes. |
|
301 stop(); |
|
302 return USTRINGTRIE_NO_MATCH; |
|
303 } else { |
|
304 // Skip intermediate value. |
|
305 pos=skipValue(pos, node); |
|
306 // The next node must not also be a value node. |
|
307 U_ASSERT(*pos<kMinValueLead); |
|
308 } |
|
309 } |
|
310 } |
|
311 } |
|
312 |
|
313 const uint8_t * |
|
314 BytesTrie::findUniqueValueFromBranch(const uint8_t *pos, int32_t length, |
|
315 UBool haveUniqueValue, int32_t &uniqueValue) { |
|
316 while(length>kMaxBranchLinearSubNodeLength) { |
|
317 ++pos; // ignore the comparison byte |
|
318 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { |
|
319 return NULL; |
|
320 } |
|
321 length=length-(length>>1); |
|
322 pos=skipDelta(pos); |
|
323 } |
|
324 do { |
|
325 ++pos; // ignore a comparison byte |
|
326 // handle its value |
|
327 int32_t node=*pos++; |
|
328 UBool isFinal=(UBool)(node&kValueIsFinal); |
|
329 int32_t value=readValue(pos, node>>1); |
|
330 pos=skipValue(pos, node); |
|
331 if(isFinal) { |
|
332 if(haveUniqueValue) { |
|
333 if(value!=uniqueValue) { |
|
334 return NULL; |
|
335 } |
|
336 } else { |
|
337 uniqueValue=value; |
|
338 haveUniqueValue=TRUE; |
|
339 } |
|
340 } else { |
|
341 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { |
|
342 return NULL; |
|
343 } |
|
344 haveUniqueValue=TRUE; |
|
345 } |
|
346 } while(--length>1); |
|
347 return pos+1; // ignore the last comparison byte |
|
348 } |
|
349 |
|
350 UBool |
|
351 BytesTrie::findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue) { |
|
352 for(;;) { |
|
353 int32_t node=*pos++; |
|
354 if(node<kMinLinearMatch) { |
|
355 if(node==0) { |
|
356 node=*pos++; |
|
357 } |
|
358 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); |
|
359 if(pos==NULL) { |
|
360 return FALSE; |
|
361 } |
|
362 haveUniqueValue=TRUE; |
|
363 } else if(node<kMinValueLead) { |
|
364 // linear-match node |
|
365 pos+=node-kMinLinearMatch+1; // Ignore the match bytes. |
|
366 } else { |
|
367 UBool isFinal=(UBool)(node&kValueIsFinal); |
|
368 int32_t value=readValue(pos, node>>1); |
|
369 if(haveUniqueValue) { |
|
370 if(value!=uniqueValue) { |
|
371 return FALSE; |
|
372 } |
|
373 } else { |
|
374 uniqueValue=value; |
|
375 haveUniqueValue=TRUE; |
|
376 } |
|
377 if(isFinal) { |
|
378 return TRUE; |
|
379 } |
|
380 pos=skipValue(pos, node); |
|
381 } |
|
382 } |
|
383 } |
|
384 |
|
385 int32_t |
|
386 BytesTrie::getNextBytes(ByteSink &out) const { |
|
387 const uint8_t *pos=pos_; |
|
388 if(pos==NULL) { |
|
389 return 0; |
|
390 } |
|
391 if(remainingMatchLength_>=0) { |
|
392 append(out, *pos); // Next byte of a pending linear-match node. |
|
393 return 1; |
|
394 } |
|
395 int32_t node=*pos++; |
|
396 if(node>=kMinValueLead) { |
|
397 if(node&kValueIsFinal) { |
|
398 return 0; |
|
399 } else { |
|
400 pos=skipValue(pos, node); |
|
401 node=*pos++; |
|
402 U_ASSERT(node<kMinValueLead); |
|
403 } |
|
404 } |
|
405 if(node<kMinLinearMatch) { |
|
406 if(node==0) { |
|
407 node=*pos++; |
|
408 } |
|
409 getNextBranchBytes(pos, ++node, out); |
|
410 return node; |
|
411 } else { |
|
412 // First byte of the linear-match node. |
|
413 append(out, *pos); |
|
414 return 1; |
|
415 } |
|
416 } |
|
417 |
|
418 void |
|
419 BytesTrie::getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out) { |
|
420 while(length>kMaxBranchLinearSubNodeLength) { |
|
421 ++pos; // ignore the comparison byte |
|
422 getNextBranchBytes(jumpByDelta(pos), length>>1, out); |
|
423 length=length-(length>>1); |
|
424 pos=skipDelta(pos); |
|
425 } |
|
426 do { |
|
427 append(out, *pos++); |
|
428 pos=skipValue(pos); |
|
429 } while(--length>1); |
|
430 append(out, *pos); |
|
431 } |
|
432 |
|
433 void |
|
434 BytesTrie::append(ByteSink &out, int c) { |
|
435 char ch=(char)c; |
|
436 out.Append(&ch, 1); |
|
437 } |
|
438 |
|
439 U_NAMESPACE_END |