|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2011, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: ucharstrie.h |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010nov14 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 #include "unicode/appendable.h" |
|
17 #include "unicode/ucharstrie.h" |
|
18 #include "unicode/uobject.h" |
|
19 #include "unicode/utf16.h" |
|
20 #include "cmemory.h" |
|
21 #include "uassert.h" |
|
22 |
|
23 U_NAMESPACE_BEGIN |
|
24 |
|
25 UCharsTrie::~UCharsTrie() { |
|
26 uprv_free(ownedArray_); |
|
27 } |
|
28 |
|
29 UStringTrieResult |
|
30 UCharsTrie::current() const { |
|
31 const UChar *pos=pos_; |
|
32 if(pos==NULL) { |
|
33 return USTRINGTRIE_NO_MATCH; |
|
34 } else { |
|
35 int32_t node; |
|
36 return (remainingMatchLength_<0 && (node=*pos)>=kMinValueLead) ? |
|
37 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
38 } |
|
39 } |
|
40 |
|
41 UStringTrieResult |
|
42 UCharsTrie::firstForCodePoint(UChar32 cp) { |
|
43 return cp<=0xffff ? |
|
44 first(cp) : |
|
45 (USTRINGTRIE_HAS_NEXT(first(U16_LEAD(cp))) ? |
|
46 next(U16_TRAIL(cp)) : |
|
47 USTRINGTRIE_NO_MATCH); |
|
48 } |
|
49 |
|
50 UStringTrieResult |
|
51 UCharsTrie::nextForCodePoint(UChar32 cp) { |
|
52 return cp<=0xffff ? |
|
53 next(cp) : |
|
54 (USTRINGTRIE_HAS_NEXT(next(U16_LEAD(cp))) ? |
|
55 next(U16_TRAIL(cp)) : |
|
56 USTRINGTRIE_NO_MATCH); |
|
57 } |
|
58 |
|
59 UStringTrieResult |
|
60 UCharsTrie::branchNext(const UChar *pos, int32_t length, int32_t uchar) { |
|
61 // Branch according to the current unit. |
|
62 if(length==0) { |
|
63 length=*pos++; |
|
64 } |
|
65 ++length; |
|
66 // The length of the branch is the number of units to select from. |
|
67 // The data structure encodes a binary search. |
|
68 while(length>kMaxBranchLinearSubNodeLength) { |
|
69 if(uchar<*pos++) { |
|
70 length>>=1; |
|
71 pos=jumpByDelta(pos); |
|
72 } else { |
|
73 length=length-(length>>1); |
|
74 pos=skipDelta(pos); |
|
75 } |
|
76 } |
|
77 // Drop down to linear search for the last few units. |
|
78 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 |
|
79 // and divides length by 2. |
|
80 do { |
|
81 if(uchar==*pos++) { |
|
82 UStringTrieResult result; |
|
83 int32_t node=*pos; |
|
84 if(node&kValueIsFinal) { |
|
85 // Leave the final value for getValue() to read. |
|
86 result=USTRINGTRIE_FINAL_VALUE; |
|
87 } else { |
|
88 // Use the non-final value as the jump delta. |
|
89 ++pos; |
|
90 // int32_t delta=readValue(pos, node); |
|
91 int32_t delta; |
|
92 if(node<kMinTwoUnitValueLead) { |
|
93 delta=node; |
|
94 } else if(node<kThreeUnitValueLead) { |
|
95 delta=((node-kMinTwoUnitValueLead)<<16)|*pos++; |
|
96 } else { |
|
97 delta=(pos[0]<<16)|pos[1]; |
|
98 pos+=2; |
|
99 } |
|
100 // end readValue() |
|
101 pos+=delta; |
|
102 node=*pos; |
|
103 result= node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
104 } |
|
105 pos_=pos; |
|
106 return result; |
|
107 } |
|
108 --length; |
|
109 pos=skipValue(pos); |
|
110 } while(length>1); |
|
111 if(uchar==*pos++) { |
|
112 pos_=pos; |
|
113 int32_t node=*pos; |
|
114 return node>=kMinValueLead ? valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
115 } else { |
|
116 stop(); |
|
117 return USTRINGTRIE_NO_MATCH; |
|
118 } |
|
119 } |
|
120 |
|
121 UStringTrieResult |
|
122 UCharsTrie::nextImpl(const UChar *pos, int32_t uchar) { |
|
123 int32_t node=*pos++; |
|
124 for(;;) { |
|
125 if(node<kMinLinearMatch) { |
|
126 return branchNext(pos, node, uchar); |
|
127 } else if(node<kMinValueLead) { |
|
128 // Match the first of length+1 units. |
|
129 int32_t length=node-kMinLinearMatch; // Actual match length minus 1. |
|
130 if(uchar==*pos++) { |
|
131 remainingMatchLength_=--length; |
|
132 pos_=pos; |
|
133 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
134 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
135 } else { |
|
136 // No match. |
|
137 break; |
|
138 } |
|
139 } else if(node&kValueIsFinal) { |
|
140 // No further matching units. |
|
141 break; |
|
142 } else { |
|
143 // Skip intermediate value. |
|
144 pos=skipNodeValue(pos, node); |
|
145 node&=kNodeTypeMask; |
|
146 } |
|
147 } |
|
148 stop(); |
|
149 return USTRINGTRIE_NO_MATCH; |
|
150 } |
|
151 |
|
152 UStringTrieResult |
|
153 UCharsTrie::next(int32_t uchar) { |
|
154 const UChar *pos=pos_; |
|
155 if(pos==NULL) { |
|
156 return USTRINGTRIE_NO_MATCH; |
|
157 } |
|
158 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. |
|
159 if(length>=0) { |
|
160 // Remaining part of a linear-match node. |
|
161 if(uchar==*pos++) { |
|
162 remainingMatchLength_=--length; |
|
163 pos_=pos; |
|
164 int32_t node; |
|
165 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
166 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
167 } else { |
|
168 stop(); |
|
169 return USTRINGTRIE_NO_MATCH; |
|
170 } |
|
171 } |
|
172 return nextImpl(pos, uchar); |
|
173 } |
|
174 |
|
175 UStringTrieResult |
|
176 UCharsTrie::next(const UChar *s, int32_t sLength) { |
|
177 if(sLength<0 ? *s==0 : sLength==0) { |
|
178 // Empty input. |
|
179 return current(); |
|
180 } |
|
181 const UChar *pos=pos_; |
|
182 if(pos==NULL) { |
|
183 return USTRINGTRIE_NO_MATCH; |
|
184 } |
|
185 int32_t length=remainingMatchLength_; // Actual remaining match length minus 1. |
|
186 for(;;) { |
|
187 // Fetch the next input unit, if there is one. |
|
188 // Continue a linear-match node without rechecking sLength<0. |
|
189 int32_t uchar; |
|
190 if(sLength<0) { |
|
191 for(;;) { |
|
192 if((uchar=*s++)==0) { |
|
193 remainingMatchLength_=length; |
|
194 pos_=pos; |
|
195 int32_t node; |
|
196 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
197 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
198 } |
|
199 if(length<0) { |
|
200 remainingMatchLength_=length; |
|
201 break; |
|
202 } |
|
203 if(uchar!=*pos) { |
|
204 stop(); |
|
205 return USTRINGTRIE_NO_MATCH; |
|
206 } |
|
207 ++pos; |
|
208 --length; |
|
209 } |
|
210 } else { |
|
211 for(;;) { |
|
212 if(sLength==0) { |
|
213 remainingMatchLength_=length; |
|
214 pos_=pos; |
|
215 int32_t node; |
|
216 return (length<0 && (node=*pos)>=kMinValueLead) ? |
|
217 valueResult(node) : USTRINGTRIE_NO_VALUE; |
|
218 } |
|
219 uchar=*s++; |
|
220 --sLength; |
|
221 if(length<0) { |
|
222 remainingMatchLength_=length; |
|
223 break; |
|
224 } |
|
225 if(uchar!=*pos) { |
|
226 stop(); |
|
227 return USTRINGTRIE_NO_MATCH; |
|
228 } |
|
229 ++pos; |
|
230 --length; |
|
231 } |
|
232 } |
|
233 int32_t node=*pos++; |
|
234 for(;;) { |
|
235 if(node<kMinLinearMatch) { |
|
236 UStringTrieResult result=branchNext(pos, node, uchar); |
|
237 if(result==USTRINGTRIE_NO_MATCH) { |
|
238 return USTRINGTRIE_NO_MATCH; |
|
239 } |
|
240 // Fetch the next input unit, if there is one. |
|
241 if(sLength<0) { |
|
242 if((uchar=*s++)==0) { |
|
243 return result; |
|
244 } |
|
245 } else { |
|
246 if(sLength==0) { |
|
247 return result; |
|
248 } |
|
249 uchar=*s++; |
|
250 --sLength; |
|
251 } |
|
252 if(result==USTRINGTRIE_FINAL_VALUE) { |
|
253 // No further matching units. |
|
254 stop(); |
|
255 return USTRINGTRIE_NO_MATCH; |
|
256 } |
|
257 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . |
|
258 node=*pos++; |
|
259 } else if(node<kMinValueLead) { |
|
260 // Match length+1 units. |
|
261 length=node-kMinLinearMatch; // Actual match length minus 1. |
|
262 if(uchar!=*pos) { |
|
263 stop(); |
|
264 return USTRINGTRIE_NO_MATCH; |
|
265 } |
|
266 ++pos; |
|
267 --length; |
|
268 break; |
|
269 } else if(node&kValueIsFinal) { |
|
270 // No further matching units. |
|
271 stop(); |
|
272 return USTRINGTRIE_NO_MATCH; |
|
273 } else { |
|
274 // Skip intermediate value. |
|
275 pos=skipNodeValue(pos, node); |
|
276 node&=kNodeTypeMask; |
|
277 } |
|
278 } |
|
279 } |
|
280 } |
|
281 |
|
282 const UChar * |
|
283 UCharsTrie::findUniqueValueFromBranch(const UChar *pos, int32_t length, |
|
284 UBool haveUniqueValue, int32_t &uniqueValue) { |
|
285 while(length>kMaxBranchLinearSubNodeLength) { |
|
286 ++pos; // ignore the comparison unit |
|
287 if(NULL==findUniqueValueFromBranch(jumpByDelta(pos), length>>1, haveUniqueValue, uniqueValue)) { |
|
288 return NULL; |
|
289 } |
|
290 length=length-(length>>1); |
|
291 pos=skipDelta(pos); |
|
292 } |
|
293 do { |
|
294 ++pos; // ignore a comparison unit |
|
295 // handle its value |
|
296 int32_t node=*pos++; |
|
297 UBool isFinal=(UBool)(node>>15); |
|
298 node&=0x7fff; |
|
299 int32_t value=readValue(pos, node); |
|
300 pos=skipValue(pos, node); |
|
301 if(isFinal) { |
|
302 if(haveUniqueValue) { |
|
303 if(value!=uniqueValue) { |
|
304 return NULL; |
|
305 } |
|
306 } else { |
|
307 uniqueValue=value; |
|
308 haveUniqueValue=TRUE; |
|
309 } |
|
310 } else { |
|
311 if(!findUniqueValue(pos+value, haveUniqueValue, uniqueValue)) { |
|
312 return NULL; |
|
313 } |
|
314 haveUniqueValue=TRUE; |
|
315 } |
|
316 } while(--length>1); |
|
317 return pos+1; // ignore the last comparison unit |
|
318 } |
|
319 |
|
320 UBool |
|
321 UCharsTrie::findUniqueValue(const UChar *pos, UBool haveUniqueValue, int32_t &uniqueValue) { |
|
322 int32_t node=*pos++; |
|
323 for(;;) { |
|
324 if(node<kMinLinearMatch) { |
|
325 if(node==0) { |
|
326 node=*pos++; |
|
327 } |
|
328 pos=findUniqueValueFromBranch(pos, node+1, haveUniqueValue, uniqueValue); |
|
329 if(pos==NULL) { |
|
330 return FALSE; |
|
331 } |
|
332 haveUniqueValue=TRUE; |
|
333 node=*pos++; |
|
334 } else if(node<kMinValueLead) { |
|
335 // linear-match node |
|
336 pos+=node-kMinLinearMatch+1; // Ignore the match units. |
|
337 node=*pos++; |
|
338 } else { |
|
339 UBool isFinal=(UBool)(node>>15); |
|
340 int32_t value; |
|
341 if(isFinal) { |
|
342 value=readValue(pos, node&0x7fff); |
|
343 } else { |
|
344 value=readNodeValue(pos, node); |
|
345 } |
|
346 if(haveUniqueValue) { |
|
347 if(value!=uniqueValue) { |
|
348 return FALSE; |
|
349 } |
|
350 } else { |
|
351 uniqueValue=value; |
|
352 haveUniqueValue=TRUE; |
|
353 } |
|
354 if(isFinal) { |
|
355 return TRUE; |
|
356 } |
|
357 pos=skipNodeValue(pos, node); |
|
358 node&=kNodeTypeMask; |
|
359 } |
|
360 } |
|
361 } |
|
362 |
|
363 int32_t |
|
364 UCharsTrie::getNextUChars(Appendable &out) const { |
|
365 const UChar *pos=pos_; |
|
366 if(pos==NULL) { |
|
367 return 0; |
|
368 } |
|
369 if(remainingMatchLength_>=0) { |
|
370 out.appendCodeUnit(*pos); // Next unit of a pending linear-match node. |
|
371 return 1; |
|
372 } |
|
373 int32_t node=*pos++; |
|
374 if(node>=kMinValueLead) { |
|
375 if(node&kValueIsFinal) { |
|
376 return 0; |
|
377 } else { |
|
378 pos=skipNodeValue(pos, node); |
|
379 node&=kNodeTypeMask; |
|
380 } |
|
381 } |
|
382 if(node<kMinLinearMatch) { |
|
383 if(node==0) { |
|
384 node=*pos++; |
|
385 } |
|
386 out.reserveAppendCapacity(++node); |
|
387 getNextBranchUChars(pos, node, out); |
|
388 return node; |
|
389 } else { |
|
390 // First unit of the linear-match node. |
|
391 out.appendCodeUnit(*pos); |
|
392 return 1; |
|
393 } |
|
394 } |
|
395 |
|
396 void |
|
397 UCharsTrie::getNextBranchUChars(const UChar *pos, int32_t length, Appendable &out) { |
|
398 while(length>kMaxBranchLinearSubNodeLength) { |
|
399 ++pos; // ignore the comparison unit |
|
400 getNextBranchUChars(jumpByDelta(pos), length>>1, out); |
|
401 length=length-(length>>1); |
|
402 pos=skipDelta(pos); |
|
403 } |
|
404 do { |
|
405 out.appendCodeUnit(*pos++); |
|
406 pos=skipValue(pos); |
|
407 } while(--length>1); |
|
408 out.appendCodeUnit(*pos); |
|
409 } |
|
410 |
|
411 U_NAMESPACE_END |