|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: ucharstriebuilder.h |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010nov14 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 #include "unicode/ucharstrie.h" |
|
17 #include "unicode/ucharstriebuilder.h" |
|
18 #include "unicode/unistr.h" |
|
19 #include "unicode/ustring.h" |
|
20 #include "cmemory.h" |
|
21 #include "uarrsort.h" |
|
22 #include "uassert.h" |
|
23 #include "uhash.h" |
|
24 #include "ustr_imp.h" |
|
25 |
|
26 U_NAMESPACE_BEGIN |
|
27 |
|
28 /* |
|
29 * Note: This builder implementation stores (string, value) pairs with full copies |
|
30 * of the 16-bit-unit sequences, until the UCharsTrie is built. |
|
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. |
|
32 */ |
|
33 |
|
34 class UCharsTrieElement : public UMemory { |
|
35 public: |
|
36 // Use compiler's default constructor, initializes nothing. |
|
37 |
|
38 void setTo(const UnicodeString &s, int32_t val, UnicodeString &strings, UErrorCode &errorCode); |
|
39 |
|
40 UnicodeString getString(const UnicodeString &strings) const { |
|
41 int32_t length=strings[stringOffset]; |
|
42 return strings.tempSubString(stringOffset+1, length); |
|
43 } |
|
44 int32_t getStringLength(const UnicodeString &strings) const { |
|
45 return strings[stringOffset]; |
|
46 } |
|
47 |
|
48 UChar charAt(int32_t index, const UnicodeString &strings) const { |
|
49 return strings[stringOffset+1+index]; |
|
50 } |
|
51 |
|
52 int32_t getValue() const { return value; } |
|
53 |
|
54 int32_t compareStringTo(const UCharsTrieElement &o, const UnicodeString &strings) const; |
|
55 |
|
56 private: |
|
57 // The first strings unit contains the string length. |
|
58 // (Compared with a stringLength field here, this saves 2 bytes per string.) |
|
59 int32_t stringOffset; |
|
60 int32_t value; |
|
61 }; |
|
62 |
|
63 void |
|
64 UCharsTrieElement::setTo(const UnicodeString &s, int32_t val, |
|
65 UnicodeString &strings, UErrorCode &errorCode) { |
|
66 if(U_FAILURE(errorCode)) { |
|
67 return; |
|
68 } |
|
69 int32_t length=s.length(); |
|
70 if(length>0xffff) { |
|
71 // Too long: We store the length in 1 unit. |
|
72 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
73 return; |
|
74 } |
|
75 stringOffset=strings.length(); |
|
76 strings.append((UChar)length); |
|
77 value=val; |
|
78 strings.append(s); |
|
79 } |
|
80 |
|
81 int32_t |
|
82 UCharsTrieElement::compareStringTo(const UCharsTrieElement &other, const UnicodeString &strings) const { |
|
83 return getString(strings).compare(other.getString(strings)); |
|
84 } |
|
85 |
|
86 UCharsTrieBuilder::UCharsTrieBuilder(UErrorCode & /*errorCode*/) |
|
87 : elements(NULL), elementsCapacity(0), elementsLength(0), |
|
88 uchars(NULL), ucharsCapacity(0), ucharsLength(0) {} |
|
89 |
|
90 UCharsTrieBuilder::~UCharsTrieBuilder() { |
|
91 delete[] elements; |
|
92 uprv_free(uchars); |
|
93 } |
|
94 |
|
95 UCharsTrieBuilder & |
|
96 UCharsTrieBuilder::add(const UnicodeString &s, int32_t value, UErrorCode &errorCode) { |
|
97 if(U_FAILURE(errorCode)) { |
|
98 return *this; |
|
99 } |
|
100 if(ucharsLength>0) { |
|
101 // Cannot add elements after building. |
|
102 errorCode=U_NO_WRITE_PERMISSION; |
|
103 return *this; |
|
104 } |
|
105 if(elementsLength==elementsCapacity) { |
|
106 int32_t newCapacity; |
|
107 if(elementsCapacity==0) { |
|
108 newCapacity=1024; |
|
109 } else { |
|
110 newCapacity=4*elementsCapacity; |
|
111 } |
|
112 UCharsTrieElement *newElements=new UCharsTrieElement[newCapacity]; |
|
113 if(newElements==NULL) { |
|
114 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
115 return *this; |
|
116 } |
|
117 if(elementsLength>0) { |
|
118 uprv_memcpy(newElements, elements, elementsLength*sizeof(UCharsTrieElement)); |
|
119 } |
|
120 delete[] elements; |
|
121 elements=newElements; |
|
122 elementsCapacity=newCapacity; |
|
123 } |
|
124 elements[elementsLength++].setTo(s, value, strings, errorCode); |
|
125 if(U_SUCCESS(errorCode) && strings.isBogus()) { |
|
126 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
127 } |
|
128 return *this; |
|
129 } |
|
130 |
|
131 U_CDECL_BEGIN |
|
132 |
|
133 static int32_t U_CALLCONV |
|
134 compareElementStrings(const void *context, const void *left, const void *right) { |
|
135 const UnicodeString *strings=static_cast<const UnicodeString *>(context); |
|
136 const UCharsTrieElement *leftElement=static_cast<const UCharsTrieElement *>(left); |
|
137 const UCharsTrieElement *rightElement=static_cast<const UCharsTrieElement *>(right); |
|
138 return leftElement->compareStringTo(*rightElement, *strings); |
|
139 } |
|
140 |
|
141 U_CDECL_END |
|
142 |
|
143 UCharsTrie * |
|
144 UCharsTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
|
145 buildUChars(buildOption, errorCode); |
|
146 UCharsTrie *newTrie=NULL; |
|
147 if(U_SUCCESS(errorCode)) { |
|
148 newTrie=new UCharsTrie(uchars, uchars+(ucharsCapacity-ucharsLength)); |
|
149 if(newTrie==NULL) { |
|
150 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
151 } else { |
|
152 uchars=NULL; // The new trie now owns the array. |
|
153 ucharsCapacity=0; |
|
154 } |
|
155 } |
|
156 return newTrie; |
|
157 } |
|
158 |
|
159 UnicodeString & |
|
160 UCharsTrieBuilder::buildUnicodeString(UStringTrieBuildOption buildOption, UnicodeString &result, |
|
161 UErrorCode &errorCode) { |
|
162 buildUChars(buildOption, errorCode); |
|
163 if(U_SUCCESS(errorCode)) { |
|
164 result.setTo(FALSE, uchars+(ucharsCapacity-ucharsLength), ucharsLength); |
|
165 } |
|
166 return result; |
|
167 } |
|
168 |
|
169 void |
|
170 UCharsTrieBuilder::buildUChars(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
|
171 if(U_FAILURE(errorCode)) { |
|
172 return; |
|
173 } |
|
174 if(uchars!=NULL && ucharsLength>0) { |
|
175 // Already built. |
|
176 return; |
|
177 } |
|
178 if(ucharsLength==0) { |
|
179 if(elementsLength==0) { |
|
180 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
181 return; |
|
182 } |
|
183 if(strings.isBogus()) { |
|
184 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
185 return; |
|
186 } |
|
187 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(UCharsTrieElement), |
|
188 compareElementStrings, &strings, |
|
189 FALSE, // need not be a stable sort |
|
190 &errorCode); |
|
191 if(U_FAILURE(errorCode)) { |
|
192 return; |
|
193 } |
|
194 // Duplicate strings are not allowed. |
|
195 UnicodeString prev=elements[0].getString(strings); |
|
196 for(int32_t i=1; i<elementsLength; ++i) { |
|
197 UnicodeString current=elements[i].getString(strings); |
|
198 if(prev==current) { |
|
199 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
200 return; |
|
201 } |
|
202 prev.fastCopyFrom(current); |
|
203 } |
|
204 } |
|
205 // Create and UChar-serialize the trie for the elements. |
|
206 ucharsLength=0; |
|
207 int32_t capacity=strings.length(); |
|
208 if(capacity<1024) { |
|
209 capacity=1024; |
|
210 } |
|
211 if(ucharsCapacity<capacity) { |
|
212 uprv_free(uchars); |
|
213 uchars=static_cast<UChar *>(uprv_malloc(capacity*2)); |
|
214 if(uchars==NULL) { |
|
215 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
216 ucharsCapacity=0; |
|
217 return; |
|
218 } |
|
219 ucharsCapacity=capacity; |
|
220 } |
|
221 StringTrieBuilder::build(buildOption, elementsLength, errorCode); |
|
222 if(uchars==NULL) { |
|
223 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
224 } |
|
225 } |
|
226 |
|
227 int32_t |
|
228 UCharsTrieBuilder::getElementStringLength(int32_t i) const { |
|
229 return elements[i].getStringLength(strings); |
|
230 } |
|
231 |
|
232 UChar |
|
233 UCharsTrieBuilder::getElementUnit(int32_t i, int32_t unitIndex) const { |
|
234 return elements[i].charAt(unitIndex, strings); |
|
235 } |
|
236 |
|
237 int32_t |
|
238 UCharsTrieBuilder::getElementValue(int32_t i) const { |
|
239 return elements[i].getValue(); |
|
240 } |
|
241 |
|
242 int32_t |
|
243 UCharsTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const { |
|
244 const UCharsTrieElement &firstElement=elements[first]; |
|
245 const UCharsTrieElement &lastElement=elements[last]; |
|
246 int32_t minStringLength=firstElement.getStringLength(strings); |
|
247 while(++unitIndex<minStringLength && |
|
248 firstElement.charAt(unitIndex, strings)== |
|
249 lastElement.charAt(unitIndex, strings)) {} |
|
250 return unitIndex; |
|
251 } |
|
252 |
|
253 int32_t |
|
254 UCharsTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const { |
|
255 int32_t length=0; // Number of different units at unitIndex. |
|
256 int32_t i=start; |
|
257 do { |
|
258 UChar unit=elements[i++].charAt(unitIndex, strings); |
|
259 while(i<limit && unit==elements[i].charAt(unitIndex, strings)) { |
|
260 ++i; |
|
261 } |
|
262 ++length; |
|
263 } while(i<limit); |
|
264 return length; |
|
265 } |
|
266 |
|
267 int32_t |
|
268 UCharsTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const { |
|
269 do { |
|
270 UChar unit=elements[i++].charAt(unitIndex, strings); |
|
271 while(unit==elements[i].charAt(unitIndex, strings)) { |
|
272 ++i; |
|
273 } |
|
274 } while(--count>0); |
|
275 return i; |
|
276 } |
|
277 |
|
278 int32_t |
|
279 UCharsTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const { |
|
280 while(unit==elements[i].charAt(unitIndex, strings)) { |
|
281 ++i; |
|
282 } |
|
283 return i; |
|
284 } |
|
285 |
|
286 UCharsTrieBuilder::UCTLinearMatchNode::UCTLinearMatchNode(const UChar *units, int32_t len, Node *nextNode) |
|
287 : LinearMatchNode(len, nextNode), s(units) { |
|
288 hash=hash*37+ustr_hashUCharsN(units, len); |
|
289 } |
|
290 |
|
291 UBool |
|
292 UCharsTrieBuilder::UCTLinearMatchNode::operator==(const Node &other) const { |
|
293 if(this==&other) { |
|
294 return TRUE; |
|
295 } |
|
296 if(!LinearMatchNode::operator==(other)) { |
|
297 return FALSE; |
|
298 } |
|
299 const UCTLinearMatchNode &o=(const UCTLinearMatchNode &)other; |
|
300 return 0==u_memcmp(s, o.s, length); |
|
301 } |
|
302 |
|
303 void |
|
304 UCharsTrieBuilder::UCTLinearMatchNode::write(StringTrieBuilder &builder) { |
|
305 UCharsTrieBuilder &b=(UCharsTrieBuilder &)builder; |
|
306 next->write(builder); |
|
307 b.write(s, length); |
|
308 offset=b.writeValueAndType(hasValue, value, b.getMinLinearMatch()+length-1); |
|
309 } |
|
310 |
|
311 StringTrieBuilder::Node * |
|
312 UCharsTrieBuilder::createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, |
|
313 Node *nextNode) const { |
|
314 return new UCTLinearMatchNode( |
|
315 elements[i].getString(strings).getBuffer()+unitIndex, |
|
316 length, |
|
317 nextNode); |
|
318 } |
|
319 |
|
320 UBool |
|
321 UCharsTrieBuilder::ensureCapacity(int32_t length) { |
|
322 if(uchars==NULL) { |
|
323 return FALSE; // previous memory allocation had failed |
|
324 } |
|
325 if(length>ucharsCapacity) { |
|
326 int32_t newCapacity=ucharsCapacity; |
|
327 do { |
|
328 newCapacity*=2; |
|
329 } while(newCapacity<=length); |
|
330 UChar *newUChars=static_cast<UChar *>(uprv_malloc(newCapacity*2)); |
|
331 if(newUChars==NULL) { |
|
332 // unable to allocate memory |
|
333 uprv_free(uchars); |
|
334 uchars=NULL; |
|
335 ucharsCapacity=0; |
|
336 return FALSE; |
|
337 } |
|
338 u_memcpy(newUChars+(newCapacity-ucharsLength), |
|
339 uchars+(ucharsCapacity-ucharsLength), ucharsLength); |
|
340 uprv_free(uchars); |
|
341 uchars=newUChars; |
|
342 ucharsCapacity=newCapacity; |
|
343 } |
|
344 return TRUE; |
|
345 } |
|
346 |
|
347 int32_t |
|
348 UCharsTrieBuilder::write(int32_t unit) { |
|
349 int32_t newLength=ucharsLength+1; |
|
350 if(ensureCapacity(newLength)) { |
|
351 ucharsLength=newLength; |
|
352 uchars[ucharsCapacity-ucharsLength]=(UChar)unit; |
|
353 } |
|
354 return ucharsLength; |
|
355 } |
|
356 |
|
357 int32_t |
|
358 UCharsTrieBuilder::write(const UChar *s, int32_t length) { |
|
359 int32_t newLength=ucharsLength+length; |
|
360 if(ensureCapacity(newLength)) { |
|
361 ucharsLength=newLength; |
|
362 u_memcpy(uchars+(ucharsCapacity-ucharsLength), s, length); |
|
363 } |
|
364 return ucharsLength; |
|
365 } |
|
366 |
|
367 int32_t |
|
368 UCharsTrieBuilder::writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) { |
|
369 return write(elements[i].getString(strings).getBuffer()+unitIndex, length); |
|
370 } |
|
371 |
|
372 int32_t |
|
373 UCharsTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { |
|
374 if(0<=i && i<=UCharsTrie::kMaxOneUnitValue) { |
|
375 return write(i|(isFinal<<15)); |
|
376 } |
|
377 UChar intUnits[3]; |
|
378 int32_t length; |
|
379 if(i<0 || i>UCharsTrie::kMaxTwoUnitValue) { |
|
380 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitValueLead); |
|
381 intUnits[1]=(UChar)((uint32_t)i>>16); |
|
382 intUnits[2]=(UChar)i; |
|
383 length=3; |
|
384 // } else if(i<=UCharsTrie::kMaxOneUnitValue) { |
|
385 // intUnits[0]=(UChar)(i); |
|
386 // length=1; |
|
387 } else { |
|
388 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitValueLead+(i>>16)); |
|
389 intUnits[1]=(UChar)i; |
|
390 length=2; |
|
391 } |
|
392 intUnits[0]=(UChar)(intUnits[0]|(isFinal<<15)); |
|
393 return write(intUnits, length); |
|
394 } |
|
395 |
|
396 int32_t |
|
397 UCharsTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { |
|
398 if(!hasValue) { |
|
399 return write(node); |
|
400 } |
|
401 UChar intUnits[3]; |
|
402 int32_t length; |
|
403 if(value<0 || value>UCharsTrie::kMaxTwoUnitNodeValue) { |
|
404 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitNodeValueLead); |
|
405 intUnits[1]=(UChar)((uint32_t)value>>16); |
|
406 intUnits[2]=(UChar)value; |
|
407 length=3; |
|
408 } else if(value<=UCharsTrie::kMaxOneUnitNodeValue) { |
|
409 intUnits[0]=(UChar)((value+1)<<6); |
|
410 length=1; |
|
411 } else { |
|
412 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitNodeValueLead+((value>>10)&0x7fc0)); |
|
413 intUnits[1]=(UChar)value; |
|
414 length=2; |
|
415 } |
|
416 intUnits[0]|=(UChar)node; |
|
417 return write(intUnits, length); |
|
418 } |
|
419 |
|
420 int32_t |
|
421 UCharsTrieBuilder::writeDeltaTo(int32_t jumpTarget) { |
|
422 int32_t i=ucharsLength-jumpTarget; |
|
423 U_ASSERT(i>=0); |
|
424 if(i<=UCharsTrie::kMaxOneUnitDelta) { |
|
425 return write(i); |
|
426 } |
|
427 UChar intUnits[3]; |
|
428 int32_t length; |
|
429 if(i<=UCharsTrie::kMaxTwoUnitDelta) { |
|
430 intUnits[0]=(UChar)(UCharsTrie::kMinTwoUnitDeltaLead+(i>>16)); |
|
431 length=1; |
|
432 } else { |
|
433 intUnits[0]=(UChar)(UCharsTrie::kThreeUnitDeltaLead); |
|
434 intUnits[1]=(UChar)(i>>16); |
|
435 length=2; |
|
436 } |
|
437 intUnits[length++]=(UChar)i; |
|
438 return write(intUnits, length); |
|
439 } |
|
440 |
|
441 U_NAMESPACE_END |