|
1 /* |
|
2 ******************************************************************************* |
|
3 * Copyright (C) 2010-2012, International Business Machines |
|
4 * Corporation and others. All Rights Reserved. |
|
5 ******************************************************************************* |
|
6 * file name: bytestriebuilder.cpp |
|
7 * encoding: US-ASCII |
|
8 * tab size: 8 (not used) |
|
9 * indentation:4 |
|
10 * |
|
11 * created on: 2010sep25 |
|
12 * created by: Markus W. Scherer |
|
13 */ |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 #include "unicode/bytestrie.h" |
|
17 #include "unicode/bytestriebuilder.h" |
|
18 #include "unicode/stringpiece.h" |
|
19 #include "charstr.h" |
|
20 #include "cmemory.h" |
|
21 #include "uhash.h" |
|
22 #include "uarrsort.h" |
|
23 #include "uassert.h" |
|
24 #include "ustr_imp.h" |
|
25 |
|
26 U_NAMESPACE_BEGIN |
|
27 |
|
28 /* |
|
29 * Note: This builder implementation stores (bytes, value) pairs with full copies |
|
30 * of the byte sequences, until the BytesTrie is built. |
|
31 * It might(!) take less memory if we collected the data in a temporary, dynamic trie. |
|
32 */ |
|
33 |
|
34 class BytesTrieElement : public UMemory { |
|
35 public: |
|
36 // Use compiler's default constructor, initializes nothing. |
|
37 |
|
38 void setTo(const StringPiece &s, int32_t val, CharString &strings, UErrorCode &errorCode); |
|
39 |
|
40 StringPiece getString(const CharString &strings) const { |
|
41 int32_t offset=stringOffset; |
|
42 int32_t length; |
|
43 if(offset>=0) { |
|
44 length=(uint8_t)strings[offset++]; |
|
45 } else { |
|
46 offset=~offset; |
|
47 length=((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; |
|
48 offset+=2; |
|
49 } |
|
50 return StringPiece(strings.data()+offset, length); |
|
51 } |
|
52 int32_t getStringLength(const CharString &strings) const { |
|
53 int32_t offset=stringOffset; |
|
54 if(offset>=0) { |
|
55 return (uint8_t)strings[offset]; |
|
56 } else { |
|
57 offset=~offset; |
|
58 return ((int32_t)(uint8_t)strings[offset]<<8)|(uint8_t)strings[offset+1]; |
|
59 } |
|
60 } |
|
61 |
|
62 char charAt(int32_t index, const CharString &strings) const { return data(strings)[index]; } |
|
63 |
|
64 int32_t getValue() const { return value; } |
|
65 |
|
66 int32_t compareStringTo(const BytesTrieElement &o, const CharString &strings) const; |
|
67 |
|
68 private: |
|
69 const char *data(const CharString &strings) const { |
|
70 int32_t offset=stringOffset; |
|
71 if(offset>=0) { |
|
72 ++offset; |
|
73 } else { |
|
74 offset=~offset+2; |
|
75 } |
|
76 return strings.data()+offset; |
|
77 } |
|
78 |
|
79 // If the stringOffset is non-negative, then the first strings byte contains |
|
80 // the string length. |
|
81 // If the stringOffset is negative, then the first two strings bytes contain |
|
82 // the string length (big-endian), and the offset needs to be bit-inverted. |
|
83 // (Compared with a stringLength field here, this saves 3 bytes per string for most strings.) |
|
84 int32_t stringOffset; |
|
85 int32_t value; |
|
86 }; |
|
87 |
|
88 void |
|
89 BytesTrieElement::setTo(const StringPiece &s, int32_t val, |
|
90 CharString &strings, UErrorCode &errorCode) { |
|
91 if(U_FAILURE(errorCode)) { |
|
92 return; |
|
93 } |
|
94 int32_t length=s.length(); |
|
95 if(length>0xffff) { |
|
96 // Too long: We store the length in 1 or 2 bytes. |
|
97 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
98 return; |
|
99 } |
|
100 int32_t offset=strings.length(); |
|
101 if(length>0xff) { |
|
102 offset=~offset; |
|
103 strings.append((char)(length>>8), errorCode); |
|
104 } |
|
105 strings.append((char)length, errorCode); |
|
106 stringOffset=offset; |
|
107 value=val; |
|
108 strings.append(s, errorCode); |
|
109 } |
|
110 |
|
111 int32_t |
|
112 BytesTrieElement::compareStringTo(const BytesTrieElement &other, const CharString &strings) const { |
|
113 // TODO: add StringPiece::compare(), see ticket #8187 |
|
114 StringPiece thisString=getString(strings); |
|
115 StringPiece otherString=other.getString(strings); |
|
116 int32_t lengthDiff=thisString.length()-otherString.length(); |
|
117 int32_t commonLength; |
|
118 if(lengthDiff<=0) { |
|
119 commonLength=thisString.length(); |
|
120 } else { |
|
121 commonLength=otherString.length(); |
|
122 } |
|
123 int32_t diff=uprv_memcmp(thisString.data(), otherString.data(), commonLength); |
|
124 return diff!=0 ? diff : lengthDiff; |
|
125 } |
|
126 |
|
127 BytesTrieBuilder::BytesTrieBuilder(UErrorCode &errorCode) |
|
128 : strings(NULL), elements(NULL), elementsCapacity(0), elementsLength(0), |
|
129 bytes(NULL), bytesCapacity(0), bytesLength(0) { |
|
130 if(U_FAILURE(errorCode)) { |
|
131 return; |
|
132 } |
|
133 strings=new CharString(); |
|
134 if(strings==NULL) { |
|
135 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
136 } |
|
137 } |
|
138 |
|
139 BytesTrieBuilder::~BytesTrieBuilder() { |
|
140 delete strings; |
|
141 delete[] elements; |
|
142 uprv_free(bytes); |
|
143 } |
|
144 |
|
145 BytesTrieBuilder & |
|
146 BytesTrieBuilder::add(const StringPiece &s, int32_t value, UErrorCode &errorCode) { |
|
147 if(U_FAILURE(errorCode)) { |
|
148 return *this; |
|
149 } |
|
150 if(bytesLength>0) { |
|
151 // Cannot add elements after building. |
|
152 errorCode=U_NO_WRITE_PERMISSION; |
|
153 return *this; |
|
154 } |
|
155 if(elementsLength==elementsCapacity) { |
|
156 int32_t newCapacity; |
|
157 if(elementsCapacity==0) { |
|
158 newCapacity=1024; |
|
159 } else { |
|
160 newCapacity=4*elementsCapacity; |
|
161 } |
|
162 BytesTrieElement *newElements=new BytesTrieElement[newCapacity]; |
|
163 if(newElements==NULL) { |
|
164 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
165 return *this; // error instead of dereferencing null |
|
166 } |
|
167 if(elementsLength>0) { |
|
168 uprv_memcpy(newElements, elements, elementsLength*sizeof(BytesTrieElement)); |
|
169 } |
|
170 delete[] elements; |
|
171 elements=newElements; |
|
172 elementsCapacity=newCapacity; |
|
173 } |
|
174 elements[elementsLength++].setTo(s, value, *strings, errorCode); |
|
175 return *this; |
|
176 } |
|
177 |
|
178 U_CDECL_BEGIN |
|
179 |
|
180 static int32_t U_CALLCONV |
|
181 compareElementStrings(const void *context, const void *left, const void *right) { |
|
182 const CharString *strings=static_cast<const CharString *>(context); |
|
183 const BytesTrieElement *leftElement=static_cast<const BytesTrieElement *>(left); |
|
184 const BytesTrieElement *rightElement=static_cast<const BytesTrieElement *>(right); |
|
185 return leftElement->compareStringTo(*rightElement, *strings); |
|
186 } |
|
187 |
|
188 U_CDECL_END |
|
189 |
|
190 BytesTrie * |
|
191 BytesTrieBuilder::build(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
|
192 buildBytes(buildOption, errorCode); |
|
193 BytesTrie *newTrie=NULL; |
|
194 if(U_SUCCESS(errorCode)) { |
|
195 newTrie=new BytesTrie(bytes, bytes+(bytesCapacity-bytesLength)); |
|
196 if(newTrie==NULL) { |
|
197 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
198 } else { |
|
199 bytes=NULL; // The new trie now owns the array. |
|
200 bytesCapacity=0; |
|
201 } |
|
202 } |
|
203 return newTrie; |
|
204 } |
|
205 |
|
206 StringPiece |
|
207 BytesTrieBuilder::buildStringPiece(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
|
208 buildBytes(buildOption, errorCode); |
|
209 StringPiece result; |
|
210 if(U_SUCCESS(errorCode)) { |
|
211 result.set(bytes+(bytesCapacity-bytesLength), bytesLength); |
|
212 } |
|
213 return result; |
|
214 } |
|
215 |
|
216 void |
|
217 BytesTrieBuilder::buildBytes(UStringTrieBuildOption buildOption, UErrorCode &errorCode) { |
|
218 if(U_FAILURE(errorCode)) { |
|
219 return; |
|
220 } |
|
221 if(bytes!=NULL && bytesLength>0) { |
|
222 // Already built. |
|
223 return; |
|
224 } |
|
225 if(bytesLength==0) { |
|
226 if(elementsLength==0) { |
|
227 errorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
|
228 return; |
|
229 } |
|
230 uprv_sortArray(elements, elementsLength, (int32_t)sizeof(BytesTrieElement), |
|
231 compareElementStrings, strings, |
|
232 FALSE, // need not be a stable sort |
|
233 &errorCode); |
|
234 if(U_FAILURE(errorCode)) { |
|
235 return; |
|
236 } |
|
237 // Duplicate strings are not allowed. |
|
238 StringPiece prev=elements[0].getString(*strings); |
|
239 for(int32_t i=1; i<elementsLength; ++i) { |
|
240 StringPiece current=elements[i].getString(*strings); |
|
241 if(prev==current) { |
|
242 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
243 return; |
|
244 } |
|
245 prev=current; |
|
246 } |
|
247 } |
|
248 // Create and byte-serialize the trie for the elements. |
|
249 bytesLength=0; |
|
250 int32_t capacity=strings->length(); |
|
251 if(capacity<1024) { |
|
252 capacity=1024; |
|
253 } |
|
254 if(bytesCapacity<capacity) { |
|
255 uprv_free(bytes); |
|
256 bytes=static_cast<char *>(uprv_malloc(capacity)); |
|
257 if(bytes==NULL) { |
|
258 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
259 bytesCapacity=0; |
|
260 return; |
|
261 } |
|
262 bytesCapacity=capacity; |
|
263 } |
|
264 StringTrieBuilder::build(buildOption, elementsLength, errorCode); |
|
265 if(bytes==NULL) { |
|
266 errorCode=U_MEMORY_ALLOCATION_ERROR; |
|
267 } |
|
268 } |
|
269 |
|
270 BytesTrieBuilder & |
|
271 BytesTrieBuilder::clear() { |
|
272 strings->clear(); |
|
273 elementsLength=0; |
|
274 bytesLength=0; |
|
275 return *this; |
|
276 } |
|
277 |
|
278 int32_t |
|
279 BytesTrieBuilder::getElementStringLength(int32_t i) const { |
|
280 return elements[i].getStringLength(*strings); |
|
281 } |
|
282 |
|
283 UChar |
|
284 BytesTrieBuilder::getElementUnit(int32_t i, int32_t byteIndex) const { |
|
285 return (uint8_t)elements[i].charAt(byteIndex, *strings); |
|
286 } |
|
287 |
|
288 int32_t |
|
289 BytesTrieBuilder::getElementValue(int32_t i) const { |
|
290 return elements[i].getValue(); |
|
291 } |
|
292 |
|
293 int32_t |
|
294 BytesTrieBuilder::getLimitOfLinearMatch(int32_t first, int32_t last, int32_t byteIndex) const { |
|
295 const BytesTrieElement &firstElement=elements[first]; |
|
296 const BytesTrieElement &lastElement=elements[last]; |
|
297 int32_t minStringLength=firstElement.getStringLength(*strings); |
|
298 while(++byteIndex<minStringLength && |
|
299 firstElement.charAt(byteIndex, *strings)== |
|
300 lastElement.charAt(byteIndex, *strings)) {} |
|
301 return byteIndex; |
|
302 } |
|
303 |
|
304 int32_t |
|
305 BytesTrieBuilder::countElementUnits(int32_t start, int32_t limit, int32_t byteIndex) const { |
|
306 int32_t length=0; // Number of different bytes at byteIndex. |
|
307 int32_t i=start; |
|
308 do { |
|
309 char byte=elements[i++].charAt(byteIndex, *strings); |
|
310 while(i<limit && byte==elements[i].charAt(byteIndex, *strings)) { |
|
311 ++i; |
|
312 } |
|
313 ++length; |
|
314 } while(i<limit); |
|
315 return length; |
|
316 } |
|
317 |
|
318 int32_t |
|
319 BytesTrieBuilder::skipElementsBySomeUnits(int32_t i, int32_t byteIndex, int32_t count) const { |
|
320 do { |
|
321 char byte=elements[i++].charAt(byteIndex, *strings); |
|
322 while(byte==elements[i].charAt(byteIndex, *strings)) { |
|
323 ++i; |
|
324 } |
|
325 } while(--count>0); |
|
326 return i; |
|
327 } |
|
328 |
|
329 int32_t |
|
330 BytesTrieBuilder::indexOfElementWithNextUnit(int32_t i, int32_t byteIndex, UChar byte) const { |
|
331 char b=(char)byte; |
|
332 while(b==elements[i].charAt(byteIndex, *strings)) { |
|
333 ++i; |
|
334 } |
|
335 return i; |
|
336 } |
|
337 |
|
338 BytesTrieBuilder::BTLinearMatchNode::BTLinearMatchNode(const char *bytes, int32_t len, Node *nextNode) |
|
339 : LinearMatchNode(len, nextNode), s(bytes) { |
|
340 hash=hash*37+ustr_hashCharsN(bytes, len); |
|
341 } |
|
342 |
|
343 UBool |
|
344 BytesTrieBuilder::BTLinearMatchNode::operator==(const Node &other) const { |
|
345 if(this==&other) { |
|
346 return TRUE; |
|
347 } |
|
348 if(!LinearMatchNode::operator==(other)) { |
|
349 return FALSE; |
|
350 } |
|
351 const BTLinearMatchNode &o=(const BTLinearMatchNode &)other; |
|
352 return 0==uprv_memcmp(s, o.s, length); |
|
353 } |
|
354 |
|
355 void |
|
356 BytesTrieBuilder::BTLinearMatchNode::write(StringTrieBuilder &builder) { |
|
357 BytesTrieBuilder &b=(BytesTrieBuilder &)builder; |
|
358 next->write(builder); |
|
359 b.write(s, length); |
|
360 offset=b.write(b.getMinLinearMatch()+length-1); |
|
361 } |
|
362 |
|
363 StringTrieBuilder::Node * |
|
364 BytesTrieBuilder::createLinearMatchNode(int32_t i, int32_t byteIndex, int32_t length, |
|
365 Node *nextNode) const { |
|
366 return new BTLinearMatchNode( |
|
367 elements[i].getString(*strings).data()+byteIndex, |
|
368 length, |
|
369 nextNode); |
|
370 } |
|
371 |
|
372 UBool |
|
373 BytesTrieBuilder::ensureCapacity(int32_t length) { |
|
374 if(bytes==NULL) { |
|
375 return FALSE; // previous memory allocation had failed |
|
376 } |
|
377 if(length>bytesCapacity) { |
|
378 int32_t newCapacity=bytesCapacity; |
|
379 do { |
|
380 newCapacity*=2; |
|
381 } while(newCapacity<=length); |
|
382 char *newBytes=static_cast<char *>(uprv_malloc(newCapacity)); |
|
383 if(newBytes==NULL) { |
|
384 // unable to allocate memory |
|
385 uprv_free(bytes); |
|
386 bytes=NULL; |
|
387 bytesCapacity=0; |
|
388 return FALSE; |
|
389 } |
|
390 uprv_memcpy(newBytes+(newCapacity-bytesLength), |
|
391 bytes+(bytesCapacity-bytesLength), bytesLength); |
|
392 uprv_free(bytes); |
|
393 bytes=newBytes; |
|
394 bytesCapacity=newCapacity; |
|
395 } |
|
396 return TRUE; |
|
397 } |
|
398 |
|
399 int32_t |
|
400 BytesTrieBuilder::write(int32_t byte) { |
|
401 int32_t newLength=bytesLength+1; |
|
402 if(ensureCapacity(newLength)) { |
|
403 bytesLength=newLength; |
|
404 bytes[bytesCapacity-bytesLength]=(char)byte; |
|
405 } |
|
406 return bytesLength; |
|
407 } |
|
408 |
|
409 int32_t |
|
410 BytesTrieBuilder::write(const char *b, int32_t length) { |
|
411 int32_t newLength=bytesLength+length; |
|
412 if(ensureCapacity(newLength)) { |
|
413 bytesLength=newLength; |
|
414 uprv_memcpy(bytes+(bytesCapacity-bytesLength), b, length); |
|
415 } |
|
416 return bytesLength; |
|
417 } |
|
418 |
|
419 int32_t |
|
420 BytesTrieBuilder::writeElementUnits(int32_t i, int32_t byteIndex, int32_t length) { |
|
421 return write(elements[i].getString(*strings).data()+byteIndex, length); |
|
422 } |
|
423 |
|
424 int32_t |
|
425 BytesTrieBuilder::writeValueAndFinal(int32_t i, UBool isFinal) { |
|
426 if(0<=i && i<=BytesTrie::kMaxOneByteValue) { |
|
427 return write(((BytesTrie::kMinOneByteValueLead+i)<<1)|isFinal); |
|
428 } |
|
429 char intBytes[5]; |
|
430 int32_t length=1; |
|
431 if(i<0 || i>0xffffff) { |
|
432 intBytes[0]=(char)BytesTrie::kFiveByteValueLead; |
|
433 intBytes[1]=(char)((uint32_t)i>>24); |
|
434 intBytes[2]=(char)((uint32_t)i>>16); |
|
435 intBytes[3]=(char)((uint32_t)i>>8); |
|
436 intBytes[4]=(char)i; |
|
437 length=5; |
|
438 // } else if(i<=BytesTrie::kMaxOneByteValue) { |
|
439 // intBytes[0]=(char)(BytesTrie::kMinOneByteValueLead+i); |
|
440 } else { |
|
441 if(i<=BytesTrie::kMaxTwoByteValue) { |
|
442 intBytes[0]=(char)(BytesTrie::kMinTwoByteValueLead+(i>>8)); |
|
443 } else { |
|
444 if(i<=BytesTrie::kMaxThreeByteValue) { |
|
445 intBytes[0]=(char)(BytesTrie::kMinThreeByteValueLead+(i>>16)); |
|
446 } else { |
|
447 intBytes[0]=(char)BytesTrie::kFourByteValueLead; |
|
448 intBytes[1]=(char)(i>>16); |
|
449 length=2; |
|
450 } |
|
451 intBytes[length++]=(char)(i>>8); |
|
452 } |
|
453 intBytes[length++]=(char)i; |
|
454 } |
|
455 intBytes[0]=(char)((intBytes[0]<<1)|isFinal); |
|
456 return write(intBytes, length); |
|
457 } |
|
458 |
|
459 int32_t |
|
460 BytesTrieBuilder::writeValueAndType(UBool hasValue, int32_t value, int32_t node) { |
|
461 int32_t offset=write(node); |
|
462 if(hasValue) { |
|
463 offset=writeValueAndFinal(value, FALSE); |
|
464 } |
|
465 return offset; |
|
466 } |
|
467 |
|
468 int32_t |
|
469 BytesTrieBuilder::writeDeltaTo(int32_t jumpTarget) { |
|
470 int32_t i=bytesLength-jumpTarget; |
|
471 U_ASSERT(i>=0); |
|
472 if(i<=BytesTrie::kMaxOneByteDelta) { |
|
473 return write(i); |
|
474 } |
|
475 char intBytes[5]; |
|
476 int32_t length; |
|
477 if(i<=BytesTrie::kMaxTwoByteDelta) { |
|
478 intBytes[0]=(char)(BytesTrie::kMinTwoByteDeltaLead+(i>>8)); |
|
479 length=1; |
|
480 } else { |
|
481 if(i<=BytesTrie::kMaxThreeByteDelta) { |
|
482 intBytes[0]=(char)(BytesTrie::kMinThreeByteDeltaLead+(i>>16)); |
|
483 length=2; |
|
484 } else { |
|
485 if(i<=0xffffff) { |
|
486 intBytes[0]=(char)BytesTrie::kFourByteDeltaLead; |
|
487 length=3; |
|
488 } else { |
|
489 intBytes[0]=(char)BytesTrie::kFiveByteDeltaLead; |
|
490 intBytes[1]=(char)(i>>24); |
|
491 length=4; |
|
492 } |
|
493 intBytes[1]=(char)(i>>16); |
|
494 } |
|
495 intBytes[1]=(char)(i>>8); |
|
496 } |
|
497 intBytes[length++]=(char)i; |
|
498 return write(intBytes, length); |
|
499 } |
|
500 |
|
501 U_NAMESPACE_END |