|
1 /* |
|
2 ****************************************************************************** |
|
3 * Copyright (C) 1999-2013, International Business Machines Corporation and |
|
4 * others. All Rights Reserved. |
|
5 ****************************************************************************** |
|
6 * |
|
7 * File unistr.cpp |
|
8 * |
|
9 * Modification History: |
|
10 * |
|
11 * Date Name Description |
|
12 * 09/25/98 stephen Creation. |
|
13 * 04/20/99 stephen Overhauled per 4/16 code review. |
|
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX |
|
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from |
|
16 * Replaceable. |
|
17 * 06/25/01 grhoten Removed the dependency on iostream |
|
18 ****************************************************************************** |
|
19 */ |
|
20 |
|
21 #include "unicode/utypes.h" |
|
22 #include "unicode/appendable.h" |
|
23 #include "unicode/putil.h" |
|
24 #include "cstring.h" |
|
25 #include "cmemory.h" |
|
26 #include "unicode/ustring.h" |
|
27 #include "unicode/unistr.h" |
|
28 #include "unicode/utf.h" |
|
29 #include "unicode/utf16.h" |
|
30 #include "uelement.h" |
|
31 #include "ustr_imp.h" |
|
32 #include "umutex.h" |
|
33 #include "uassert.h" |
|
34 |
|
#if 0

#include <iostream>
using namespace std;

//DEBUGGING
// Dumps a UnicodeString as "name:|...|". Code units below 0x20 or at/above
// 0x7E are shown as "[0x<hex>]"; everything else is written as a plain char.
void
print(const UnicodeString& s,
      const char *name)
{
    cout << name << ":|";
    for(int idx = 0; idx < s.length(); ++idx) {
        const UChar unit = s[idx];
        const bool plain = (unit < 0x007E && unit >= 0x0020);
        if(plain) {
            cout << (char) s[idx];
        } else {
            cout << "[0x" << hex << s[idx] << "]";
        }
    }
    cout << '|' << endl;
}

// Same dump as above, for a raw array of len UTF-16 code units.
void
print(const UChar *s,
      int32_t len,
      const char *name)
{
    cout << name << ":|";
    for(int idx = 0; idx < len; ++idx) {
        const UChar unit = s[idx];
        const bool plain = (unit < 0x007E && unit >= 0x0020);
        if(plain) {
            cout << (char) s[idx];
        } else {
            cout << "[0x" << hex << s[idx] << "]";
        }
    }
    cout << '|' << endl;
}
// END DEBUGGING
#endif
|
75 |
|
76 // Local function definitions for now |
|
77 |
|
78 // need to copy areas that may overlap |
|
79 static |
|
80 inline void |
|
81 us_arrayCopy(const UChar *src, int32_t srcStart, |
|
82 UChar *dst, int32_t dstStart, int32_t count) |
|
83 { |
|
84 if(count>0) { |
|
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); |
|
86 } |
|
87 } |
|
88 |
|
89 // u_unescapeAt() callback to get a UChar from a UnicodeString |
|
90 U_CDECL_BEGIN |
|
91 static UChar U_CALLCONV |
|
92 UnicodeString_charAt(int32_t offset, void *context) { |
|
93 return ((icu::UnicodeString*) context)->charAt(offset); |
|
94 } |
|
95 U_CDECL_END |
|
96 |
|
97 U_NAMESPACE_BEGIN |
|
98 |
|
99 /* The Replaceable virtual destructor can't be defined in the header |
|
100 due to how AIX works with multiple definitions of virtual functions. |
|
101 */ |
|
102 Replaceable::~Replaceable() {} |
|
103 |
|
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) |
|
105 |
|
106 UnicodeString U_EXPORT2 |
|
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) { |
|
108 return |
|
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). |
|
110 append(s1). |
|
111 append(s2); |
|
112 } |
|
113 |
|
114 //======================================== |
|
115 // Reference Counting functions, put at top of file so that optimizing compilers |
|
116 // have a chance to automatically inline. |
|
117 //======================================== |
|
118 |
|
119 void |
|
120 UnicodeString::addRef() { |
|
121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); |
|
122 } |
|
123 |
|
124 int32_t |
|
125 UnicodeString::removeRef() { |
|
126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); |
|
127 } |
|
128 |
|
129 int32_t |
|
130 UnicodeString::refCount() const { |
|
131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); |
|
132 } |
|
133 |
|
134 void |
|
135 UnicodeString::releaseArray() { |
|
136 if((fFlags & kRefCounted) && removeRef() == 0) { |
|
137 uprv_free((int32_t *)fUnion.fFields.fArray - 1); |
|
138 } |
|
139 } |
|
140 |
|
141 |
|
142 |
|
143 //======================================== |
|
144 // Constructors |
|
145 //======================================== |
|
146 |
|
147 // The default constructor is inline in unistr.h. |
|
148 |
|
149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) |
|
150 : fShortLength(0), |
|
151 fFlags(0) |
|
152 { |
|
153 if(count <= 0 || (uint32_t)c > 0x10ffff) { |
|
154 // just allocate and do not do anything else |
|
155 allocate(capacity); |
|
156 } else { |
|
157 // count > 0, allocate and fill the new string with count c's |
|
158 int32_t unitCount = U16_LENGTH(c), length = count * unitCount; |
|
159 if(capacity < length) { |
|
160 capacity = length; |
|
161 } |
|
162 if(allocate(capacity)) { |
|
163 UChar *array = getArrayStart(); |
|
164 int32_t i = 0; |
|
165 |
|
166 // fill the new string with c |
|
167 if(unitCount == 1) { |
|
168 // fill with length UChars |
|
169 while(i < length) { |
|
170 array[i++] = (UChar)c; |
|
171 } |
|
172 } else { |
|
173 // get the code units for c |
|
174 UChar units[U16_MAX_LENGTH]; |
|
175 U16_APPEND_UNSAFE(units, i, c); |
|
176 |
|
177 // now it must be i==unitCount |
|
178 i = 0; |
|
179 |
|
180 // for Unicode, unitCount can only be 1, 2, 3, or 4 |
|
181 // 1 is handled above |
|
182 while(i < length) { |
|
183 int32_t unitIdx = 0; |
|
184 while(unitIdx < unitCount) { |
|
185 array[i++]=units[unitIdx++]; |
|
186 } |
|
187 } |
|
188 } |
|
189 } |
|
190 setLength(length); |
|
191 } |
|
192 } |
|
193 |
|
194 UnicodeString::UnicodeString(UChar ch) |
|
195 : fShortLength(1), |
|
196 fFlags(kShortString) |
|
197 { |
|
198 fUnion.fStackBuffer[0] = ch; |
|
199 } |
|
200 |
|
201 UnicodeString::UnicodeString(UChar32 ch) |
|
202 : fShortLength(0), |
|
203 fFlags(kShortString) |
|
204 { |
|
205 int32_t i = 0; |
|
206 UBool isError = FALSE; |
|
207 U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); |
|
208 // We test isError so that the compiler does not complain that we don't. |
|
209 // If isError then i==0 which is what we want anyway. |
|
210 if(!isError) { |
|
211 fShortLength = (int8_t)i; |
|
212 } |
|
213 } |
|
214 |
|
215 UnicodeString::UnicodeString(const UChar *text) |
|
216 : fShortLength(0), |
|
217 fFlags(kShortString) |
|
218 { |
|
219 doReplace(0, 0, text, 0, -1); |
|
220 } |
|
221 |
|
222 UnicodeString::UnicodeString(const UChar *text, |
|
223 int32_t textLength) |
|
224 : fShortLength(0), |
|
225 fFlags(kShortString) |
|
226 { |
|
227 doReplace(0, 0, text, 0, textLength); |
|
228 } |
|
229 |
|
230 UnicodeString::UnicodeString(UBool isTerminated, |
|
231 const UChar *text, |
|
232 int32_t textLength) |
|
233 : fShortLength(0), |
|
234 fFlags(kReadonlyAlias) |
|
235 { |
|
236 if(text == NULL) { |
|
237 // treat as an empty string, do not alias |
|
238 setToEmpty(); |
|
239 } else if(textLength < -1 || |
|
240 (textLength == -1 && !isTerminated) || |
|
241 (textLength >= 0 && isTerminated && text[textLength] != 0) |
|
242 ) { |
|
243 setToBogus(); |
|
244 } else { |
|
245 if(textLength == -1) { |
|
246 // text is terminated, or else it would have failed the above test |
|
247 textLength = u_strlen(text); |
|
248 } |
|
249 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); |
|
250 } |
|
251 } |
|
252 |
|
253 UnicodeString::UnicodeString(UChar *buff, |
|
254 int32_t buffLength, |
|
255 int32_t buffCapacity) |
|
256 : fShortLength(0), |
|
257 fFlags(kWritableAlias) |
|
258 { |
|
259 if(buff == NULL) { |
|
260 // treat as an empty string, do not alias |
|
261 setToEmpty(); |
|
262 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { |
|
263 setToBogus(); |
|
264 } else { |
|
265 if(buffLength == -1) { |
|
266 // fLength = u_strlen(buff); but do not look beyond buffCapacity |
|
267 const UChar *p = buff, *limit = buff + buffCapacity; |
|
268 while(p != limit && *p != 0) { |
|
269 ++p; |
|
270 } |
|
271 buffLength = (int32_t)(p - buff); |
|
272 } |
|
273 setArray(buff, buffLength, buffCapacity); |
|
274 } |
|
275 } |
|
276 |
|
277 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) |
|
278 : fShortLength(0), |
|
279 fFlags(kShortString) |
|
280 { |
|
281 if(src==NULL) { |
|
282 // treat as an empty string |
|
283 } else { |
|
284 if(length<0) { |
|
285 length=(int32_t)uprv_strlen(src); |
|
286 } |
|
287 if(cloneArrayIfNeeded(length, length, FALSE)) { |
|
288 u_charsToUChars(src, getArrayStart(), length); |
|
289 setLength(length); |
|
290 } else { |
|
291 setToBogus(); |
|
292 } |
|
293 } |
|
294 } |
|
295 |
|
296 #if U_CHARSET_IS_UTF8 |
|
297 |
|
298 UnicodeString::UnicodeString(const char *codepageData) |
|
299 : fShortLength(0), |
|
300 fFlags(kShortString) { |
|
301 if(codepageData != 0) { |
|
302 setToUTF8(codepageData); |
|
303 } |
|
304 } |
|
305 |
|
306 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) |
|
307 : fShortLength(0), |
|
308 fFlags(kShortString) { |
|
309 // if there's nothing to convert, do nothing |
|
310 if(codepageData == 0 || dataLength == 0 || dataLength < -1) { |
|
311 return; |
|
312 } |
|
313 if(dataLength == -1) { |
|
314 dataLength = (int32_t)uprv_strlen(codepageData); |
|
315 } |
|
316 setToUTF8(StringPiece(codepageData, dataLength)); |
|
317 } |
|
318 |
|
319 // else see unistr_cnv.cpp |
|
320 #endif |
|
321 |
|
322 UnicodeString::UnicodeString(const UnicodeString& that) |
|
323 : Replaceable(), |
|
324 fShortLength(0), |
|
325 fFlags(kShortString) |
|
326 { |
|
327 copyFrom(that); |
|
328 } |
|
329 |
|
330 UnicodeString::UnicodeString(const UnicodeString& that, |
|
331 int32_t srcStart) |
|
332 : Replaceable(), |
|
333 fShortLength(0), |
|
334 fFlags(kShortString) |
|
335 { |
|
336 setTo(that, srcStart); |
|
337 } |
|
338 |
|
339 UnicodeString::UnicodeString(const UnicodeString& that, |
|
340 int32_t srcStart, |
|
341 int32_t srcLength) |
|
342 : Replaceable(), |
|
343 fShortLength(0), |
|
344 fFlags(kShortString) |
|
345 { |
|
346 setTo(that, srcStart, srcLength); |
|
347 } |
|
348 |
|
349 // Replaceable base class clone() default implementation, does not clone |
|
350 Replaceable * |
|
351 Replaceable::clone() const { |
|
352 return NULL; |
|
353 } |
|
354 |
|
355 // UnicodeString overrides clone() with a real implementation |
|
356 Replaceable * |
|
357 UnicodeString::clone() const { |
|
358 return new UnicodeString(*this); |
|
359 } |
|
360 |
|
361 //======================================== |
|
362 // array allocation |
|
363 //======================================== |
|
364 |
|
365 UBool |
|
366 UnicodeString::allocate(int32_t capacity) { |
|
367 if(capacity <= US_STACKBUF_SIZE) { |
|
368 fFlags = kShortString; |
|
369 } else { |
|
370 // count bytes for the refCounter and the string capacity, and |
|
371 // round up to a multiple of 16; then divide by 4 and allocate int32_t's |
|
372 // to be safely aligned for the refCount |
|
373 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() |
|
374 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); |
|
375 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); |
|
376 if(array != 0) { |
|
377 // set initial refCount and point behind the refCount |
|
378 *array++ = 1; |
|
379 |
|
380 // have fArray point to the first UChar |
|
381 fUnion.fFields.fArray = (UChar *)array; |
|
382 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); |
|
383 fFlags = kLongString; |
|
384 } else { |
|
385 fShortLength = 0; |
|
386 fUnion.fFields.fArray = 0; |
|
387 fUnion.fFields.fCapacity = 0; |
|
388 fFlags = kIsBogus; |
|
389 return FALSE; |
|
390 } |
|
391 } |
|
392 return TRUE; |
|
393 } |
|
394 |
|
395 //======================================== |
|
396 // Destructor |
|
397 //======================================== |
|
398 UnicodeString::~UnicodeString() |
|
399 { |
|
400 releaseArray(); |
|
401 } |
|
402 |
|
403 //======================================== |
|
404 // Factory methods |
|
405 //======================================== |
|
406 |
|
407 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { |
|
408 UnicodeString result; |
|
409 result.setToUTF8(utf8); |
|
410 return result; |
|
411 } |
|
412 |
|
413 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { |
|
414 UnicodeString result; |
|
415 int32_t capacity; |
|
416 // Most UTF-32 strings will be BMP-only and result in a same-length |
|
417 // UTF-16 string. We overestimate the capacity just slightly, |
|
418 // just in case there are a few supplementary characters. |
|
419 if(length <= US_STACKBUF_SIZE) { |
|
420 capacity = US_STACKBUF_SIZE; |
|
421 } else { |
|
422 capacity = length + (length >> 4) + 4; |
|
423 } |
|
424 do { |
|
425 UChar *utf16 = result.getBuffer(capacity); |
|
426 int32_t length16; |
|
427 UErrorCode errorCode = U_ZERO_ERROR; |
|
428 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, |
|
429 utf32, length, |
|
430 0xfffd, // Substitution character. |
|
431 NULL, // Don't care about number of substitutions. |
|
432 &errorCode); |
|
433 result.releaseBuffer(length16); |
|
434 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
|
435 capacity = length16 + 1; // +1 for the terminating NUL. |
|
436 continue; |
|
437 } else if(U_FAILURE(errorCode)) { |
|
438 result.setToBogus(); |
|
439 } |
|
440 break; |
|
441 } while(TRUE); |
|
442 return result; |
|
443 } |
|
444 |
|
445 //======================================== |
|
446 // Assignment |
|
447 //======================================== |
|
448 |
|
449 UnicodeString & |
|
450 UnicodeString::operator=(const UnicodeString &src) { |
|
451 return copyFrom(src); |
|
452 } |
|
453 |
|
454 UnicodeString & |
|
455 UnicodeString::fastCopyFrom(const UnicodeString &src) { |
|
456 return copyFrom(src, TRUE); |
|
457 } |
|
458 |
|
459 UnicodeString & |
|
460 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { |
|
461 // if assigning to ourselves, do nothing |
|
462 if(this == 0 || this == &src) { |
|
463 return *this; |
|
464 } |
|
465 |
|
466 // is the right side bogus? |
|
467 if(&src == 0 || src.isBogus()) { |
|
468 setToBogus(); |
|
469 return *this; |
|
470 } |
|
471 |
|
472 // delete the current contents |
|
473 releaseArray(); |
|
474 |
|
475 if(src.isEmpty()) { |
|
476 // empty string - use the stack buffer |
|
477 setToEmpty(); |
|
478 return *this; |
|
479 } |
|
480 |
|
481 // we always copy the length |
|
482 int32_t srcLength = src.length(); |
|
483 setLength(srcLength); |
|
484 |
|
485 // fLength>0 and not an "open" src.getBuffer(minCapacity) |
|
486 switch(src.fFlags) { |
|
487 case kShortString: |
|
488 // short string using the stack buffer, do the same |
|
489 fFlags = kShortString; |
|
490 uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR); |
|
491 break; |
|
492 case kLongString: |
|
493 // src uses a refCounted string buffer, use that buffer with refCount |
|
494 // src is const, use a cast - we don't really change it |
|
495 ((UnicodeString &)src).addRef(); |
|
496 // copy all fields, share the reference-counted buffer |
|
497 fUnion.fFields.fArray = src.fUnion.fFields.fArray; |
|
498 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; |
|
499 fFlags = src.fFlags; |
|
500 break; |
|
501 case kReadonlyAlias: |
|
502 if(fastCopy) { |
|
503 // src is a readonly alias, do the same |
|
504 // -> maintain the readonly alias as such |
|
505 fUnion.fFields.fArray = src.fUnion.fFields.fArray; |
|
506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; |
|
507 fFlags = src.fFlags; |
|
508 break; |
|
509 } |
|
510 // else if(!fastCopy) fall through to case kWritableAlias |
|
511 // -> allocate a new buffer and copy the contents |
|
512 case kWritableAlias: |
|
513 // src is a writable alias; we make a copy of that instead |
|
514 if(allocate(srcLength)) { |
|
515 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); |
|
516 break; |
|
517 } |
|
518 // if there is not enough memory, then fall through to setting to bogus |
|
519 default: |
|
520 // if src is bogus, set ourselves to bogus |
|
521 // do not call setToBogus() here because fArray and fFlags are not consistent here |
|
522 fShortLength = 0; |
|
523 fUnion.fFields.fArray = 0; |
|
524 fUnion.fFields.fCapacity = 0; |
|
525 fFlags = kIsBogus; |
|
526 break; |
|
527 } |
|
528 |
|
529 return *this; |
|
530 } |
|
531 |
|
532 //======================================== |
|
533 // Miscellaneous operations |
|
534 //======================================== |
|
535 |
|
536 UnicodeString UnicodeString::unescape() const { |
|
537 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity |
|
538 const UChar *array = getBuffer(); |
|
539 int32_t len = length(); |
|
540 int32_t prev = 0; |
|
541 for (int32_t i=0;;) { |
|
542 if (i == len) { |
|
543 result.append(array, prev, len - prev); |
|
544 break; |
|
545 } |
|
546 if (array[i++] == 0x5C /*'\\'*/) { |
|
547 result.append(array, prev, (i - 1) - prev); |
|
548 UChar32 c = unescapeAt(i); // advances i |
|
549 if (c < 0) { |
|
550 result.remove(); // return empty string |
|
551 break; // invalid escape sequence |
|
552 } |
|
553 result.append(c); |
|
554 prev = i; |
|
555 } |
|
556 } |
|
557 return result; |
|
558 } |
|
559 |
|
560 UChar32 UnicodeString::unescapeAt(int32_t &offset) const { |
|
561 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); |
|
562 } |
|
563 |
|
564 //======================================== |
|
565 // Read-only implementation |
|
566 //======================================== |
|
567 UBool |
|
568 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { |
|
569 // Requires: this & text not bogus and have same lengths. |
|
570 // Byte-wise comparison works for equality regardless of endianness. |
|
571 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; |
|
572 } |
|
573 |
|
574 int8_t |
|
575 UnicodeString::doCompare( int32_t start, |
|
576 int32_t length, |
|
577 const UChar *srcChars, |
|
578 int32_t srcStart, |
|
579 int32_t srcLength) const |
|
580 { |
|
581 // compare illegal string values |
|
582 if(isBogus()) { |
|
583 return -1; |
|
584 } |
|
585 |
|
586 // pin indices to legal values |
|
587 pinIndices(start, length); |
|
588 |
|
589 if(srcChars == NULL) { |
|
590 // treat const UChar *srcChars==NULL as an empty string |
|
591 return length == 0 ? 0 : 1; |
|
592 } |
|
593 |
|
594 // get the correct pointer |
|
595 const UChar *chars = getArrayStart(); |
|
596 |
|
597 chars += start; |
|
598 srcChars += srcStart; |
|
599 |
|
600 int32_t minLength; |
|
601 int8_t lengthResult; |
|
602 |
|
603 // get the srcLength if necessary |
|
604 if(srcLength < 0) { |
|
605 srcLength = u_strlen(srcChars + srcStart); |
|
606 } |
|
607 |
|
608 // are we comparing different lengths? |
|
609 if(length != srcLength) { |
|
610 if(length < srcLength) { |
|
611 minLength = length; |
|
612 lengthResult = -1; |
|
613 } else { |
|
614 minLength = srcLength; |
|
615 lengthResult = 1; |
|
616 } |
|
617 } else { |
|
618 minLength = length; |
|
619 lengthResult = 0; |
|
620 } |
|
621 |
|
622 /* |
|
623 * note that uprv_memcmp() returns an int but we return an int8_t; |
|
624 * we need to take care not to truncate the result - |
|
625 * one way to do this is to right-shift the value to |
|
626 * move the sign bit into the lower 8 bits and making sure that this |
|
627 * does not become 0 itself |
|
628 */ |
|
629 |
|
630 if(minLength > 0 && chars != srcChars) { |
|
631 int32_t result; |
|
632 |
|
633 # if U_IS_BIG_ENDIAN |
|
634 // big-endian: byte comparison works |
|
635 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); |
|
636 if(result != 0) { |
|
637 return (int8_t)(result >> 15 | 1); |
|
638 } |
|
639 # else |
|
640 // little-endian: compare UChar units |
|
641 do { |
|
642 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); |
|
643 if(result != 0) { |
|
644 return (int8_t)(result >> 15 | 1); |
|
645 } |
|
646 } while(--minLength > 0); |
|
647 # endif |
|
648 } |
|
649 return lengthResult; |
|
650 } |
|
651 |
|
652 /* String compare in code point order - doCompare() compares in code unit order. */ |
|
653 int8_t |
|
654 UnicodeString::doCompareCodePointOrder(int32_t start, |
|
655 int32_t length, |
|
656 const UChar *srcChars, |
|
657 int32_t srcStart, |
|
658 int32_t srcLength) const |
|
659 { |
|
660 // compare illegal string values |
|
661 // treat const UChar *srcChars==NULL as an empty string |
|
662 if(isBogus()) { |
|
663 return -1; |
|
664 } |
|
665 |
|
666 // pin indices to legal values |
|
667 pinIndices(start, length); |
|
668 |
|
669 if(srcChars == NULL) { |
|
670 srcStart = srcLength = 0; |
|
671 } |
|
672 |
|
673 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); |
|
674 /* translate the 32-bit result into an 8-bit one */ |
|
675 if(diff!=0) { |
|
676 return (int8_t)(diff >> 15 | 1); |
|
677 } else { |
|
678 return 0; |
|
679 } |
|
680 } |
|
681 |
|
682 int32_t |
|
683 UnicodeString::getLength() const { |
|
684 return length(); |
|
685 } |
|
686 |
|
687 UChar |
|
688 UnicodeString::getCharAt(int32_t offset) const { |
|
689 return charAt(offset); |
|
690 } |
|
691 |
|
692 UChar32 |
|
693 UnicodeString::getChar32At(int32_t offset) const { |
|
694 return char32At(offset); |
|
695 } |
|
696 |
|
697 UChar32 |
|
698 UnicodeString::char32At(int32_t offset) const |
|
699 { |
|
700 int32_t len = length(); |
|
701 if((uint32_t)offset < (uint32_t)len) { |
|
702 const UChar *array = getArrayStart(); |
|
703 UChar32 c; |
|
704 U16_GET(array, 0, offset, len, c); |
|
705 return c; |
|
706 } else { |
|
707 return kInvalidUChar; |
|
708 } |
|
709 } |
|
710 |
|
711 int32_t |
|
712 UnicodeString::getChar32Start(int32_t offset) const { |
|
713 if((uint32_t)offset < (uint32_t)length()) { |
|
714 const UChar *array = getArrayStart(); |
|
715 U16_SET_CP_START(array, 0, offset); |
|
716 return offset; |
|
717 } else { |
|
718 return 0; |
|
719 } |
|
720 } |
|
721 |
|
722 int32_t |
|
723 UnicodeString::getChar32Limit(int32_t offset) const { |
|
724 int32_t len = length(); |
|
725 if((uint32_t)offset < (uint32_t)len) { |
|
726 const UChar *array = getArrayStart(); |
|
727 U16_SET_CP_LIMIT(array, 0, offset, len); |
|
728 return offset; |
|
729 } else { |
|
730 return len; |
|
731 } |
|
732 } |
|
733 |
|
734 int32_t |
|
735 UnicodeString::countChar32(int32_t start, int32_t length) const { |
|
736 pinIndices(start, length); |
|
737 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL |
|
738 return u_countChar32(getArrayStart()+start, length); |
|
739 } |
|
740 |
|
741 UBool |
|
742 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { |
|
743 pinIndices(start, length); |
|
744 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL |
|
745 return u_strHasMoreChar32Than(getArrayStart()+start, length, number); |
|
746 } |
|
747 |
|
748 int32_t |
|
749 UnicodeString::moveIndex32(int32_t index, int32_t delta) const { |
|
750 // pin index |
|
751 int32_t len = length(); |
|
752 if(index<0) { |
|
753 index=0; |
|
754 } else if(index>len) { |
|
755 index=len; |
|
756 } |
|
757 |
|
758 const UChar *array = getArrayStart(); |
|
759 if(delta>0) { |
|
760 U16_FWD_N(array, index, len, delta); |
|
761 } else { |
|
762 U16_BACK_N(array, 0, index, -delta); |
|
763 } |
|
764 |
|
765 return index; |
|
766 } |
|
767 |
|
768 void |
|
769 UnicodeString::doExtract(int32_t start, |
|
770 int32_t length, |
|
771 UChar *dst, |
|
772 int32_t dstStart) const |
|
773 { |
|
774 // pin indices to legal values |
|
775 pinIndices(start, length); |
|
776 |
|
777 // do not copy anything if we alias dst itself |
|
778 const UChar *array = getArrayStart(); |
|
779 if(array + start != dst + dstStart) { |
|
780 us_arrayCopy(array, start, dst, dstStart, length); |
|
781 } |
|
782 } |
|
783 |
|
784 int32_t |
|
785 UnicodeString::extract(UChar *dest, int32_t destCapacity, |
|
786 UErrorCode &errorCode) const { |
|
787 int32_t len = length(); |
|
788 if(U_SUCCESS(errorCode)) { |
|
789 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { |
|
790 errorCode=U_ILLEGAL_ARGUMENT_ERROR; |
|
791 } else { |
|
792 const UChar *array = getArrayStart(); |
|
793 if(len>0 && len<=destCapacity && array!=dest) { |
|
794 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); |
|
795 } |
|
796 return u_terminateUChars(dest, destCapacity, len, &errorCode); |
|
797 } |
|
798 } |
|
799 |
|
800 return len; |
|
801 } |
|
802 |
|
803 int32_t |
|
804 UnicodeString::extract(int32_t start, |
|
805 int32_t length, |
|
806 char *target, |
|
807 int32_t targetCapacity, |
|
808 enum EInvariant) const |
|
809 { |
|
810 // if the arguments are illegal, then do nothing |
|
811 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { |
|
812 return 0; |
|
813 } |
|
814 |
|
815 // pin the indices to legal values |
|
816 pinIndices(start, length); |
|
817 |
|
818 if(length <= targetCapacity) { |
|
819 u_UCharsToChars(getArrayStart() + start, target, length); |
|
820 } |
|
821 UErrorCode status = U_ZERO_ERROR; |
|
822 return u_terminateChars(target, targetCapacity, length, &status); |
|
823 } |
|
824 |
|
825 UnicodeString |
|
826 UnicodeString::tempSubString(int32_t start, int32_t len) const { |
|
827 pinIndices(start, len); |
|
828 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer |
|
829 if(array==NULL) { |
|
830 array=fUnion.fStackBuffer; // anything not NULL because that would make an empty string |
|
831 len=-2; // bogus result string |
|
832 } |
|
833 return UnicodeString(FALSE, array + start, len); |
|
834 } |
|
835 |
|
836 int32_t |
|
837 UnicodeString::toUTF8(int32_t start, int32_t len, |
|
838 char *target, int32_t capacity) const { |
|
839 pinIndices(start, len); |
|
840 int32_t length8; |
|
841 UErrorCode errorCode = U_ZERO_ERROR; |
|
842 u_strToUTF8WithSub(target, capacity, &length8, |
|
843 getBuffer() + start, len, |
|
844 0xFFFD, // Standard substitution character. |
|
845 NULL, // Don't care about number of substitutions. |
|
846 &errorCode); |
|
847 return length8; |
|
848 } |
|
849 |
|
850 #if U_CHARSET_IS_UTF8 |
|
851 |
|
852 int32_t |
|
853 UnicodeString::extract(int32_t start, int32_t len, |
|
854 char *target, uint32_t dstSize) const { |
|
855 // if the arguments are illegal, then do nothing |
|
856 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { |
|
857 return 0; |
|
858 } |
|
859 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); |
|
860 } |
|
861 |
|
862 // else see unistr_cnv.cpp |
|
863 #endif |
|
864 |
|
865 void |
|
866 UnicodeString::extractBetween(int32_t start, |
|
867 int32_t limit, |
|
868 UnicodeString& target) const { |
|
869 pinIndex(start); |
|
870 pinIndex(limit); |
|
871 doExtract(start, limit - start, target); |
|
872 } |
|
873 |
|
874 // When converting from UTF-16 to UTF-8, the result will have at most 3 times |
|
875 // as many bytes as the source has UChars. |
|
876 // The "worst cases" are writing systems like Indic, Thai and CJK with |
|
877 // 3:1 bytes:UChars. |
|
878 void |
|
879 UnicodeString::toUTF8(ByteSink &sink) const { |
|
880 int32_t length16 = length(); |
|
881 if(length16 != 0) { |
|
882 char stackBuffer[1024]; |
|
883 int32_t capacity = (int32_t)sizeof(stackBuffer); |
|
884 UBool utf8IsOwned = FALSE; |
|
885 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, |
|
886 3*length16, |
|
887 stackBuffer, capacity, |
|
888 &capacity); |
|
889 int32_t length8 = 0; |
|
890 UErrorCode errorCode = U_ZERO_ERROR; |
|
891 u_strToUTF8WithSub(utf8, capacity, &length8, |
|
892 getBuffer(), length16, |
|
893 0xFFFD, // Standard substitution character. |
|
894 NULL, // Don't care about number of substitutions. |
|
895 &errorCode); |
|
896 if(errorCode == U_BUFFER_OVERFLOW_ERROR) { |
|
897 utf8 = (char *)uprv_malloc(length8); |
|
898 if(utf8 != NULL) { |
|
899 utf8IsOwned = TRUE; |
|
900 errorCode = U_ZERO_ERROR; |
|
901 u_strToUTF8WithSub(utf8, length8, &length8, |
|
902 getBuffer(), length16, |
|
903 0xFFFD, // Standard substitution character. |
|
904 NULL, // Don't care about number of substitutions. |
|
905 &errorCode); |
|
906 } else { |
|
907 errorCode = U_MEMORY_ALLOCATION_ERROR; |
|
908 } |
|
909 } |
|
910 if(U_SUCCESS(errorCode)) { |
|
911 sink.Append(utf8, length8); |
|
912 sink.Flush(); |
|
913 } |
|
914 if(utf8IsOwned) { |
|
915 uprv_free(utf8); |
|
916 } |
|
917 } |
|
918 } |
|
919 |
|
920 int32_t |
|
921 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { |
|
922 int32_t length32=0; |
|
923 if(U_SUCCESS(errorCode)) { |
|
924 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. |
|
925 u_strToUTF32WithSub(utf32, capacity, &length32, |
|
926 getBuffer(), length(), |
|
927 0xfffd, // Substitution character. |
|
928 NULL, // Don't care about number of substitutions. |
|
929 &errorCode); |
|
930 } |
|
931 return length32; |
|
932 } |
|
933 |
|
934 int32_t |
|
935 UnicodeString::indexOf(const UChar *srcChars, |
|
936 int32_t srcStart, |
|
937 int32_t srcLength, |
|
938 int32_t start, |
|
939 int32_t length) const |
|
940 { |
|
941 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { |
|
942 return -1; |
|
943 } |
|
944 |
|
945 // UnicodeString does not find empty substrings |
|
946 if(srcLength < 0 && srcChars[srcStart] == 0) { |
|
947 return -1; |
|
948 } |
|
949 |
|
950 // get the indices within bounds |
|
951 pinIndices(start, length); |
|
952 |
|
953 // find the first occurrence of the substring |
|
954 const UChar *array = getArrayStart(); |
|
955 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); |
|
956 if(match == NULL) { |
|
957 return -1; |
|
958 } else { |
|
959 return (int32_t)(match - array); |
|
960 } |
|
961 } |
|
962 |
|
963 int32_t |
|
964 UnicodeString::doIndexOf(UChar c, |
|
965 int32_t start, |
|
966 int32_t length) const |
|
967 { |
|
968 // pin indices |
|
969 pinIndices(start, length); |
|
970 |
|
971 // find the first occurrence of c |
|
972 const UChar *array = getArrayStart(); |
|
973 const UChar *match = u_memchr(array + start, c, length); |
|
974 if(match == NULL) { |
|
975 return -1; |
|
976 } else { |
|
977 return (int32_t)(match - array); |
|
978 } |
|
979 } |
|
980 |
|
981 int32_t |
|
982 UnicodeString::doIndexOf(UChar32 c, |
|
983 int32_t start, |
|
984 int32_t length) const { |
|
985 // pin indices |
|
986 pinIndices(start, length); |
|
987 |
|
988 // find the first occurrence of c |
|
989 const UChar *array = getArrayStart(); |
|
990 const UChar *match = u_memchr32(array + start, c, length); |
|
991 if(match == NULL) { |
|
992 return -1; |
|
993 } else { |
|
994 return (int32_t)(match - array); |
|
995 } |
|
996 } |
|
997 |
|
998 int32_t |
|
999 UnicodeString::lastIndexOf(const UChar *srcChars, |
|
1000 int32_t srcStart, |
|
1001 int32_t srcLength, |
|
1002 int32_t start, |
|
1003 int32_t length) const |
|
1004 { |
|
1005 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { |
|
1006 return -1; |
|
1007 } |
|
1008 |
|
1009 // UnicodeString does not find empty substrings |
|
1010 if(srcLength < 0 && srcChars[srcStart] == 0) { |
|
1011 return -1; |
|
1012 } |
|
1013 |
|
1014 // get the indices within bounds |
|
1015 pinIndices(start, length); |
|
1016 |
|
1017 // find the last occurrence of the substring |
|
1018 const UChar *array = getArrayStart(); |
|
1019 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); |
|
1020 if(match == NULL) { |
|
1021 return -1; |
|
1022 } else { |
|
1023 return (int32_t)(match - array); |
|
1024 } |
|
1025 } |
|
1026 |
|
1027 int32_t |
|
1028 UnicodeString::doLastIndexOf(UChar c, |
|
1029 int32_t start, |
|
1030 int32_t length) const |
|
1031 { |
|
1032 if(isBogus()) { |
|
1033 return -1; |
|
1034 } |
|
1035 |
|
1036 // pin indices |
|
1037 pinIndices(start, length); |
|
1038 |
|
1039 // find the last occurrence of c |
|
1040 const UChar *array = getArrayStart(); |
|
1041 const UChar *match = u_memrchr(array + start, c, length); |
|
1042 if(match == NULL) { |
|
1043 return -1; |
|
1044 } else { |
|
1045 return (int32_t)(match - array); |
|
1046 } |
|
1047 } |
|
1048 |
|
1049 int32_t |
|
1050 UnicodeString::doLastIndexOf(UChar32 c, |
|
1051 int32_t start, |
|
1052 int32_t length) const { |
|
1053 // pin indices |
|
1054 pinIndices(start, length); |
|
1055 |
|
1056 // find the last occurrence of c |
|
1057 const UChar *array = getArrayStart(); |
|
1058 const UChar *match = u_memrchr32(array + start, c, length); |
|
1059 if(match == NULL) { |
|
1060 return -1; |
|
1061 } else { |
|
1062 return (int32_t)(match - array); |
|
1063 } |
|
1064 } |
|
1065 |
|
1066 //======================================== |
|
1067 // Write implementation |
|
1068 //======================================== |
|
1069 |
|
1070 UnicodeString& |
|
1071 UnicodeString::findAndReplace(int32_t start, |
|
1072 int32_t length, |
|
1073 const UnicodeString& oldText, |
|
1074 int32_t oldStart, |
|
1075 int32_t oldLength, |
|
1076 const UnicodeString& newText, |
|
1077 int32_t newStart, |
|
1078 int32_t newLength) |
|
1079 { |
|
1080 if(isBogus() || oldText.isBogus() || newText.isBogus()) { |
|
1081 return *this; |
|
1082 } |
|
1083 |
|
1084 pinIndices(start, length); |
|
1085 oldText.pinIndices(oldStart, oldLength); |
|
1086 newText.pinIndices(newStart, newLength); |
|
1087 |
|
1088 if(oldLength == 0) { |
|
1089 return *this; |
|
1090 } |
|
1091 |
|
1092 while(length > 0 && length >= oldLength) { |
|
1093 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); |
|
1094 if(pos < 0) { |
|
1095 // no more oldText's here: done |
|
1096 break; |
|
1097 } else { |
|
1098 // we found oldText, replace it by newText and go beyond it |
|
1099 replace(pos, oldLength, newText, newStart, newLength); |
|
1100 length -= pos + oldLength - start; |
|
1101 start = pos + newLength; |
|
1102 } |
|
1103 } |
|
1104 |
|
1105 return *this; |
|
1106 } |
|
1107 |
|
1108 |
|
1109 void |
|
1110 UnicodeString::setToBogus() |
|
1111 { |
|
1112 releaseArray(); |
|
1113 |
|
1114 fShortLength = 0; |
|
1115 fUnion.fFields.fArray = 0; |
|
1116 fUnion.fFields.fCapacity = 0; |
|
1117 fFlags = kIsBogus; |
|
1118 } |
|
1119 |
|
1120 // turn a bogus string into an empty one |
|
1121 void |
|
1122 UnicodeString::unBogus() { |
|
1123 if(fFlags & kIsBogus) { |
|
1124 setToEmpty(); |
|
1125 } |
|
1126 } |
|
1127 |
|
1128 const UChar * |
|
1129 UnicodeString::getTerminatedBuffer() { |
|
1130 if(!isWritable()) { |
|
1131 return 0; |
|
1132 } |
|
1133 UChar *array = getArrayStart(); |
|
1134 int32_t len = length(); |
|
1135 if(len < getCapacity()) { |
|
1136 if(fFlags & kBufferIsReadonly) { |
|
1137 // If len<capacity on a read-only alias, then array[len] is |
|
1138 // either the original NUL (if constructed with (TRUE, s, length)) |
|
1139 // or one of the original string contents characters (if later truncated), |
|
1140 // therefore we can assume that array[len] is initialized memory. |
|
1141 if(array[len] == 0) { |
|
1142 return array; |
|
1143 } |
|
1144 } else if(((fFlags & kRefCounted) == 0 || refCount() == 1)) { |
|
1145 // kRefCounted: Do not write the NUL if the buffer is shared. |
|
1146 // That is mostly safe, except when the length of one copy was modified |
|
1147 // without copy-on-write, e.g., via truncate(newLength) or remove(void). |
|
1148 // Then the NUL would be written into the middle of another copy's string. |
|
1149 |
|
1150 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. |
|
1151 // Do not test if there is a NUL already because it might be uninitialized memory. |
|
1152 // (That would be safe, but tools like valgrind & Purify would complain.) |
|
1153 array[len] = 0; |
|
1154 return array; |
|
1155 } |
|
1156 } |
|
1157 if(cloneArrayIfNeeded(len+1)) { |
|
1158 array = getArrayStart(); |
|
1159 array[len] = 0; |
|
1160 return array; |
|
1161 } else { |
|
1162 return NULL; |
|
1163 } |
|
1164 } |
|
1165 |
|
1166 // setTo() analogous to the readonly-aliasing constructor with the same signature |
|
1167 UnicodeString & |
|
1168 UnicodeString::setTo(UBool isTerminated, |
|
1169 const UChar *text, |
|
1170 int32_t textLength) |
|
1171 { |
|
1172 if(fFlags & kOpenGetBuffer) { |
|
1173 // do not modify a string that has an "open" getBuffer(minCapacity) |
|
1174 return *this; |
|
1175 } |
|
1176 |
|
1177 if(text == NULL) { |
|
1178 // treat as an empty string, do not alias |
|
1179 releaseArray(); |
|
1180 setToEmpty(); |
|
1181 return *this; |
|
1182 } |
|
1183 |
|
1184 if( textLength < -1 || |
|
1185 (textLength == -1 && !isTerminated) || |
|
1186 (textLength >= 0 && isTerminated && text[textLength] != 0) |
|
1187 ) { |
|
1188 setToBogus(); |
|
1189 return *this; |
|
1190 } |
|
1191 |
|
1192 releaseArray(); |
|
1193 |
|
1194 if(textLength == -1) { |
|
1195 // text is terminated, or else it would have failed the above test |
|
1196 textLength = u_strlen(text); |
|
1197 } |
|
1198 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); |
|
1199 |
|
1200 fFlags = kReadonlyAlias; |
|
1201 return *this; |
|
1202 } |
|
1203 |
|
1204 // setTo() analogous to the writable-aliasing constructor with the same signature |
|
1205 UnicodeString & |
|
1206 UnicodeString::setTo(UChar *buffer, |
|
1207 int32_t buffLength, |
|
1208 int32_t buffCapacity) { |
|
1209 if(fFlags & kOpenGetBuffer) { |
|
1210 // do not modify a string that has an "open" getBuffer(minCapacity) |
|
1211 return *this; |
|
1212 } |
|
1213 |
|
1214 if(buffer == NULL) { |
|
1215 // treat as an empty string, do not alias |
|
1216 releaseArray(); |
|
1217 setToEmpty(); |
|
1218 return *this; |
|
1219 } |
|
1220 |
|
1221 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { |
|
1222 setToBogus(); |
|
1223 return *this; |
|
1224 } else if(buffLength == -1) { |
|
1225 // buffLength = u_strlen(buff); but do not look beyond buffCapacity |
|
1226 const UChar *p = buffer, *limit = buffer + buffCapacity; |
|
1227 while(p != limit && *p != 0) { |
|
1228 ++p; |
|
1229 } |
|
1230 buffLength = (int32_t)(p - buffer); |
|
1231 } |
|
1232 |
|
1233 releaseArray(); |
|
1234 |
|
1235 setArray(buffer, buffLength, buffCapacity); |
|
1236 fFlags = kWritableAlias; |
|
1237 return *this; |
|
1238 } |
|
1239 |
|
1240 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { |
|
1241 unBogus(); |
|
1242 int32_t length = utf8.length(); |
|
1243 int32_t capacity; |
|
1244 // The UTF-16 string will be at most as long as the UTF-8 string. |
|
1245 if(length <= US_STACKBUF_SIZE) { |
|
1246 capacity = US_STACKBUF_SIZE; |
|
1247 } else { |
|
1248 capacity = length + 1; // +1 for the terminating NUL. |
|
1249 } |
|
1250 UChar *utf16 = getBuffer(capacity); |
|
1251 int32_t length16; |
|
1252 UErrorCode errorCode = U_ZERO_ERROR; |
|
1253 u_strFromUTF8WithSub(utf16, getCapacity(), &length16, |
|
1254 utf8.data(), length, |
|
1255 0xfffd, // Substitution character. |
|
1256 NULL, // Don't care about number of substitutions. |
|
1257 &errorCode); |
|
1258 releaseBuffer(length16); |
|
1259 if(U_FAILURE(errorCode)) { |
|
1260 setToBogus(); |
|
1261 } |
|
1262 return *this; |
|
1263 } |
|
1264 |
|
1265 UnicodeString& |
|
1266 UnicodeString::setCharAt(int32_t offset, |
|
1267 UChar c) |
|
1268 { |
|
1269 int32_t len = length(); |
|
1270 if(cloneArrayIfNeeded() && len > 0) { |
|
1271 if(offset < 0) { |
|
1272 offset = 0; |
|
1273 } else if(offset >= len) { |
|
1274 offset = len - 1; |
|
1275 } |
|
1276 |
|
1277 getArrayStart()[offset] = c; |
|
1278 } |
|
1279 return *this; |
|
1280 } |
|
1281 |
|
1282 UnicodeString& |
|
1283 UnicodeString::replace(int32_t start, |
|
1284 int32_t _length, |
|
1285 UChar32 srcChar) { |
|
1286 UChar buffer[U16_MAX_LENGTH]; |
|
1287 int32_t count = 0; |
|
1288 UBool isError = FALSE; |
|
1289 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); |
|
1290 // We test isError so that the compiler does not complain that we don't. |
|
1291 // If isError (srcChar is not a valid code point) then count==0 which means |
|
1292 // we remove the source segment rather than replacing it with srcChar. |
|
1293 return doReplace(start, _length, buffer, 0, isError ? 0 : count); |
|
1294 } |
|
1295 |
|
1296 UnicodeString& |
|
1297 UnicodeString::append(UChar32 srcChar) { |
|
1298 UChar buffer[U16_MAX_LENGTH]; |
|
1299 int32_t _length = 0; |
|
1300 UBool isError = FALSE; |
|
1301 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); |
|
1302 // We test isError so that the compiler does not complain that we don't. |
|
1303 // If isError then _length==0 which turns the doReplace() into a no-op anyway. |
|
1304 return isError ? *this : doReplace(length(), 0, buffer, 0, _length); |
|
1305 } |
|
1306 |
|
1307 UnicodeString& |
|
1308 UnicodeString::doReplace( int32_t start, |
|
1309 int32_t length, |
|
1310 const UnicodeString& src, |
|
1311 int32_t srcStart, |
|
1312 int32_t srcLength) |
|
1313 { |
|
1314 if(!src.isBogus()) { |
|
1315 // pin the indices to legal values |
|
1316 src.pinIndices(srcStart, srcLength); |
|
1317 |
|
1318 // get the characters from src |
|
1319 // and replace the range in ourselves with them |
|
1320 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); |
|
1321 } else { |
|
1322 // remove the range |
|
1323 return doReplace(start, length, 0, 0, 0); |
|
1324 } |
|
1325 } |
|
1326 |
|
1327 UnicodeString& |
|
1328 UnicodeString::doReplace(int32_t start, |
|
1329 int32_t length, |
|
1330 const UChar *srcChars, |
|
1331 int32_t srcStart, |
|
1332 int32_t srcLength) |
|
1333 { |
|
1334 if(!isWritable()) { |
|
1335 return *this; |
|
1336 } |
|
1337 |
|
1338 int32_t oldLength = this->length(); |
|
1339 |
|
1340 // optimize (read-only alias).remove(0, start) and .remove(start, end) |
|
1341 if((fFlags&kBufferIsReadonly) && srcLength == 0) { |
|
1342 if(start == 0) { |
|
1343 // remove prefix by adjusting the array pointer |
|
1344 pinIndex(length); |
|
1345 fUnion.fFields.fArray += length; |
|
1346 fUnion.fFields.fCapacity -= length; |
|
1347 setLength(oldLength - length); |
|
1348 return *this; |
|
1349 } else { |
|
1350 pinIndex(start); |
|
1351 if(length >= (oldLength - start)) { |
|
1352 // remove suffix by reducing the length (like truncate()) |
|
1353 setLength(start); |
|
1354 fUnion.fFields.fCapacity = start; // not NUL-terminated any more |
|
1355 return *this; |
|
1356 } |
|
1357 } |
|
1358 } |
|
1359 |
|
1360 if(srcChars == 0) { |
|
1361 srcStart = srcLength = 0; |
|
1362 } else if(srcLength < 0) { |
|
1363 // get the srcLength if necessary |
|
1364 srcLength = u_strlen(srcChars + srcStart); |
|
1365 } |
|
1366 |
|
1367 // calculate the size of the string after the replace |
|
1368 int32_t newLength; |
|
1369 |
|
1370 // optimize append() onto a large-enough, owned string |
|
1371 if(start >= oldLength) { |
|
1372 if(srcLength == 0) { |
|
1373 return *this; |
|
1374 } |
|
1375 newLength = oldLength + srcLength; |
|
1376 if(newLength <= getCapacity() && isBufferWritable()) { |
|
1377 UChar *oldArray = getArrayStart(); |
|
1378 // Do not copy characters when |
|
1379 // UChar *buffer=str.getAppendBuffer(...); |
|
1380 // is followed by |
|
1381 // str.append(buffer, length); |
|
1382 // or |
|
1383 // str.appendString(buffer, length) |
|
1384 // or similar. |
|
1385 if(srcChars + srcStart != oldArray + start || start > oldLength) { |
|
1386 us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength); |
|
1387 } |
|
1388 setLength(newLength); |
|
1389 return *this; |
|
1390 } else { |
|
1391 // pin the indices to legal values |
|
1392 start = oldLength; |
|
1393 length = 0; |
|
1394 } |
|
1395 } else { |
|
1396 // pin the indices to legal values |
|
1397 pinIndices(start, length); |
|
1398 |
|
1399 newLength = oldLength - length + srcLength; |
|
1400 } |
|
1401 |
|
1402 // the following may change fArray but will not copy the current contents; |
|
1403 // therefore we need to keep the current fArray |
|
1404 UChar oldStackBuffer[US_STACKBUF_SIZE]; |
|
1405 UChar *oldArray; |
|
1406 if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { |
|
1407 // copy the stack buffer contents because it will be overwritten with |
|
1408 // fUnion.fFields values |
|
1409 u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength); |
|
1410 oldArray = oldStackBuffer; |
|
1411 } else { |
|
1412 oldArray = getArrayStart(); |
|
1413 } |
|
1414 |
|
1415 // clone our array and allocate a bigger array if needed |
|
1416 int32_t *bufferToDelete = 0; |
|
1417 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, |
|
1418 FALSE, &bufferToDelete) |
|
1419 ) { |
|
1420 return *this; |
|
1421 } |
|
1422 |
|
1423 // now do the replace |
|
1424 |
|
1425 UChar *newArray = getArrayStart(); |
|
1426 if(newArray != oldArray) { |
|
1427 // if fArray changed, then we need to copy everything except what will change |
|
1428 us_arrayCopy(oldArray, 0, newArray, 0, start); |
|
1429 us_arrayCopy(oldArray, start + length, |
|
1430 newArray, start + srcLength, |
|
1431 oldLength - (start + length)); |
|
1432 } else if(length != srcLength) { |
|
1433 // fArray did not change; copy only the portion that isn't changing, leaving a hole |
|
1434 us_arrayCopy(oldArray, start + length, |
|
1435 newArray, start + srcLength, |
|
1436 oldLength - (start + length)); |
|
1437 } |
|
1438 |
|
1439 // now fill in the hole with the new string |
|
1440 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); |
|
1441 |
|
1442 setLength(newLength); |
|
1443 |
|
1444 // delayed delete in case srcChars == fArray when we started, and |
|
1445 // to keep oldArray alive for the above operations |
|
1446 if (bufferToDelete) { |
|
1447 uprv_free(bufferToDelete); |
|
1448 } |
|
1449 |
|
1450 return *this; |
|
1451 } |
|
1452 |
|
1453 /** |
|
1454 * Replaceable API |
|
1455 */ |
|
1456 void |
|
1457 UnicodeString::handleReplaceBetween(int32_t start, |
|
1458 int32_t limit, |
|
1459 const UnicodeString& text) { |
|
1460 replaceBetween(start, limit, text); |
|
1461 } |
|
1462 |
|
1463 /** |
|
1464 * Replaceable API |
|
1465 */ |
|
1466 void |
|
1467 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { |
|
1468 if (limit <= start) { |
|
1469 return; // Nothing to do; avoid bogus malloc call |
|
1470 } |
|
1471 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); |
|
1472 // Check to make sure text is not null. |
|
1473 if (text != NULL) { |
|
1474 extractBetween(start, limit, text, 0); |
|
1475 insert(dest, text, 0, limit - start); |
|
1476 uprv_free(text); |
|
1477 } |
|
1478 } |
|
1479 |
|
1480 /** |
|
1481 * Replaceable API |
|
1482 * |
|
1483 * NOTE: This is for the Replaceable class. There is no rep.cpp, |
|
1484 * so we implement this function here. |
|
1485 */ |
|
1486 UBool Replaceable::hasMetaData() const { |
|
1487 return TRUE; |
|
1488 } |
|
1489 |
|
1490 /** |
|
1491 * Replaceable API |
|
1492 */ |
|
1493 UBool UnicodeString::hasMetaData() const { |
|
1494 return FALSE; |
|
1495 } |
|
1496 |
|
1497 UnicodeString& |
|
1498 UnicodeString::doReverse(int32_t start, int32_t length) { |
|
1499 if(length <= 1 || !cloneArrayIfNeeded()) { |
|
1500 return *this; |
|
1501 } |
|
1502 |
|
1503 // pin the indices to legal values |
|
1504 pinIndices(start, length); |
|
1505 if(length <= 1) { // pinIndices() might have shrunk the length |
|
1506 return *this; |
|
1507 } |
|
1508 |
|
1509 UChar *left = getArrayStart() + start; |
|
1510 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) |
|
1511 UChar swap; |
|
1512 UBool hasSupplementary = FALSE; |
|
1513 |
|
1514 // Before the loop we know left<right because length>=2. |
|
1515 do { |
|
1516 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); |
|
1517 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); |
|
1518 *right-- = swap; |
|
1519 } while(left < right); |
|
1520 // Make sure to test the middle code unit of an odd-length string. |
|
1521 // Redundant if the length is even. |
|
1522 hasSupplementary |= (UBool)U16_IS_LEAD(*left); |
|
1523 |
|
1524 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ |
|
1525 if(hasSupplementary) { |
|
1526 UChar swap2; |
|
1527 |
|
1528 left = getArrayStart() + start; |
|
1529 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right |
|
1530 while(left < right) { |
|
1531 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { |
|
1532 *left++ = swap2; |
|
1533 *left++ = swap; |
|
1534 } else { |
|
1535 ++left; |
|
1536 } |
|
1537 } |
|
1538 } |
|
1539 |
|
1540 return *this; |
|
1541 } |
|
1542 |
|
1543 UBool |
|
1544 UnicodeString::padLeading(int32_t targetLength, |
|
1545 UChar padChar) |
|
1546 { |
|
1547 int32_t oldLength = length(); |
|
1548 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { |
|
1549 return FALSE; |
|
1550 } else { |
|
1551 // move contents up by padding width |
|
1552 UChar *array = getArrayStart(); |
|
1553 int32_t start = targetLength - oldLength; |
|
1554 us_arrayCopy(array, 0, array, start, oldLength); |
|
1555 |
|
1556 // fill in padding character |
|
1557 while(--start >= 0) { |
|
1558 array[start] = padChar; |
|
1559 } |
|
1560 setLength(targetLength); |
|
1561 return TRUE; |
|
1562 } |
|
1563 } |
|
1564 |
|
1565 UBool |
|
1566 UnicodeString::padTrailing(int32_t targetLength, |
|
1567 UChar padChar) |
|
1568 { |
|
1569 int32_t oldLength = length(); |
|
1570 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { |
|
1571 return FALSE; |
|
1572 } else { |
|
1573 // fill in padding character |
|
1574 UChar *array = getArrayStart(); |
|
1575 int32_t length = targetLength; |
|
1576 while(--length >= oldLength) { |
|
1577 array[length] = padChar; |
|
1578 } |
|
1579 setLength(targetLength); |
|
1580 return TRUE; |
|
1581 } |
|
1582 } |
|
1583 |
|
1584 //======================================== |
|
1585 // Hashing |
|
1586 //======================================== |
|
1587 int32_t |
|
1588 UnicodeString::doHashCode() const |
|
1589 { |
|
1590 /* Delegate hash computation to uhash. This makes UnicodeString |
|
1591 * hashing consistent with UChar* hashing. */ |
|
1592 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); |
|
1593 if (hashCode == kInvalidHashCode) { |
|
1594 hashCode = kEmptyHashCode; |
|
1595 } |
|
1596 return hashCode; |
|
1597 } |
|
1598 |
|
1599 //======================================== |
|
1600 // External Buffer |
|
1601 //======================================== |
|
1602 |
|
1603 UChar * |
|
1604 UnicodeString::getBuffer(int32_t minCapacity) { |
|
1605 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { |
|
1606 fFlags|=kOpenGetBuffer; |
|
1607 fShortLength=0; |
|
1608 return getArrayStart(); |
|
1609 } else { |
|
1610 return 0; |
|
1611 } |
|
1612 } |
|
1613 |
|
1614 void |
|
1615 UnicodeString::releaseBuffer(int32_t newLength) { |
|
1616 if(fFlags&kOpenGetBuffer && newLength>=-1) { |
|
1617 // set the new fLength |
|
1618 int32_t capacity=getCapacity(); |
|
1619 if(newLength==-1) { |
|
1620 // the new length is the string length, capped by fCapacity |
|
1621 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; |
|
1622 while(p<limit && *p!=0) { |
|
1623 ++p; |
|
1624 } |
|
1625 newLength=(int32_t)(p-array); |
|
1626 } else if(newLength>capacity) { |
|
1627 newLength=capacity; |
|
1628 } |
|
1629 setLength(newLength); |
|
1630 fFlags&=~kOpenGetBuffer; |
|
1631 } |
|
1632 } |
|
1633 |
|
1634 //======================================== |
|
1635 // Miscellaneous |
|
1636 //======================================== |
|
1637 UBool |
|
1638 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, |
|
1639 int32_t growCapacity, |
|
1640 UBool doCopyArray, |
|
1641 int32_t **pBufferToDelete, |
|
1642 UBool forceClone) { |
|
1643 // default parameters need to be static, therefore |
|
1644 // the defaults are -1 to have convenience defaults |
|
1645 if(newCapacity == -1) { |
|
1646 newCapacity = getCapacity(); |
|
1647 } |
|
1648 |
|
1649 // while a getBuffer(minCapacity) is "open", |
|
1650 // prevent any modifications of the string by returning FALSE here |
|
1651 // if the string is bogus, then only an assignment or similar can revive it |
|
1652 if(!isWritable()) { |
|
1653 return FALSE; |
|
1654 } |
|
1655 |
|
1656 /* |
|
1657 * We need to make a copy of the array if |
|
1658 * the buffer is read-only, or |
|
1659 * the buffer is refCounted (shared), and refCount>1, or |
|
1660 * the buffer is too small. |
|
1661 * Return FALSE if memory could not be allocated. |
|
1662 */ |
|
1663 if(forceClone || |
|
1664 fFlags & kBufferIsReadonly || |
|
1665 (fFlags & kRefCounted && refCount() > 1) || |
|
1666 newCapacity > getCapacity() |
|
1667 ) { |
|
1668 // check growCapacity for default value and use of the stack buffer |
|
1669 if(growCapacity < 0) { |
|
1670 growCapacity = newCapacity; |
|
1671 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { |
|
1672 growCapacity = US_STACKBUF_SIZE; |
|
1673 } |
|
1674 |
|
1675 // save old values |
|
1676 UChar oldStackBuffer[US_STACKBUF_SIZE]; |
|
1677 UChar *oldArray; |
|
1678 uint8_t flags = fFlags; |
|
1679 |
|
1680 if(flags&kUsingStackBuffer) { |
|
1681 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ |
|
1682 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { |
|
1683 // copy the stack buffer contents because it will be overwritten with |
|
1684 // fUnion.fFields values |
|
1685 us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength); |
|
1686 oldArray = oldStackBuffer; |
|
1687 } else { |
|
1688 oldArray = 0; // no need to copy from stack buffer to itself |
|
1689 } |
|
1690 } else { |
|
1691 oldArray = fUnion.fFields.fArray; |
|
1692 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ |
|
1693 } |
|
1694 |
|
1695 // allocate a new array |
|
1696 if(allocate(growCapacity) || |
|
1697 (newCapacity < growCapacity && allocate(newCapacity)) |
|
1698 ) { |
|
1699 if(doCopyArray && oldArray != 0) { |
|
1700 // copy the contents |
|
1701 // do not copy more than what fits - it may be smaller than before |
|
1702 int32_t minLength = length(); |
|
1703 newCapacity = getCapacity(); |
|
1704 if(newCapacity < minLength) { |
|
1705 minLength = newCapacity; |
|
1706 setLength(minLength); |
|
1707 } |
|
1708 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); |
|
1709 } else { |
|
1710 fShortLength = 0; |
|
1711 } |
|
1712 |
|
1713 // release the old array |
|
1714 if(flags & kRefCounted) { |
|
1715 // the array is refCounted; decrement and release if 0 |
|
1716 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); |
|
1717 if(umtx_atomic_dec(pRefCount) == 0) { |
|
1718 if(pBufferToDelete == 0) { |
|
1719 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t |
|
1720 // is defined as volatile. (Volatile has useful non-standard behavior |
|
1721 // with this compiler.) |
|
1722 uprv_free((void *)pRefCount); |
|
1723 } else { |
|
1724 // the caller requested to delete it himself |
|
1725 *pBufferToDelete = (int32_t *)pRefCount; |
|
1726 } |
|
1727 } |
|
1728 } |
|
1729 } else { |
|
1730 // not enough memory for growCapacity and not even for the smaller newCapacity |
|
1731 // reset the old values for setToBogus() to release the array |
|
1732 if(!(flags&kUsingStackBuffer)) { |
|
1733 fUnion.fFields.fArray = oldArray; |
|
1734 } |
|
1735 fFlags = flags; |
|
1736 setToBogus(); |
|
1737 return FALSE; |
|
1738 } |
|
1739 } |
|
1740 return TRUE; |
|
1741 } |
|
1742 |
|
1743 // UnicodeStringAppendable ------------------------------------------------- *** |
|
1744 |
|
1745 UnicodeStringAppendable::~UnicodeStringAppendable() {} |
|
1746 |
|
1747 UBool |
|
1748 UnicodeStringAppendable::appendCodeUnit(UChar c) { |
|
1749 return str.doReplace(str.length(), 0, &c, 0, 1).isWritable(); |
|
1750 } |
|
1751 |
|
1752 UBool |
|
1753 UnicodeStringAppendable::appendCodePoint(UChar32 c) { |
|
1754 UChar buffer[U16_MAX_LENGTH]; |
|
1755 int32_t cLength = 0; |
|
1756 UBool isError = FALSE; |
|
1757 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); |
|
1758 return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable(); |
|
1759 } |
|
1760 |
|
1761 UBool |
|
1762 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { |
|
1763 return str.doReplace(str.length(), 0, s, 0, length).isWritable(); |
|
1764 } |
|
1765 |
|
1766 UBool |
|
1767 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { |
|
1768 return str.cloneArrayIfNeeded(str.length() + appendCapacity); |
|
1769 } |
|
1770 |
|
1771 UChar * |
|
1772 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, |
|
1773 int32_t desiredCapacityHint, |
|
1774 UChar *scratch, int32_t scratchCapacity, |
|
1775 int32_t *resultCapacity) { |
|
1776 if(minCapacity < 1 || scratchCapacity < minCapacity) { |
|
1777 *resultCapacity = 0; |
|
1778 return NULL; |
|
1779 } |
|
1780 int32_t oldLength = str.length(); |
|
1781 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { |
|
1782 *resultCapacity = str.getCapacity() - oldLength; |
|
1783 return str.getArrayStart() + oldLength; |
|
1784 } |
|
1785 *resultCapacity = scratchCapacity; |
|
1786 return scratch; |
|
1787 } |
|
1788 |
|
1789 U_NAMESPACE_END |
|
1790 |
|
1791 U_NAMESPACE_USE |
|
1792 |
|
1793 U_CAPI int32_t U_EXPORT2 |
|
1794 uhash_hashUnicodeString(const UElement key) { |
|
1795 const UnicodeString *str = (const UnicodeString*) key.pointer; |
|
1796 return (str == NULL) ? 0 : str->hashCode(); |
|
1797 } |
|
1798 |
|
1799 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString* |
|
1800 // does not depend on hashtable code. |
|
1801 U_CAPI UBool U_EXPORT2 |
|
1802 uhash_compareUnicodeString(const UElement key1, const UElement key2) { |
|
1803 const UnicodeString *str1 = (const UnicodeString*) key1.pointer; |
|
1804 const UnicodeString *str2 = (const UnicodeString*) key2.pointer; |
|
1805 if (str1 == str2) { |
|
1806 return TRUE; |
|
1807 } |
|
1808 if (str1 == NULL || str2 == NULL) { |
|
1809 return FALSE; |
|
1810 } |
|
1811 return *str1 == *str2; |
|
1812 } |
|
1813 |
|
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. Its only purpose is to force the
virtual vector deleting destructor to be defined within unistr.cpp.
UObject already has a vector deleting destructor, but defining it here
makes sure that it is included with this object file, which keeps
static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *pair = new UnicodeString[2];
    delete [] pair;
}
#endif