1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/unistr.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1825 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* Copyright (C) 1999-2013, International Business Machines Corporation and 1.7 +* others. All Rights Reserved. 1.8 +****************************************************************************** 1.9 +* 1.10 +* File unistr.cpp 1.11 +* 1.12 +* Modification History: 1.13 +* 1.14 +* Date Name Description 1.15 +* 09/25/98 stephen Creation. 1.16 +* 04/20/99 stephen Overhauled per 4/16 code review. 1.17 +* 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX 1.18 +* 11/18/99 aliu Added handleReplaceBetween() to make inherit from 1.19 +* Replaceable. 1.20 +* 06/25/01 grhoten Removed the dependency on iostream 1.21 +****************************************************************************** 1.22 +*/ 1.23 + 1.24 +#include "unicode/utypes.h" 1.25 +#include "unicode/appendable.h" 1.26 +#include "unicode/putil.h" 1.27 +#include "cstring.h" 1.28 +#include "cmemory.h" 1.29 +#include "unicode/ustring.h" 1.30 +#include "unicode/unistr.h" 1.31 +#include "unicode/utf.h" 1.32 +#include "unicode/utf16.h" 1.33 +#include "uelement.h" 1.34 +#include "ustr_imp.h" 1.35 +#include "umutex.h" 1.36 +#include "uassert.h" 1.37 + 1.38 +#if 0 1.39 + 1.40 +#include <iostream> 1.41 +using namespace std; 1.42 + 1.43 +//DEBUGGING 1.44 +void 1.45 +print(const UnicodeString& s, 1.46 + const char *name) 1.47 +{ 1.48 + UChar c; 1.49 + cout << name << ":|"; 1.50 + for(int i = 0; i < s.length(); ++i) { 1.51 + c = s[i]; 1.52 + if(c>= 0x007E || c < 0x0020) 1.53 + cout << "[0x" << hex << s[i] << "]"; 1.54 + else 1.55 + cout << (char) s[i]; 1.56 + } 1.57 + cout << '|' << endl; 1.58 +} 1.59 + 1.60 +void 1.61 +print(const UChar *s, 1.62 + int32_t len, 1.63 + const char *name) 1.64 +{ 1.65 + UChar c; 1.66 + cout << name << ":|"; 1.67 + for(int i = 0; i < len; ++i) { 1.68 + c = s[i]; 1.69 + if(c>= 0x007E || c < 0x0020) 1.70 + cout << "[0x" << hex << s[i] << "]"; 1.71 + else 1.72 + cout << (char) s[i]; 1.73 + } 1.74 + cout << '|' << endl; 1.75 +} 1.76 +// END DEBUGGING 1.77 +#endif 1.78 + 1.79 +// Local function definitions for now 1.80 + 1.81 +// need to copy areas that may overlap 1.82 +static 1.83 +inline void 1.84 +us_arrayCopy(const UChar *src, int32_t srcStart, 1.85 + UChar *dst, int32_t dstStart, int32_t count) 1.86 +{ 1.87 + if(count>0) { 1.88 + uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src))); 1.89 + } 1.90 +} 1.91 + 1.92 +// u_unescapeAt() callback to get a UChar from a UnicodeString 1.93 +U_CDECL_BEGIN 1.94 +static UChar U_CALLCONV 1.95 +UnicodeString_charAt(int32_t offset, void *context) { 1.96 + return ((icu::UnicodeString*) context)->charAt(offset); 1.97 +} 1.98 +U_CDECL_END 1.99 + 1.100 +U_NAMESPACE_BEGIN 1.101 + 1.102 +/* The Replaceable virtual destructor can't be defined in the header 1.103 + due to how AIX works with multiple definitions of virtual functions. 1.104 +*/ 1.105 +Replaceable::~Replaceable() {} 1.106 + 1.107 +UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString) 1.108 + 1.109 +UnicodeString U_EXPORT2 1.110 +operator+ (const UnicodeString &s1, const UnicodeString &s2) { 1.111 + return 1.112 + UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0). 1.113 + append(s1). 1.114 + append(s2); 1.115 +} 1.116 + 1.117 +//======================================== 1.118 +// Reference Counting functions, put at top of file so that optimizing compilers 1.119 +// have a chance to automatically inline. 1.120 +//======================================== 1.121 + 1.122 +void 1.123 +UnicodeString::addRef() { 1.124 + umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 1.125 +} 1.126 + 1.127 +int32_t 1.128 +UnicodeString::removeRef() { 1.129 + return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1); 1.130 +} 1.131 + 1.132 +int32_t 1.133 +UnicodeString::refCount() const { 1.134 + return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1)); 1.135 +} 1.136 + 1.137 +void 1.138 +UnicodeString::releaseArray() { 1.139 + if((fFlags & kRefCounted) && removeRef() == 0) { 1.140 + uprv_free((int32_t *)fUnion.fFields.fArray - 1); 1.141 + } 1.142 +} 1.143 + 1.144 + 1.145 + 1.146 +//======================================== 1.147 +// Constructors 1.148 +//======================================== 1.149 + 1.150 +// The default constructor is inline in unistr.h. 1.151 + 1.152 +UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) 1.153 + : fShortLength(0), 1.154 + fFlags(0) 1.155 +{ 1.156 + if(count <= 0 || (uint32_t)c > 0x10ffff) { 1.157 + // just allocate and do not do anything else 1.158 + allocate(capacity); 1.159 + } else { 1.160 + // count > 0, allocate and fill the new string with count c's 1.161 + int32_t unitCount = U16_LENGTH(c), length = count * unitCount; 1.162 + if(capacity < length) { 1.163 + capacity = length; 1.164 + } 1.165 + if(allocate(capacity)) { 1.166 + UChar *array = getArrayStart(); 1.167 + int32_t i = 0; 1.168 + 1.169 + // fill the new string with c 1.170 + if(unitCount == 1) { 1.171 + // fill with length UChars 1.172 + while(i < length) { 1.173 + array[i++] = (UChar)c; 1.174 + } 1.175 + } else { 1.176 + // get the code units for c 1.177 + UChar units[U16_MAX_LENGTH]; 1.178 + U16_APPEND_UNSAFE(units, i, c); 1.179 + 1.180 + // now it must be i==unitCount 1.181 + i = 0; 1.182 + 1.183 + // for Unicode, unitCount can only be 1, 2, 3, or 4 1.184 + // 1 is handled above 1.185 + while(i < length) { 1.186 + int32_t unitIdx = 0; 1.187 + while(unitIdx < unitCount) { 1.188 + array[i++]=units[unitIdx++]; 1.189 + } 1.190 + } 1.191 + } 1.192 + } 1.193 + setLength(length); 1.194 + } 1.195 +} 1.196 + 1.197 +UnicodeString::UnicodeString(UChar ch) 1.198 + : fShortLength(1), 1.199 + fFlags(kShortString) 1.200 +{ 1.201 + fUnion.fStackBuffer[0] = ch; 1.202 +} 1.203 + 1.204 +UnicodeString::UnicodeString(UChar32 ch) 1.205 + : fShortLength(0), 1.206 + fFlags(kShortString) 1.207 +{ 1.208 + int32_t i = 0; 1.209 + UBool isError = FALSE; 1.210 + U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError); 1.211 + // We test isError so that the compiler does not complain that we don't. 1.212 + // If isError then i==0 which is what we want anyway. 1.213 + if(!isError) { 1.214 + fShortLength = (int8_t)i; 1.215 + } 1.216 +} 1.217 + 1.218 +UnicodeString::UnicodeString(const UChar *text) 1.219 + : fShortLength(0), 1.220 + fFlags(kShortString) 1.221 +{ 1.222 + doReplace(0, 0, text, 0, -1); 1.223 +} 1.224 + 1.225 +UnicodeString::UnicodeString(const UChar *text, 1.226 + int32_t textLength) 1.227 + : fShortLength(0), 1.228 + fFlags(kShortString) 1.229 +{ 1.230 + doReplace(0, 0, text, 0, textLength); 1.231 +} 1.232 + 1.233 +UnicodeString::UnicodeString(UBool isTerminated, 1.234 + const UChar *text, 1.235 + int32_t textLength) 1.236 + : fShortLength(0), 1.237 + fFlags(kReadonlyAlias) 1.238 +{ 1.239 + if(text == NULL) { 1.240 + // treat as an empty string, do not alias 1.241 + setToEmpty(); 1.242 + } else if(textLength < -1 || 1.243 + (textLength == -1 && !isTerminated) || 1.244 + (textLength >= 0 && isTerminated && text[textLength] != 0) 1.245 + ) { 1.246 + setToBogus(); 1.247 + } else { 1.248 + if(textLength == -1) { 1.249 + // text is terminated, or else it would have failed the above test 1.250 + textLength = u_strlen(text); 1.251 + } 1.252 + setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1.253 + } 1.254 +} 1.255 + 1.256 +UnicodeString::UnicodeString(UChar *buff, 1.257 + int32_t buffLength, 1.258 + int32_t buffCapacity) 1.259 + : fShortLength(0), 1.260 + fFlags(kWritableAlias) 1.261 +{ 1.262 + if(buff == NULL) { 1.263 + // treat as an empty string, do not alias 1.264 + setToEmpty(); 1.265 + } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1.266 + setToBogus(); 1.267 + } else { 1.268 + if(buffLength == -1) { 1.269 + // fLength = u_strlen(buff); but do not look beyond buffCapacity 1.270 + const UChar *p = buff, *limit = buff + buffCapacity; 1.271 + while(p != limit && *p != 0) { 1.272 + ++p; 1.273 + } 1.274 + buffLength = (int32_t)(p - buff); 1.275 + } 1.276 + setArray(buff, buffLength, buffCapacity); 1.277 + } 1.278 +} 1.279 + 1.280 +UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) 1.281 + : fShortLength(0), 1.282 + fFlags(kShortString) 1.283 +{ 1.284 + if(src==NULL) { 1.285 + // treat as an empty string 1.286 + } else { 1.287 + if(length<0) { 1.288 + length=(int32_t)uprv_strlen(src); 1.289 + } 1.290 + if(cloneArrayIfNeeded(length, length, FALSE)) { 1.291 + u_charsToUChars(src, getArrayStart(), length); 1.292 + setLength(length); 1.293 + } else { 1.294 + setToBogus(); 1.295 + } 1.296 + } 1.297 +} 1.298 + 1.299 +#if U_CHARSET_IS_UTF8 1.300 + 1.301 +UnicodeString::UnicodeString(const char *codepageData) 1.302 + : fShortLength(0), 1.303 + fFlags(kShortString) { 1.304 + if(codepageData != 0) { 1.305 + setToUTF8(codepageData); 1.306 + } 1.307 +} 1.308 + 1.309 +UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) 1.310 + : fShortLength(0), 1.311 + fFlags(kShortString) { 1.312 + // if there's nothing to convert, do nothing 1.313 + if(codepageData == 0 || dataLength == 0 || dataLength < -1) { 1.314 + return; 1.315 + } 1.316 + if(dataLength == -1) { 1.317 + dataLength = (int32_t)uprv_strlen(codepageData); 1.318 + } 1.319 + setToUTF8(StringPiece(codepageData, dataLength)); 1.320 +} 1.321 + 1.322 +// else see unistr_cnv.cpp 1.323 +#endif 1.324 + 1.325 +UnicodeString::UnicodeString(const UnicodeString& that) 1.326 + : Replaceable(), 1.327 + fShortLength(0), 1.328 + fFlags(kShortString) 1.329 +{ 1.330 + copyFrom(that); 1.331 +} 1.332 + 1.333 +UnicodeString::UnicodeString(const UnicodeString& that, 1.334 + int32_t srcStart) 1.335 + : Replaceable(), 1.336 + fShortLength(0), 1.337 + fFlags(kShortString) 1.338 +{ 1.339 + setTo(that, srcStart); 1.340 +} 1.341 + 1.342 +UnicodeString::UnicodeString(const UnicodeString& that, 1.343 + int32_t srcStart, 1.344 + int32_t srcLength) 1.345 + : Replaceable(), 1.346 + fShortLength(0), 1.347 + fFlags(kShortString) 1.348 +{ 1.349 + setTo(that, srcStart, srcLength); 1.350 +} 1.351 + 1.352 +// Replaceable base class clone() default implementation, does not clone 1.353 +Replaceable * 1.354 +Replaceable::clone() const { 1.355 + return NULL; 1.356 +} 1.357 + 1.358 +// UnicodeString overrides clone() with a real implementation 1.359 +Replaceable * 1.360 +UnicodeString::clone() const { 1.361 + return new UnicodeString(*this); 1.362 +} 1.363 + 1.364 +//======================================== 1.365 +// array allocation 1.366 +//======================================== 1.367 + 1.368 +UBool 1.369 +UnicodeString::allocate(int32_t capacity) { 1.370 + if(capacity <= US_STACKBUF_SIZE) { 1.371 + fFlags = kShortString; 1.372 + } else { 1.373 + // count bytes for the refCounter and the string capacity, and 1.374 + // round up to a multiple of 16; then divide by 4 and allocate int32_t's 1.375 + // to be safely aligned for the refCount 1.376 + // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer() 1.377 + int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2); 1.378 + int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words ); 1.379 + if(array != 0) { 1.380 + // set initial refCount and point behind the refCount 1.381 + *array++ = 1; 1.382 + 1.383 + // have fArray point to the first UChar 1.384 + fUnion.fFields.fArray = (UChar *)array; 1.385 + fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR)); 1.386 + fFlags = kLongString; 1.387 + } else { 1.388 + fShortLength = 0; 1.389 + fUnion.fFields.fArray = 0; 1.390 + fUnion.fFields.fCapacity = 0; 1.391 + fFlags = kIsBogus; 1.392 + return FALSE; 1.393 + } 1.394 + } 1.395 + return TRUE; 1.396 +} 1.397 + 1.398 +//======================================== 1.399 +// Destructor 1.400 +//======================================== 1.401 +UnicodeString::~UnicodeString() 1.402 +{ 1.403 + releaseArray(); 1.404 +} 1.405 + 1.406 +//======================================== 1.407 +// Factory methods 1.408 +//======================================== 1.409 + 1.410 +UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) { 1.411 + UnicodeString result; 1.412 + result.setToUTF8(utf8); 1.413 + return result; 1.414 +} 1.415 + 1.416 +UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) { 1.417 + UnicodeString result; 1.418 + int32_t capacity; 1.419 + // Most UTF-32 strings will be BMP-only and result in a same-length 1.420 + // UTF-16 string. We overestimate the capacity just slightly, 1.421 + // just in case there are a few supplementary characters. 1.422 + if(length <= US_STACKBUF_SIZE) { 1.423 + capacity = US_STACKBUF_SIZE; 1.424 + } else { 1.425 + capacity = length + (length >> 4) + 4; 1.426 + } 1.427 + do { 1.428 + UChar *utf16 = result.getBuffer(capacity); 1.429 + int32_t length16; 1.430 + UErrorCode errorCode = U_ZERO_ERROR; 1.431 + u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16, 1.432 + utf32, length, 1.433 + 0xfffd, // Substitution character. 1.434 + NULL, // Don't care about number of substitutions. 1.435 + &errorCode); 1.436 + result.releaseBuffer(length16); 1.437 + if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 1.438 + capacity = length16 + 1; // +1 for the terminating NUL. 1.439 + continue; 1.440 + } else if(U_FAILURE(errorCode)) { 1.441 + result.setToBogus(); 1.442 + } 1.443 + break; 1.444 + } while(TRUE); 1.445 + return result; 1.446 +} 1.447 + 1.448 +//======================================== 1.449 +// Assignment 1.450 +//======================================== 1.451 + 1.452 +UnicodeString & 1.453 +UnicodeString::operator=(const UnicodeString &src) { 1.454 + return copyFrom(src); 1.455 +} 1.456 + 1.457 +UnicodeString & 1.458 +UnicodeString::fastCopyFrom(const UnicodeString &src) { 1.459 + return copyFrom(src, TRUE); 1.460 +} 1.461 + 1.462 +UnicodeString & 1.463 +UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) { 1.464 + // if assigning to ourselves, do nothing 1.465 + if(this == 0 || this == &src) { 1.466 + return *this; 1.467 + } 1.468 + 1.469 + // is the right side bogus? 1.470 + if(&src == 0 || src.isBogus()) { 1.471 + setToBogus(); 1.472 + return *this; 1.473 + } 1.474 + 1.475 + // delete the current contents 1.476 + releaseArray(); 1.477 + 1.478 + if(src.isEmpty()) { 1.479 + // empty string - use the stack buffer 1.480 + setToEmpty(); 1.481 + return *this; 1.482 + } 1.483 + 1.484 + // we always copy the length 1.485 + int32_t srcLength = src.length(); 1.486 + setLength(srcLength); 1.487 + 1.488 + // fLength>0 and not an "open" src.getBuffer(minCapacity) 1.489 + switch(src.fFlags) { 1.490 + case kShortString: 1.491 + // short string using the stack buffer, do the same 1.492 + fFlags = kShortString; 1.493 + uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR); 1.494 + break; 1.495 + case kLongString: 1.496 + // src uses a refCounted string buffer, use that buffer with refCount 1.497 + // src is const, use a cast - we don't really change it 1.498 + ((UnicodeString &)src).addRef(); 1.499 + // copy all fields, share the reference-counted buffer 1.500 + fUnion.fFields.fArray = src.fUnion.fFields.fArray; 1.501 + fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 1.502 + fFlags = src.fFlags; 1.503 + break; 1.504 + case kReadonlyAlias: 1.505 + if(fastCopy) { 1.506 + // src is a readonly alias, do the same 1.507 + // -> maintain the readonly alias as such 1.508 + fUnion.fFields.fArray = src.fUnion.fFields.fArray; 1.509 + fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity; 1.510 + fFlags = src.fFlags; 1.511 + break; 1.512 + } 1.513 + // else if(!fastCopy) fall through to case kWritableAlias 1.514 + // -> allocate a new buffer and copy the contents 1.515 + case kWritableAlias: 1.516 + // src is a writable alias; we make a copy of that instead 1.517 + if(allocate(srcLength)) { 1.518 + uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR); 1.519 + break; 1.520 + } 1.521 + // if there is not enough memory, then fall through to setting to bogus 1.522 + default: 1.523 + // if src is bogus, set ourselves to bogus 1.524 + // do not call setToBogus() here because fArray and fFlags are not consistent here 1.525 + fShortLength = 0; 1.526 + fUnion.fFields.fArray = 0; 1.527 + fUnion.fFields.fCapacity = 0; 1.528 + fFlags = kIsBogus; 1.529 + break; 1.530 + } 1.531 + 1.532 + return *this; 1.533 +} 1.534 + 1.535 +//======================================== 1.536 +// Miscellaneous operations 1.537 +//======================================== 1.538 + 1.539 +UnicodeString UnicodeString::unescape() const { 1.540 + UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity 1.541 + const UChar *array = getBuffer(); 1.542 + int32_t len = length(); 1.543 + int32_t prev = 0; 1.544 + for (int32_t i=0;;) { 1.545 + if (i == len) { 1.546 + result.append(array, prev, len - prev); 1.547 + break; 1.548 + } 1.549 + if (array[i++] == 0x5C /*'\\'*/) { 1.550 + result.append(array, prev, (i - 1) - prev); 1.551 + UChar32 c = unescapeAt(i); // advances i 1.552 + if (c < 0) { 1.553 + result.remove(); // return empty string 1.554 + break; // invalid escape sequence 1.555 + } 1.556 + result.append(c); 1.557 + prev = i; 1.558 + } 1.559 + } 1.560 + return result; 1.561 +} 1.562 + 1.563 +UChar32 UnicodeString::unescapeAt(int32_t &offset) const { 1.564 + return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this); 1.565 +} 1.566 + 1.567 +//======================================== 1.568 +// Read-only implementation 1.569 +//======================================== 1.570 +UBool 1.571 +UnicodeString::doEquals(const UnicodeString &text, int32_t len) const { 1.572 + // Requires: this & text not bogus and have same lengths. 1.573 + // Byte-wise comparison works for equality regardless of endianness. 1.574 + return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0; 1.575 +} 1.576 + 1.577 +int8_t 1.578 +UnicodeString::doCompare( int32_t start, 1.579 + int32_t length, 1.580 + const UChar *srcChars, 1.581 + int32_t srcStart, 1.582 + int32_t srcLength) const 1.583 +{ 1.584 + // compare illegal string values 1.585 + if(isBogus()) { 1.586 + return -1; 1.587 + } 1.588 + 1.589 + // pin indices to legal values 1.590 + pinIndices(start, length); 1.591 + 1.592 + if(srcChars == NULL) { 1.593 + // treat const UChar *srcChars==NULL as an empty string 1.594 + return length == 0 ? 0 : 1; 1.595 + } 1.596 + 1.597 + // get the correct pointer 1.598 + const UChar *chars = getArrayStart(); 1.599 + 1.600 + chars += start; 1.601 + srcChars += srcStart; 1.602 + 1.603 + int32_t minLength; 1.604 + int8_t lengthResult; 1.605 + 1.606 + // get the srcLength if necessary 1.607 + if(srcLength < 0) { 1.608 + srcLength = u_strlen(srcChars + srcStart); 1.609 + } 1.610 + 1.611 + // are we comparing different lengths? 1.612 + if(length != srcLength) { 1.613 + if(length < srcLength) { 1.614 + minLength = length; 1.615 + lengthResult = -1; 1.616 + } else { 1.617 + minLength = srcLength; 1.618 + lengthResult = 1; 1.619 + } 1.620 + } else { 1.621 + minLength = length; 1.622 + lengthResult = 0; 1.623 + } 1.624 + 1.625 + /* 1.626 + * note that uprv_memcmp() returns an int but we return an int8_t; 1.627 + * we need to take care not to truncate the result - 1.628 + * one way to do this is to right-shift the value to 1.629 + * move the sign bit into the lower 8 bits and making sure that this 1.630 + * does not become 0 itself 1.631 + */ 1.632 + 1.633 + if(minLength > 0 && chars != srcChars) { 1.634 + int32_t result; 1.635 + 1.636 +# if U_IS_BIG_ENDIAN 1.637 + // big-endian: byte comparison works 1.638 + result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar)); 1.639 + if(result != 0) { 1.640 + return (int8_t)(result >> 15 | 1); 1.641 + } 1.642 +# else 1.643 + // little-endian: compare UChar units 1.644 + do { 1.645 + result = ((int32_t)*(chars++) - (int32_t)*(srcChars++)); 1.646 + if(result != 0) { 1.647 + return (int8_t)(result >> 15 | 1); 1.648 + } 1.649 + } while(--minLength > 0); 1.650 +# endif 1.651 + } 1.652 + return lengthResult; 1.653 +} 1.654 + 1.655 +/* String compare in code point order - doCompare() compares in code unit order. */ 1.656 +int8_t 1.657 +UnicodeString::doCompareCodePointOrder(int32_t start, 1.658 + int32_t length, 1.659 + const UChar *srcChars, 1.660 + int32_t srcStart, 1.661 + int32_t srcLength) const 1.662 +{ 1.663 + // compare illegal string values 1.664 + // treat const UChar *srcChars==NULL as an empty string 1.665 + if(isBogus()) { 1.666 + return -1; 1.667 + } 1.668 + 1.669 + // pin indices to legal values 1.670 + pinIndices(start, length); 1.671 + 1.672 + if(srcChars == NULL) { 1.673 + srcStart = srcLength = 0; 1.674 + } 1.675 + 1.676 + int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE); 1.677 + /* translate the 32-bit result into an 8-bit one */ 1.678 + if(diff!=0) { 1.679 + return (int8_t)(diff >> 15 | 1); 1.680 + } else { 1.681 + return 0; 1.682 + } 1.683 +} 1.684 + 1.685 +int32_t 1.686 +UnicodeString::getLength() const { 1.687 + return length(); 1.688 +} 1.689 + 1.690 +UChar 1.691 +UnicodeString::getCharAt(int32_t offset) const { 1.692 + return charAt(offset); 1.693 +} 1.694 + 1.695 +UChar32 1.696 +UnicodeString::getChar32At(int32_t offset) const { 1.697 + return char32At(offset); 1.698 +} 1.699 + 1.700 +UChar32 1.701 +UnicodeString::char32At(int32_t offset) const 1.702 +{ 1.703 + int32_t len = length(); 1.704 + if((uint32_t)offset < (uint32_t)len) { 1.705 + const UChar *array = getArrayStart(); 1.706 + UChar32 c; 1.707 + U16_GET(array, 0, offset, len, c); 1.708 + return c; 1.709 + } else { 1.710 + return kInvalidUChar; 1.711 + } 1.712 +} 1.713 + 1.714 +int32_t 1.715 +UnicodeString::getChar32Start(int32_t offset) const { 1.716 + if((uint32_t)offset < (uint32_t)length()) { 1.717 + const UChar *array = getArrayStart(); 1.718 + U16_SET_CP_START(array, 0, offset); 1.719 + return offset; 1.720 + } else { 1.721 + return 0; 1.722 + } 1.723 +} 1.724 + 1.725 +int32_t 1.726 +UnicodeString::getChar32Limit(int32_t offset) const { 1.727 + int32_t len = length(); 1.728 + if((uint32_t)offset < (uint32_t)len) { 1.729 + const UChar *array = getArrayStart(); 1.730 + U16_SET_CP_LIMIT(array, 0, offset, len); 1.731 + return offset; 1.732 + } else { 1.733 + return len; 1.734 + } 1.735 +} 1.736 + 1.737 +int32_t 1.738 +UnicodeString::countChar32(int32_t start, int32_t length) const { 1.739 + pinIndices(start, length); 1.740 + // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL 1.741 + return u_countChar32(getArrayStart()+start, length); 1.742 +} 1.743 + 1.744 +UBool 1.745 +UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const { 1.746 + pinIndices(start, length); 1.747 + // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL 1.748 + return u_strHasMoreChar32Than(getArrayStart()+start, length, number); 1.749 +} 1.750 + 1.751 +int32_t 1.752 +UnicodeString::moveIndex32(int32_t index, int32_t delta) const { 1.753 + // pin index 1.754 + int32_t len = length(); 1.755 + if(index<0) { 1.756 + index=0; 1.757 + } else if(index>len) { 1.758 + index=len; 1.759 + } 1.760 + 1.761 + const UChar *array = getArrayStart(); 1.762 + if(delta>0) { 1.763 + U16_FWD_N(array, index, len, delta); 1.764 + } else { 1.765 + U16_BACK_N(array, 0, index, -delta); 1.766 + } 1.767 + 1.768 + return index; 1.769 +} 1.770 + 1.771 +void 1.772 +UnicodeString::doExtract(int32_t start, 1.773 + int32_t length, 1.774 + UChar *dst, 1.775 + int32_t dstStart) const 1.776 +{ 1.777 + // pin indices to legal values 1.778 + pinIndices(start, length); 1.779 + 1.780 + // do not copy anything if we alias dst itself 1.781 + const UChar *array = getArrayStart(); 1.782 + if(array + start != dst + dstStart) { 1.783 + us_arrayCopy(array, start, dst, dstStart, length); 1.784 + } 1.785 +} 1.786 + 1.787 +int32_t 1.788 +UnicodeString::extract(UChar *dest, int32_t destCapacity, 1.789 + UErrorCode &errorCode) const { 1.790 + int32_t len = length(); 1.791 + if(U_SUCCESS(errorCode)) { 1.792 + if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) { 1.793 + errorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.794 + } else { 1.795 + const UChar *array = getArrayStart(); 1.796 + if(len>0 && len<=destCapacity && array!=dest) { 1.797 + uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR); 1.798 + } 1.799 + return u_terminateUChars(dest, destCapacity, len, &errorCode); 1.800 + } 1.801 + } 1.802 + 1.803 + return len; 1.804 +} 1.805 + 1.806 +int32_t 1.807 +UnicodeString::extract(int32_t start, 1.808 + int32_t length, 1.809 + char *target, 1.810 + int32_t targetCapacity, 1.811 + enum EInvariant) const 1.812 +{ 1.813 + // if the arguments are illegal, then do nothing 1.814 + if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) { 1.815 + return 0; 1.816 + } 1.817 + 1.818 + // pin the indices to legal values 1.819 + pinIndices(start, length); 1.820 + 1.821 + if(length <= targetCapacity) { 1.822 + u_UCharsToChars(getArrayStart() + start, target, length); 1.823 + } 1.824 + UErrorCode status = U_ZERO_ERROR; 1.825 + return u_terminateChars(target, targetCapacity, length, &status); 1.826 +} 1.827 + 1.828 +UnicodeString 1.829 +UnicodeString::tempSubString(int32_t start, int32_t len) const { 1.830 + pinIndices(start, len); 1.831 + const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer 1.832 + if(array==NULL) { 1.833 + array=fUnion.fStackBuffer; // anything not NULL because that would make an empty string 1.834 + len=-2; // bogus result string 1.835 + } 1.836 + return UnicodeString(FALSE, array + start, len); 1.837 +} 1.838 + 1.839 +int32_t 1.840 +UnicodeString::toUTF8(int32_t start, int32_t len, 1.841 + char *target, int32_t capacity) const { 1.842 + pinIndices(start, len); 1.843 + int32_t length8; 1.844 + UErrorCode errorCode = U_ZERO_ERROR; 1.845 + u_strToUTF8WithSub(target, capacity, &length8, 1.846 + getBuffer() + start, len, 1.847 + 0xFFFD, // Standard substitution character. 1.848 + NULL, // Don't care about number of substitutions. 1.849 + &errorCode); 1.850 + return length8; 1.851 +} 1.852 + 1.853 +#if U_CHARSET_IS_UTF8 1.854 + 1.855 +int32_t 1.856 +UnicodeString::extract(int32_t start, int32_t len, 1.857 + char *target, uint32_t dstSize) const { 1.858 + // if the arguments are illegal, then do nothing 1.859 + if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) { 1.860 + return 0; 1.861 + } 1.862 + return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff); 1.863 +} 1.864 + 1.865 +// else see unistr_cnv.cpp 1.866 +#endif 1.867 + 1.868 +void 1.869 +UnicodeString::extractBetween(int32_t start, 1.870 + int32_t limit, 1.871 + UnicodeString& target) const { 1.872 + pinIndex(start); 1.873 + pinIndex(limit); 1.874 + doExtract(start, limit - start, target); 1.875 +} 1.876 + 1.877 +// When converting from UTF-16 to UTF-8, the result will have at most 3 times 1.878 +// as many bytes as the source has UChars. 1.879 +// The "worst cases" are writing systems like Indic, Thai and CJK with 1.880 +// 3:1 bytes:UChars. 1.881 +void 1.882 +UnicodeString::toUTF8(ByteSink &sink) const { 1.883 + int32_t length16 = length(); 1.884 + if(length16 != 0) { 1.885 + char stackBuffer[1024]; 1.886 + int32_t capacity = (int32_t)sizeof(stackBuffer); 1.887 + UBool utf8IsOwned = FALSE; 1.888 + char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity, 1.889 + 3*length16, 1.890 + stackBuffer, capacity, 1.891 + &capacity); 1.892 + int32_t length8 = 0; 1.893 + UErrorCode errorCode = U_ZERO_ERROR; 1.894 + u_strToUTF8WithSub(utf8, capacity, &length8, 1.895 + getBuffer(), length16, 1.896 + 0xFFFD, // Standard substitution character. 1.897 + NULL, // Don't care about number of substitutions. 1.898 + &errorCode); 1.899 + if(errorCode == U_BUFFER_OVERFLOW_ERROR) { 1.900 + utf8 = (char *)uprv_malloc(length8); 1.901 + if(utf8 != NULL) { 1.902 + utf8IsOwned = TRUE; 1.903 + errorCode = U_ZERO_ERROR; 1.904 + u_strToUTF8WithSub(utf8, length8, &length8, 1.905 + getBuffer(), length16, 1.906 + 0xFFFD, // Standard substitution character. 1.907 + NULL, // Don't care about number of substitutions. 1.908 + &errorCode); 1.909 + } else { 1.910 + errorCode = U_MEMORY_ALLOCATION_ERROR; 1.911 + } 1.912 + } 1.913 + if(U_SUCCESS(errorCode)) { 1.914 + sink.Append(utf8, length8); 1.915 + sink.Flush(); 1.916 + } 1.917 + if(utf8IsOwned) { 1.918 + uprv_free(utf8); 1.919 + } 1.920 + } 1.921 +} 1.922 + 1.923 +int32_t 1.924 +UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const { 1.925 + int32_t length32=0; 1.926 + if(U_SUCCESS(errorCode)) { 1.927 + // getBuffer() and u_strToUTF32WithSub() check for illegal arguments. 1.928 + u_strToUTF32WithSub(utf32, capacity, &length32, 1.929 + getBuffer(), length(), 1.930 + 0xfffd, // Substitution character. 1.931 + NULL, // Don't care about number of substitutions. 1.932 + &errorCode); 1.933 + } 1.934 + return length32; 1.935 +} 1.936 + 1.937 +int32_t 1.938 +UnicodeString::indexOf(const UChar *srcChars, 1.939 + int32_t srcStart, 1.940 + int32_t srcLength, 1.941 + int32_t start, 1.942 + int32_t length) const 1.943 +{ 1.944 + if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1.945 + return -1; 1.946 + } 1.947 + 1.948 + // UnicodeString does not find empty substrings 1.949 + if(srcLength < 0 && srcChars[srcStart] == 0) { 1.950 + return -1; 1.951 + } 1.952 + 1.953 + // get the indices within bounds 1.954 + pinIndices(start, length); 1.955 + 1.956 + // find the first occurrence of the substring 1.957 + const UChar *array = getArrayStart(); 1.958 + const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength); 1.959 + if(match == NULL) { 1.960 + return -1; 1.961 + } else { 1.962 + return (int32_t)(match - array); 1.963 + } 1.964 +} 1.965 + 1.966 +int32_t 1.967 +UnicodeString::doIndexOf(UChar c, 1.968 + int32_t start, 1.969 + int32_t length) const 1.970 +{ 1.971 + // pin indices 1.972 + pinIndices(start, length); 1.973 + 1.974 + // find the first occurrence of c 1.975 + const UChar *array = getArrayStart(); 1.976 + const UChar *match = u_memchr(array + start, c, length); 1.977 + if(match == NULL) { 1.978 + return -1; 1.979 + } else { 1.980 + return (int32_t)(match - array); 1.981 + } 1.982 +} 1.983 + 1.984 +int32_t 1.985 +UnicodeString::doIndexOf(UChar32 c, 1.986 + int32_t start, 1.987 + int32_t length) const { 1.988 + // pin indices 1.989 + pinIndices(start, length); 1.990 + 1.991 + // find the first occurrence of c 1.992 + const UChar *array = getArrayStart(); 1.993 + const UChar *match = u_memchr32(array + start, c, length); 1.994 + if(match == NULL) { 1.995 + return -1; 1.996 + } else { 1.997 + return (int32_t)(match - array); 1.998 + } 1.999 +} 1.1000 + 1.1001 +int32_t 1.1002 +UnicodeString::lastIndexOf(const UChar *srcChars, 1.1003 + int32_t srcStart, 1.1004 + int32_t srcLength, 1.1005 + int32_t start, 1.1006 + int32_t length) const 1.1007 +{ 1.1008 + if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) { 1.1009 + return -1; 1.1010 + } 1.1011 + 1.1012 + // UnicodeString does not find empty substrings 1.1013 + if(srcLength < 0 && srcChars[srcStart] == 0) { 1.1014 + return -1; 1.1015 + } 1.1016 + 1.1017 + // get the indices within bounds 1.1018 + pinIndices(start, length); 1.1019 + 1.1020 + // find the last occurrence of the substring 1.1021 + const UChar *array = getArrayStart(); 1.1022 + const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength); 1.1023 + if(match == NULL) { 1.1024 + return -1; 1.1025 + } else { 1.1026 + return (int32_t)(match - array); 1.1027 + } 1.1028 +} 1.1029 + 1.1030 +int32_t 1.1031 +UnicodeString::doLastIndexOf(UChar c, 1.1032 + int32_t start, 1.1033 + int32_t length) const 1.1034 +{ 1.1035 + if(isBogus()) { 1.1036 + return -1; 1.1037 + } 1.1038 + 1.1039 + // pin indices 1.1040 + pinIndices(start, length); 1.1041 + 1.1042 + // find the last occurrence of c 1.1043 + const UChar *array = getArrayStart(); 1.1044 + const UChar *match = u_memrchr(array + start, c, length); 1.1045 + if(match == NULL) { 1.1046 + return -1; 1.1047 + } else { 1.1048 + return (int32_t)(match - array); 1.1049 + } 1.1050 +} 1.1051 + 1.1052 +int32_t 1.1053 +UnicodeString::doLastIndexOf(UChar32 c, 1.1054 + int32_t start, 1.1055 + int32_t length) const { 1.1056 + // pin indices 1.1057 + pinIndices(start, length); 1.1058 + 1.1059 + // find the last occurrence of c 1.1060 + const UChar *array = getArrayStart(); 1.1061 + const UChar *match = u_memrchr32(array + start, c, length); 1.1062 + if(match == NULL) { 1.1063 + return -1; 1.1064 + } else { 1.1065 + return (int32_t)(match - array); 1.1066 + } 1.1067 +} 1.1068 + 1.1069 +//======================================== 1.1070 +// Write implementation 1.1071 +//======================================== 1.1072 + 1.1073 +UnicodeString& 1.1074 +UnicodeString::findAndReplace(int32_t start, 1.1075 + int32_t length, 1.1076 + const UnicodeString& oldText, 1.1077 + int32_t oldStart, 1.1078 + int32_t oldLength, 1.1079 + const UnicodeString& newText, 1.1080 + int32_t newStart, 1.1081 + int32_t newLength) 1.1082 +{ 1.1083 + if(isBogus() || oldText.isBogus() || newText.isBogus()) { 1.1084 + return *this; 1.1085 + } 1.1086 + 1.1087 + pinIndices(start, length); 1.1088 + oldText.pinIndices(oldStart, oldLength); 1.1089 + newText.pinIndices(newStart, newLength); 1.1090 + 1.1091 + if(oldLength == 0) { 1.1092 + return *this; 1.1093 + } 1.1094 + 1.1095 + while(length > 0 && length >= oldLength) { 1.1096 + int32_t pos = indexOf(oldText, oldStart, oldLength, start, length); 1.1097 + if(pos < 0) { 1.1098 + // no more oldText's here: done 1.1099 + break; 1.1100 + } else { 1.1101 + // we found oldText, replace it by newText and go beyond it 1.1102 + replace(pos, oldLength, newText, newStart, newLength); 1.1103 + length -= pos + oldLength - start; 1.1104 + start = pos + newLength; 1.1105 + } 1.1106 + } 1.1107 + 1.1108 + return *this; 1.1109 +} 1.1110 + 1.1111 + 1.1112 +void 1.1113 +UnicodeString::setToBogus() 1.1114 +{ 1.1115 + releaseArray(); 1.1116 + 1.1117 + fShortLength = 0; 1.1118 + fUnion.fFields.fArray = 0; 1.1119 + fUnion.fFields.fCapacity = 0; 1.1120 + fFlags = kIsBogus; 1.1121 +} 1.1122 + 1.1123 +// turn a bogus string into an empty one 1.1124 +void 1.1125 +UnicodeString::unBogus() { 1.1126 + if(fFlags & kIsBogus) { 1.1127 + setToEmpty(); 1.1128 + } 1.1129 +} 1.1130 + 1.1131 +const UChar * 1.1132 +UnicodeString::getTerminatedBuffer() { 1.1133 + if(!isWritable()) { 1.1134 + return 0; 1.1135 + } 1.1136 + UChar *array = getArrayStart(); 1.1137 + int32_t len = length(); 1.1138 + if(len < getCapacity()) { 1.1139 + if(fFlags & kBufferIsReadonly) { 1.1140 + // If len<capacity on a read-only alias, then array[len] is 1.1141 + // either the original NUL (if constructed with (TRUE, s, length)) 1.1142 + // or one of the original string contents characters (if later truncated), 1.1143 + // therefore we can assume that array[len] is initialized memory. 1.1144 + if(array[len] == 0) { 1.1145 + return array; 1.1146 + } 1.1147 + } else if(((fFlags & kRefCounted) == 0 || refCount() == 1)) { 1.1148 + // kRefCounted: Do not write the NUL if the buffer is shared. 1.1149 + // That is mostly safe, except when the length of one copy was modified 1.1150 + // without copy-on-write, e.g., via truncate(newLength) or remove(void). 1.1151 + // Then the NUL would be written into the middle of another copy's string. 1.1152 + 1.1153 + // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL. 1.1154 + // Do not test if there is a NUL already because it might be uninitialized memory. 1.1155 + // (That would be safe, but tools like valgrind & Purify would complain.) 1.1156 + array[len] = 0; 1.1157 + return array; 1.1158 + } 1.1159 + } 1.1160 + if(cloneArrayIfNeeded(len+1)) { 1.1161 + array = getArrayStart(); 1.1162 + array[len] = 0; 1.1163 + return array; 1.1164 + } else { 1.1165 + return NULL; 1.1166 + } 1.1167 +} 1.1168 + 1.1169 +// setTo() analogous to the readonly-aliasing constructor with the same signature 1.1170 +UnicodeString & 1.1171 +UnicodeString::setTo(UBool isTerminated, 1.1172 + const UChar *text, 1.1173 + int32_t textLength) 1.1174 +{ 1.1175 + if(fFlags & kOpenGetBuffer) { 1.1176 + // do not modify a string that has an "open" getBuffer(minCapacity) 1.1177 + return *this; 1.1178 + } 1.1179 + 1.1180 + if(text == NULL) { 1.1181 + // treat as an empty string, do not alias 1.1182 + releaseArray(); 1.1183 + setToEmpty(); 1.1184 + return *this; 1.1185 + } 1.1186 + 1.1187 + if( textLength < -1 || 1.1188 + (textLength == -1 && !isTerminated) || 1.1189 + (textLength >= 0 && isTerminated && text[textLength] != 0) 1.1190 + ) { 1.1191 + setToBogus(); 1.1192 + return *this; 1.1193 + } 1.1194 + 1.1195 + releaseArray(); 1.1196 + 1.1197 + if(textLength == -1) { 1.1198 + // text is terminated, or else it would have failed the above test 1.1199 + textLength = u_strlen(text); 1.1200 + } 1.1201 + setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength); 1.1202 + 1.1203 + fFlags = kReadonlyAlias; 1.1204 + return *this; 1.1205 +} 1.1206 + 1.1207 +// setTo() analogous to the writable-aliasing constructor with the same signature 1.1208 +UnicodeString & 1.1209 +UnicodeString::setTo(UChar *buffer, 1.1210 + int32_t buffLength, 1.1211 + int32_t buffCapacity) { 1.1212 + if(fFlags & kOpenGetBuffer) { 1.1213 + // do not modify a string that has an "open" getBuffer(minCapacity) 1.1214 + return *this; 1.1215 + } 1.1216 + 1.1217 + if(buffer == NULL) { 1.1218 + // treat as an empty string, do not alias 1.1219 + releaseArray(); 1.1220 + setToEmpty(); 1.1221 + return *this; 1.1222 + } 1.1223 + 1.1224 + if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) { 1.1225 + setToBogus(); 1.1226 + return *this; 1.1227 + } else if(buffLength == -1) { 1.1228 + // buffLength = u_strlen(buff); but do not look beyond buffCapacity 1.1229 + const UChar *p = buffer, *limit = buffer + buffCapacity; 1.1230 + while(p != limit && *p != 0) { 1.1231 + ++p; 1.1232 + } 1.1233 + buffLength = (int32_t)(p - buffer); 1.1234 + } 1.1235 + 1.1236 + releaseArray(); 1.1237 + 1.1238 + setArray(buffer, buffLength, buffCapacity); 1.1239 + fFlags = kWritableAlias; 1.1240 + return *this; 1.1241 +} 1.1242 + 1.1243 +UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) { 1.1244 + unBogus(); 1.1245 + int32_t length = utf8.length(); 1.1246 + int32_t capacity; 1.1247 + // The UTF-16 string will be at most as long as the UTF-8 string. 1.1248 + if(length <= US_STACKBUF_SIZE) { 1.1249 + capacity = US_STACKBUF_SIZE; 1.1250 + } else { 1.1251 + capacity = length + 1; // +1 for the terminating NUL. 1.1252 + } 1.1253 + UChar *utf16 = getBuffer(capacity); 1.1254 + int32_t length16; 1.1255 + UErrorCode errorCode = U_ZERO_ERROR; 1.1256 + u_strFromUTF8WithSub(utf16, getCapacity(), &length16, 1.1257 + utf8.data(), length, 1.1258 + 0xfffd, // Substitution character. 1.1259 + NULL, // Don't care about number of substitutions. 1.1260 + &errorCode); 1.1261 + releaseBuffer(length16); 1.1262 + if(U_FAILURE(errorCode)) { 1.1263 + setToBogus(); 1.1264 + } 1.1265 + return *this; 1.1266 +} 1.1267 + 1.1268 +UnicodeString& 1.1269 +UnicodeString::setCharAt(int32_t offset, 1.1270 + UChar c) 1.1271 +{ 1.1272 + int32_t len = length(); 1.1273 + if(cloneArrayIfNeeded() && len > 0) { 1.1274 + if(offset < 0) { 1.1275 + offset = 0; 1.1276 + } else if(offset >= len) { 1.1277 + offset = len - 1; 1.1278 + } 1.1279 + 1.1280 + getArrayStart()[offset] = c; 1.1281 + } 1.1282 + return *this; 1.1283 +} 1.1284 + 1.1285 +UnicodeString& 1.1286 +UnicodeString::replace(int32_t start, 1.1287 + int32_t _length, 1.1288 + UChar32 srcChar) { 1.1289 + UChar buffer[U16_MAX_LENGTH]; 1.1290 + int32_t count = 0; 1.1291 + UBool isError = FALSE; 1.1292 + U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError); 1.1293 + // We test isError so that the compiler does not complain that we don't. 1.1294 + // If isError (srcChar is not a valid code point) then count==0 which means 1.1295 + // we remove the source segment rather than replacing it with srcChar. 1.1296 + return doReplace(start, _length, buffer, 0, isError ? 0 : count); 1.1297 +} 1.1298 + 1.1299 +UnicodeString& 1.1300 +UnicodeString::append(UChar32 srcChar) { 1.1301 + UChar buffer[U16_MAX_LENGTH]; 1.1302 + int32_t _length = 0; 1.1303 + UBool isError = FALSE; 1.1304 + U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError); 1.1305 + // We test isError so that the compiler does not complain that we don't. 1.1306 + // If isError then _length==0 which turns the doReplace() into a no-op anyway. 1.1307 + return isError ? *this : doReplace(length(), 0, buffer, 0, _length); 1.1308 +} 1.1309 + 1.1310 +UnicodeString& 1.1311 +UnicodeString::doReplace( int32_t start, 1.1312 + int32_t length, 1.1313 + const UnicodeString& src, 1.1314 + int32_t srcStart, 1.1315 + int32_t srcLength) 1.1316 +{ 1.1317 + if(!src.isBogus()) { 1.1318 + // pin the indices to legal values 1.1319 + src.pinIndices(srcStart, srcLength); 1.1320 + 1.1321 + // get the characters from src 1.1322 + // and replace the range in ourselves with them 1.1323 + return doReplace(start, length, src.getArrayStart(), srcStart, srcLength); 1.1324 + } else { 1.1325 + // remove the range 1.1326 + return doReplace(start, length, 0, 0, 0); 1.1327 + } 1.1328 +} 1.1329 + 1.1330 +UnicodeString& 1.1331 +UnicodeString::doReplace(int32_t start, 1.1332 + int32_t length, 1.1333 + const UChar *srcChars, 1.1334 + int32_t srcStart, 1.1335 + int32_t srcLength) 1.1336 +{ 1.1337 + if(!isWritable()) { 1.1338 + return *this; 1.1339 + } 1.1340 + 1.1341 + int32_t oldLength = this->length(); 1.1342 + 1.1343 + // optimize (read-only alias).remove(0, start) and .remove(start, end) 1.1344 + if((fFlags&kBufferIsReadonly) && srcLength == 0) { 1.1345 + if(start == 0) { 1.1346 + // remove prefix by adjusting the array pointer 1.1347 + pinIndex(length); 1.1348 + fUnion.fFields.fArray += length; 1.1349 + fUnion.fFields.fCapacity -= length; 1.1350 + setLength(oldLength - length); 1.1351 + return *this; 1.1352 + } else { 1.1353 + pinIndex(start); 1.1354 + if(length >= (oldLength - start)) { 1.1355 + // remove suffix by reducing the length (like truncate()) 1.1356 + setLength(start); 1.1357 + fUnion.fFields.fCapacity = start; // not NUL-terminated any more 1.1358 + return *this; 1.1359 + } 1.1360 + } 1.1361 + } 1.1362 + 1.1363 + if(srcChars == 0) { 1.1364 + srcStart = srcLength = 0; 1.1365 + } else if(srcLength < 0) { 1.1366 + // get the srcLength if necessary 1.1367 + srcLength = u_strlen(srcChars + srcStart); 1.1368 + } 1.1369 + 1.1370 + // calculate the size of the string after the replace 1.1371 + int32_t newLength; 1.1372 + 1.1373 + // optimize append() onto a large-enough, owned string 1.1374 + if(start >= oldLength) { 1.1375 + if(srcLength == 0) { 1.1376 + return *this; 1.1377 + } 1.1378 + newLength = oldLength + srcLength; 1.1379 + if(newLength <= getCapacity() && isBufferWritable()) { 1.1380 + UChar *oldArray = getArrayStart(); 1.1381 + // Do not copy characters when 1.1382 + // UChar *buffer=str.getAppendBuffer(...); 1.1383 + // is followed by 1.1384 + // str.append(buffer, length); 1.1385 + // or 1.1386 + // str.appendString(buffer, length) 1.1387 + // or similar. 1.1388 + if(srcChars + srcStart != oldArray + start || start > oldLength) { 1.1389 + us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength); 1.1390 + } 1.1391 + setLength(newLength); 1.1392 + return *this; 1.1393 + } else { 1.1394 + // pin the indices to legal values 1.1395 + start = oldLength; 1.1396 + length = 0; 1.1397 + } 1.1398 + } else { 1.1399 + // pin the indices to legal values 1.1400 + pinIndices(start, length); 1.1401 + 1.1402 + newLength = oldLength - length + srcLength; 1.1403 + } 1.1404 + 1.1405 + // the following may change fArray but will not copy the current contents; 1.1406 + // therefore we need to keep the current fArray 1.1407 + UChar oldStackBuffer[US_STACKBUF_SIZE]; 1.1408 + UChar *oldArray; 1.1409 + if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) { 1.1410 + // copy the stack buffer contents because it will be overwritten with 1.1411 + // fUnion.fFields values 1.1412 + u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength); 1.1413 + oldArray = oldStackBuffer; 1.1414 + } else { 1.1415 + oldArray = getArrayStart(); 1.1416 + } 1.1417 + 1.1418 + // clone our array and allocate a bigger array if needed 1.1419 + int32_t *bufferToDelete = 0; 1.1420 + if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize, 1.1421 + FALSE, &bufferToDelete) 1.1422 + ) { 1.1423 + return *this; 1.1424 + } 1.1425 + 1.1426 + // now do the replace 1.1427 + 1.1428 + UChar *newArray = getArrayStart(); 1.1429 + if(newArray != oldArray) { 1.1430 + // if fArray changed, then we need to copy everything except what will change 1.1431 + us_arrayCopy(oldArray, 0, newArray, 0, start); 1.1432 + us_arrayCopy(oldArray, start + length, 1.1433 + newArray, start + srcLength, 1.1434 + oldLength - (start + length)); 1.1435 + } else if(length != srcLength) { 1.1436 + // fArray did not change; copy only the portion that isn't changing, leaving a hole 1.1437 + us_arrayCopy(oldArray, start + length, 1.1438 + newArray, start + srcLength, 1.1439 + oldLength - (start + length)); 1.1440 + } 1.1441 + 1.1442 + // now fill in the hole with the new string 1.1443 + us_arrayCopy(srcChars, srcStart, newArray, start, srcLength); 1.1444 + 1.1445 + setLength(newLength); 1.1446 + 1.1447 + // delayed delete in case srcChars == fArray when we started, and 1.1448 + // to keep oldArray alive for the above operations 1.1449 + if (bufferToDelete) { 1.1450 + uprv_free(bufferToDelete); 1.1451 + } 1.1452 + 1.1453 + return *this; 1.1454 +} 1.1455 + 1.1456 +/** 1.1457 + * Replaceable API 1.1458 + */ 1.1459 +void 1.1460 +UnicodeString::handleReplaceBetween(int32_t start, 1.1461 + int32_t limit, 1.1462 + const UnicodeString& text) { 1.1463 + replaceBetween(start, limit, text); 1.1464 +} 1.1465 + 1.1466 +/** 1.1467 + * Replaceable API 1.1468 + */ 1.1469 +void 1.1470 +UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) { 1.1471 + if (limit <= start) { 1.1472 + return; // Nothing to do; avoid bogus malloc call 1.1473 + } 1.1474 + UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) ); 1.1475 + // Check to make sure text is not null. 1.1476 + if (text != NULL) { 1.1477 + extractBetween(start, limit, text, 0); 1.1478 + insert(dest, text, 0, limit - start); 1.1479 + uprv_free(text); 1.1480 + } 1.1481 +} 1.1482 + 1.1483 +/** 1.1484 + * Replaceable API 1.1485 + * 1.1486 + * NOTE: This is for the Replaceable class. There is no rep.cpp, 1.1487 + * so we implement this function here. 1.1488 + */ 1.1489 +UBool Replaceable::hasMetaData() const { 1.1490 + return TRUE; 1.1491 +} 1.1492 + 1.1493 +/** 1.1494 + * Replaceable API 1.1495 + */ 1.1496 +UBool UnicodeString::hasMetaData() const { 1.1497 + return FALSE; 1.1498 +} 1.1499 + 1.1500 +UnicodeString& 1.1501 +UnicodeString::doReverse(int32_t start, int32_t length) { 1.1502 + if(length <= 1 || !cloneArrayIfNeeded()) { 1.1503 + return *this; 1.1504 + } 1.1505 + 1.1506 + // pin the indices to legal values 1.1507 + pinIndices(start, length); 1.1508 + if(length <= 1) { // pinIndices() might have shrunk the length 1.1509 + return *this; 1.1510 + } 1.1511 + 1.1512 + UChar *left = getArrayStart() + start; 1.1513 + UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2) 1.1514 + UChar swap; 1.1515 + UBool hasSupplementary = FALSE; 1.1516 + 1.1517 + // Before the loop we know left<right because length>=2. 1.1518 + do { 1.1519 + hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left); 1.1520 + hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right); 1.1521 + *right-- = swap; 1.1522 + } while(left < right); 1.1523 + // Make sure to test the middle code unit of an odd-length string. 1.1524 + // Redundant if the length is even. 1.1525 + hasSupplementary |= (UBool)U16_IS_LEAD(*left); 1.1526 + 1.1527 + /* if there are supplementary code points in the reversed range, then re-swap their surrogates */ 1.1528 + if(hasSupplementary) { 1.1529 + UChar swap2; 1.1530 + 1.1531 + left = getArrayStart() + start; 1.1532 + right = left + length - 1; // -1 so that we can look at *(left+1) if left<right 1.1533 + while(left < right) { 1.1534 + if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) { 1.1535 + *left++ = swap2; 1.1536 + *left++ = swap; 1.1537 + } else { 1.1538 + ++left; 1.1539 + } 1.1540 + } 1.1541 + } 1.1542 + 1.1543 + return *this; 1.1544 +} 1.1545 + 1.1546 +UBool 1.1547 +UnicodeString::padLeading(int32_t targetLength, 1.1548 + UChar padChar) 1.1549 +{ 1.1550 + int32_t oldLength = length(); 1.1551 + if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1.1552 + return FALSE; 1.1553 + } else { 1.1554 + // move contents up by padding width 1.1555 + UChar *array = getArrayStart(); 1.1556 + int32_t start = targetLength - oldLength; 1.1557 + us_arrayCopy(array, 0, array, start, oldLength); 1.1558 + 1.1559 + // fill in padding character 1.1560 + while(--start >= 0) { 1.1561 + array[start] = padChar; 1.1562 + } 1.1563 + setLength(targetLength); 1.1564 + return TRUE; 1.1565 + } 1.1566 +} 1.1567 + 1.1568 +UBool 1.1569 +UnicodeString::padTrailing(int32_t targetLength, 1.1570 + UChar padChar) 1.1571 +{ 1.1572 + int32_t oldLength = length(); 1.1573 + if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) { 1.1574 + return FALSE; 1.1575 + } else { 1.1576 + // fill in padding character 1.1577 + UChar *array = getArrayStart(); 1.1578 + int32_t length = targetLength; 1.1579 + while(--length >= oldLength) { 1.1580 + array[length] = padChar; 1.1581 + } 1.1582 + setLength(targetLength); 1.1583 + return TRUE; 1.1584 + } 1.1585 +} 1.1586 + 1.1587 +//======================================== 1.1588 +// Hashing 1.1589 +//======================================== 1.1590 +int32_t 1.1591 +UnicodeString::doHashCode() const 1.1592 +{ 1.1593 + /* Delegate hash computation to uhash. This makes UnicodeString 1.1594 + * hashing consistent with UChar* hashing. */ 1.1595 + int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length()); 1.1596 + if (hashCode == kInvalidHashCode) { 1.1597 + hashCode = kEmptyHashCode; 1.1598 + } 1.1599 + return hashCode; 1.1600 +} 1.1601 + 1.1602 +//======================================== 1.1603 +// External Buffer 1.1604 +//======================================== 1.1605 + 1.1606 +UChar * 1.1607 +UnicodeString::getBuffer(int32_t minCapacity) { 1.1608 + if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) { 1.1609 + fFlags|=kOpenGetBuffer; 1.1610 + fShortLength=0; 1.1611 + return getArrayStart(); 1.1612 + } else { 1.1613 + return 0; 1.1614 + } 1.1615 +} 1.1616 + 1.1617 +void 1.1618 +UnicodeString::releaseBuffer(int32_t newLength) { 1.1619 + if(fFlags&kOpenGetBuffer && newLength>=-1) { 1.1620 + // set the new fLength 1.1621 + int32_t capacity=getCapacity(); 1.1622 + if(newLength==-1) { 1.1623 + // the new length is the string length, capped by fCapacity 1.1624 + const UChar *array=getArrayStart(), *p=array, *limit=array+capacity; 1.1625 + while(p<limit && *p!=0) { 1.1626 + ++p; 1.1627 + } 1.1628 + newLength=(int32_t)(p-array); 1.1629 + } else if(newLength>capacity) { 1.1630 + newLength=capacity; 1.1631 + } 1.1632 + setLength(newLength); 1.1633 + fFlags&=~kOpenGetBuffer; 1.1634 + } 1.1635 +} 1.1636 + 1.1637 +//======================================== 1.1638 +// Miscellaneous 1.1639 +//======================================== 1.1640 +UBool 1.1641 +UnicodeString::cloneArrayIfNeeded(int32_t newCapacity, 1.1642 + int32_t growCapacity, 1.1643 + UBool doCopyArray, 1.1644 + int32_t **pBufferToDelete, 1.1645 + UBool forceClone) { 1.1646 + // default parameters need to be static, therefore 1.1647 + // the defaults are -1 to have convenience defaults 1.1648 + if(newCapacity == -1) { 1.1649 + newCapacity = getCapacity(); 1.1650 + } 1.1651 + 1.1652 + // while a getBuffer(minCapacity) is "open", 1.1653 + // prevent any modifications of the string by returning FALSE here 1.1654 + // if the string is bogus, then only an assignment or similar can revive it 1.1655 + if(!isWritable()) { 1.1656 + return FALSE; 1.1657 + } 1.1658 + 1.1659 + /* 1.1660 + * We need to make a copy of the array if 1.1661 + * the buffer is read-only, or 1.1662 + * the buffer is refCounted (shared), and refCount>1, or 1.1663 + * the buffer is too small. 1.1664 + * Return FALSE if memory could not be allocated. 1.1665 + */ 1.1666 + if(forceClone || 1.1667 + fFlags & kBufferIsReadonly || 1.1668 + (fFlags & kRefCounted && refCount() > 1) || 1.1669 + newCapacity > getCapacity() 1.1670 + ) { 1.1671 + // check growCapacity for default value and use of the stack buffer 1.1672 + if(growCapacity < 0) { 1.1673 + growCapacity = newCapacity; 1.1674 + } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) { 1.1675 + growCapacity = US_STACKBUF_SIZE; 1.1676 + } 1.1677 + 1.1678 + // save old values 1.1679 + UChar oldStackBuffer[US_STACKBUF_SIZE]; 1.1680 + UChar *oldArray; 1.1681 + uint8_t flags = fFlags; 1.1682 + 1.1683 + if(flags&kUsingStackBuffer) { 1.1684 + U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */ 1.1685 + if(doCopyArray && growCapacity > US_STACKBUF_SIZE) { 1.1686 + // copy the stack buffer contents because it will be overwritten with 1.1687 + // fUnion.fFields values 1.1688 + us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength); 1.1689 + oldArray = oldStackBuffer; 1.1690 + } else { 1.1691 + oldArray = 0; // no need to copy from stack buffer to itself 1.1692 + } 1.1693 + } else { 1.1694 + oldArray = fUnion.fFields.fArray; 1.1695 + U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */ 1.1696 + } 1.1697 + 1.1698 + // allocate a new array 1.1699 + if(allocate(growCapacity) || 1.1700 + (newCapacity < growCapacity && allocate(newCapacity)) 1.1701 + ) { 1.1702 + if(doCopyArray && oldArray != 0) { 1.1703 + // copy the contents 1.1704 + // do not copy more than what fits - it may be smaller than before 1.1705 + int32_t minLength = length(); 1.1706 + newCapacity = getCapacity(); 1.1707 + if(newCapacity < minLength) { 1.1708 + minLength = newCapacity; 1.1709 + setLength(minLength); 1.1710 + } 1.1711 + us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength); 1.1712 + } else { 1.1713 + fShortLength = 0; 1.1714 + } 1.1715 + 1.1716 + // release the old array 1.1717 + if(flags & kRefCounted) { 1.1718 + // the array is refCounted; decrement and release if 0 1.1719 + u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1); 1.1720 + if(umtx_atomic_dec(pRefCount) == 0) { 1.1721 + if(pBufferToDelete == 0) { 1.1722 + // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t 1.1723 + // is defined as volatile. (Volatile has useful non-standard behavior 1.1724 + // with this compiler.) 1.1725 + uprv_free((void *)pRefCount); 1.1726 + } else { 1.1727 + // the caller requested to delete it himself 1.1728 + *pBufferToDelete = (int32_t *)pRefCount; 1.1729 + } 1.1730 + } 1.1731 + } 1.1732 + } else { 1.1733 + // not enough memory for growCapacity and not even for the smaller newCapacity 1.1734 + // reset the old values for setToBogus() to release the array 1.1735 + if(!(flags&kUsingStackBuffer)) { 1.1736 + fUnion.fFields.fArray = oldArray; 1.1737 + } 1.1738 + fFlags = flags; 1.1739 + setToBogus(); 1.1740 + return FALSE; 1.1741 + } 1.1742 + } 1.1743 + return TRUE; 1.1744 +} 1.1745 + 1.1746 +// UnicodeStringAppendable ------------------------------------------------- *** 1.1747 + 1.1748 +UnicodeStringAppendable::~UnicodeStringAppendable() {} 1.1749 + 1.1750 +UBool 1.1751 +UnicodeStringAppendable::appendCodeUnit(UChar c) { 1.1752 + return str.doReplace(str.length(), 0, &c, 0, 1).isWritable(); 1.1753 +} 1.1754 + 1.1755 +UBool 1.1756 +UnicodeStringAppendable::appendCodePoint(UChar32 c) { 1.1757 + UChar buffer[U16_MAX_LENGTH]; 1.1758 + int32_t cLength = 0; 1.1759 + UBool isError = FALSE; 1.1760 + U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError); 1.1761 + return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable(); 1.1762 +} 1.1763 + 1.1764 +UBool 1.1765 +UnicodeStringAppendable::appendString(const UChar *s, int32_t length) { 1.1766 + return str.doReplace(str.length(), 0, s, 0, length).isWritable(); 1.1767 +} 1.1768 + 1.1769 +UBool 1.1770 +UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) { 1.1771 + return str.cloneArrayIfNeeded(str.length() + appendCapacity); 1.1772 +} 1.1773 + 1.1774 +UChar * 1.1775 +UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity, 1.1776 + int32_t desiredCapacityHint, 1.1777 + UChar *scratch, int32_t scratchCapacity, 1.1778 + int32_t *resultCapacity) { 1.1779 + if(minCapacity < 1 || scratchCapacity < minCapacity) { 1.1780 + *resultCapacity = 0; 1.1781 + return NULL; 1.1782 + } 1.1783 + int32_t oldLength = str.length(); 1.1784 + if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) { 1.1785 + *resultCapacity = str.getCapacity() - oldLength; 1.1786 + return str.getArrayStart() + oldLength; 1.1787 + } 1.1788 + *resultCapacity = scratchCapacity; 1.1789 + return scratch; 1.1790 +} 1.1791 + 1.1792 +U_NAMESPACE_END 1.1793 + 1.1794 +U_NAMESPACE_USE 1.1795 + 1.1796 +U_CAPI int32_t U_EXPORT2 1.1797 +uhash_hashUnicodeString(const UElement key) { 1.1798 + const UnicodeString *str = (const UnicodeString*) key.pointer; 1.1799 + return (str == NULL) ? 0 : str->hashCode(); 1.1800 +} 1.1801 + 1.1802 +// Moved here from uhash_us.cpp so that using a UVector of UnicodeString* 1.1803 +// does not depend on hashtable code. 1.1804 +U_CAPI UBool U_EXPORT2 1.1805 +uhash_compareUnicodeString(const UElement key1, const UElement key2) { 1.1806 + const UnicodeString *str1 = (const UnicodeString*) key1.pointer; 1.1807 + const UnicodeString *str2 = (const UnicodeString*) key2.pointer; 1.1808 + if (str1 == str2) { 1.1809 + return TRUE; 1.1810 + } 1.1811 + if (str1 == NULL || str2 == NULL) { 1.1812 + return FALSE; 1.1813 + } 1.1814 + return *str1 == *str2; 1.1815 +} 1.1816 + 1.1817 +#ifdef U_STATIC_IMPLEMENTATION 1.1818 +/* 1.1819 +This should never be called. It is defined here to make sure that the 1.1820 +virtual vector deleting destructor is defined within unistr.cpp. 1.1821 +The vector deleting destructor is already a part of UObject, 1.1822 +but defining it here makes sure that it is included with this object file. 1.1823 +This makes sure that static library dependencies are kept to a minimum. 1.1824 +*/ 1.1825 +static void uprv_UnicodeStringDummy(void) { 1.1826 + delete [] (new UnicodeString[2]); 1.1827 +} 1.1828 +#endif