1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/utext.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,2837 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2005-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: utext.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2005apr12 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#include "unicode/ustring.h" 1.22 +#include "unicode/unistr.h" 1.23 +#include "unicode/chariter.h" 1.24 +#include "unicode/utext.h" 1.25 +#include "unicode/utf.h" 1.26 +#include "unicode/utf8.h" 1.27 +#include "unicode/utf16.h" 1.28 +#include "ustr_imp.h" 1.29 +#include "cmemory.h" 1.30 +#include "cstring.h" 1.31 +#include "uassert.h" 1.32 +#include "putilimp.h" 1.33 + 1.34 +U_NAMESPACE_USE 1.35 + 1.36 +#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex)) 1.37 + 1.38 + 1.39 +static UBool 1.40 +utext_access(UText *ut, int64_t index, UBool forward) { 1.41 + return ut->pFuncs->access(ut, index, forward); 1.42 +} 1.43 + 1.44 + 1.45 + 1.46 +U_CAPI UBool U_EXPORT2 1.47 +utext_moveIndex32(UText *ut, int32_t delta) { 1.48 + UChar32 c; 1.49 + if (delta > 0) { 1.50 + do { 1.51 + if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) { 1.52 + return FALSE; 1.53 + } 1.54 + c = ut->chunkContents[ut->chunkOffset]; 1.55 + if (U16_IS_SURROGATE(c)) { 1.56 + c = utext_next32(ut); 1.57 + if (c == U_SENTINEL) { 1.58 + return FALSE; 1.59 + } 1.60 + } else { 1.61 + ut->chunkOffset++; 1.62 + } 1.63 + } while(--delta>0); 1.64 + 1.65 + } else if (delta<0) { 1.66 + do { 1.67 + if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) { 1.68 + return FALSE; 1.69 + 
} 1.70 + c = ut->chunkContents[ut->chunkOffset-1]; 1.71 + if (U16_IS_SURROGATE(c)) { 1.72 + c = utext_previous32(ut); 1.73 + if (c == U_SENTINEL) { 1.74 + return FALSE; 1.75 + } 1.76 + } else { 1.77 + ut->chunkOffset--; 1.78 + } 1.79 + } while(++delta<0); 1.80 + } 1.81 + 1.82 + return TRUE; 1.83 +} 1.84 + 1.85 + 1.86 +U_CAPI int64_t U_EXPORT2 1.87 +utext_nativeLength(UText *ut) { 1.88 + return ut->pFuncs->nativeLength(ut); 1.89 +} 1.90 + 1.91 + 1.92 +U_CAPI UBool U_EXPORT2 1.93 +utext_isLengthExpensive(const UText *ut) { 1.94 + UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0; 1.95 + return r; 1.96 +} 1.97 + 1.98 + 1.99 +U_CAPI int64_t U_EXPORT2 1.100 +utext_getNativeIndex(const UText *ut) { 1.101 + if(ut->chunkOffset <= ut->nativeIndexingLimit) { 1.102 + return ut->chunkNativeStart+ut->chunkOffset; 1.103 + } else { 1.104 + return ut->pFuncs->mapOffsetToNative(ut); 1.105 + } 1.106 +} 1.107 + 1.108 + 1.109 +U_CAPI void U_EXPORT2 1.110 +utext_setNativeIndex(UText *ut, int64_t index) { 1.111 + if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { 1.112 + // The desired position is outside of the current chunk. 1.113 + // Access the new position. Assume a forward iteration from here, 1.114 + // which will also be optimimum for a single random access. 1.115 + // Reverse iterations may suffer slightly. 1.116 + ut->pFuncs->access(ut, index, TRUE); 1.117 + } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) { 1.118 + // utf-16 indexing. 1.119 + ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart); 1.120 + } else { 1.121 + ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); 1.122 + } 1.123 + // The convention is that the index must always be on a code point boundary. 1.124 + // Adjust the index position if it is in the middle of a surrogate pair. 
1.125 + if (ut->chunkOffset<ut->chunkLength) { 1.126 + UChar c= ut->chunkContents[ut->chunkOffset]; 1.127 + if (U16_IS_TRAIL(c)) { 1.128 + if (ut->chunkOffset==0) { 1.129 + ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE); 1.130 + } 1.131 + if (ut->chunkOffset>0) { 1.132 + UChar lead = ut->chunkContents[ut->chunkOffset-1]; 1.133 + if (U16_IS_LEAD(lead)) { 1.134 + ut->chunkOffset--; 1.135 + } 1.136 + } 1.137 + } 1.138 + } 1.139 +} 1.140 + 1.141 + 1.142 + 1.143 +U_CAPI int64_t U_EXPORT2 1.144 +utext_getPreviousNativeIndex(UText *ut) { 1.145 + // 1.146 + // Fast-path the common case. 1.147 + // Common means current position is not at the beginning of a chunk 1.148 + // and the preceding character is not supplementary. 1.149 + // 1.150 + int32_t i = ut->chunkOffset - 1; 1.151 + int64_t result; 1.152 + if (i >= 0) { 1.153 + UChar c = ut->chunkContents[i]; 1.154 + if (U16_IS_TRAIL(c) == FALSE) { 1.155 + if (i <= ut->nativeIndexingLimit) { 1.156 + result = ut->chunkNativeStart + i; 1.157 + } else { 1.158 + ut->chunkOffset = i; 1.159 + result = ut->pFuncs->mapOffsetToNative(ut); 1.160 + ut->chunkOffset++; 1.161 + } 1.162 + return result; 1.163 + } 1.164 + } 1.165 + 1.166 + // If at the start of text, simply return 0. 1.167 + if (ut->chunkOffset==0 && ut->chunkNativeStart==0) { 1.168 + return 0; 1.169 + } 1.170 + 1.171 + // Harder, less common cases. We are at a chunk boundary, or on a surrogate. 1.172 + // Keep it simple, use other functions to handle the edges. 1.173 + // 1.174 + utext_previous32(ut); 1.175 + result = UTEXT_GETNATIVEINDEX(ut); 1.176 + utext_next32(ut); 1.177 + return result; 1.178 +} 1.179 + 1.180 + 1.181 +// 1.182 +// utext_current32. Get the UChar32 at the current position. 1.183 +// UText iteration position is always on a code point boundary, 1.184 +// never on the trail half of a surrogate pair. 
1.185 +// 1.186 +U_CAPI UChar32 U_EXPORT2 1.187 +utext_current32(UText *ut) { 1.188 + UChar32 c; 1.189 + if (ut->chunkOffset==ut->chunkLength) { 1.190 + // Current position is just off the end of the chunk. 1.191 + if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 1.192 + // Off the end of the text. 1.193 + return U_SENTINEL; 1.194 + } 1.195 + } 1.196 + 1.197 + c = ut->chunkContents[ut->chunkOffset]; 1.198 + if (U16_IS_LEAD(c) == FALSE) { 1.199 + // Normal, non-supplementary case. 1.200 + return c; 1.201 + } 1.202 + 1.203 + // 1.204 + // Possible supplementary char. 1.205 + // 1.206 + UChar32 trail = 0; 1.207 + UChar32 supplementaryC = c; 1.208 + if ((ut->chunkOffset+1) < ut->chunkLength) { 1.209 + // The trail surrogate is in the same chunk. 1.210 + trail = ut->chunkContents[ut->chunkOffset+1]; 1.211 + } else { 1.212 + // The trail surrogate is in a different chunk. 1.213 + // Because we must maintain the iteration position, we need to switch forward 1.214 + // into the new chunk, get the trail surrogate, then revert the chunk back to the 1.215 + // original one. 1.216 + // An edge case to be careful of: the entire text may end with an unpaired 1.217 + // leading surrogate. The attempt to access the trail will fail, but 1.218 + // the original position before the unpaired lead still needs to be restored. 
1.219 + int64_t nativePosition = ut->chunkNativeLimit; 1.220 + int32_t originalOffset = ut->chunkOffset; 1.221 + if (ut->pFuncs->access(ut, nativePosition, TRUE)) { 1.222 + trail = ut->chunkContents[ut->chunkOffset]; 1.223 + } 1.224 + UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk 1.225 + U_ASSERT(r==TRUE); 1.226 + ut->chunkOffset = originalOffset; 1.227 + if(!r) { 1.228 + return U_SENTINEL; 1.229 + } 1.230 + } 1.231 + 1.232 + if (U16_IS_TRAIL(trail)) { 1.233 + supplementaryC = U16_GET_SUPPLEMENTARY(c, trail); 1.234 + } 1.235 + return supplementaryC; 1.236 + 1.237 +} 1.238 + 1.239 + 1.240 +U_CAPI UChar32 U_EXPORT2 1.241 +utext_char32At(UText *ut, int64_t nativeIndex) { 1.242 + UChar32 c = U_SENTINEL; 1.243 + 1.244 + // Fast path the common case. 1.245 + if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) { 1.246 + ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart); 1.247 + c = ut->chunkContents[ut->chunkOffset]; 1.248 + if (U16_IS_SURROGATE(c) == FALSE) { 1.249 + return c; 1.250 + } 1.251 + } 1.252 + 1.253 + 1.254 + utext_setNativeIndex(ut, nativeIndex); 1.255 + if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) { 1.256 + c = ut->chunkContents[ut->chunkOffset]; 1.257 + if (U16_IS_SURROGATE(c)) { 1.258 + // For surrogates, let current32() deal with the complications 1.259 + // of supplementaries that may span chunk boundaries. 1.260 + c = utext_current32(ut); 1.261 + } 1.262 + } 1.263 + return c; 1.264 +} 1.265 + 1.266 + 1.267 +U_CAPI UChar32 U_EXPORT2 1.268 +utext_next32(UText *ut) { 1.269 + UChar32 c; 1.270 + 1.271 + if (ut->chunkOffset >= ut->chunkLength) { 1.272 + if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 1.273 + return U_SENTINEL; 1.274 + } 1.275 + } 1.276 + 1.277 + c = ut->chunkContents[ut->chunkOffset++]; 1.278 + if (U16_IS_LEAD(c) == FALSE) { 1.279 + // Normal case, not supplementary. 
1.280 + // (A trail surrogate seen here is just returned as is, as a surrogate value. 1.281 + // It cannot be part of a pair.) 1.282 + return c; 1.283 + } 1.284 + 1.285 + if (ut->chunkOffset >= ut->chunkLength) { 1.286 + if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { 1.287 + // c is an unpaired lead surrogate at the end of the text. 1.288 + // return it as it is. 1.289 + return c; 1.290 + } 1.291 + } 1.292 + UChar32 trail = ut->chunkContents[ut->chunkOffset]; 1.293 + if (U16_IS_TRAIL(trail) == FALSE) { 1.294 + // c was an unpaired lead surrogate, not at the end of the text. 1.295 + // return it as it is (unpaired). Iteration position is on the 1.296 + // following character, possibly in the next chunk, where the 1.297 + // trail surrogate would have been if it had existed. 1.298 + return c; 1.299 + } 1.300 + 1.301 + UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail); 1.302 + ut->chunkOffset++; // move iteration position over the trail surrogate. 1.303 + return supplementary; 1.304 + } 1.305 + 1.306 + 1.307 +U_CAPI UChar32 U_EXPORT2 1.308 +utext_previous32(UText *ut) { 1.309 + UChar32 c; 1.310 + 1.311 + if (ut->chunkOffset <= 0) { 1.312 + if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { 1.313 + return U_SENTINEL; 1.314 + } 1.315 + } 1.316 + ut->chunkOffset--; 1.317 + c = ut->chunkContents[ut->chunkOffset]; 1.318 + if (U16_IS_TRAIL(c) == FALSE) { 1.319 + // Normal case, not supplementary. 1.320 + // (A lead surrogate seen here is just returned as is, as a surrogate value. 1.321 + // It cannot be part of a pair.) 1.322 + return c; 1.323 + } 1.324 + 1.325 + if (ut->chunkOffset <= 0) { 1.326 + if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { 1.327 + // c is an unpaired trail surrogate at the start of the text. 1.328 + // return it as it is. 
1.329 + return c; 1.330 + } 1.331 + } 1.332 + 1.333 + UChar32 lead = ut->chunkContents[ut->chunkOffset-1]; 1.334 + if (U16_IS_LEAD(lead) == FALSE) { 1.335 + // c was an unpaired trail surrogate, not at the end of the text. 1.336 + // return it as it is (unpaired). Iteration position is at c 1.337 + return c; 1.338 + } 1.339 + 1.340 + UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c); 1.341 + ut->chunkOffset--; // move iteration position over the lead surrogate. 1.342 + return supplementary; 1.343 +} 1.344 + 1.345 + 1.346 + 1.347 +U_CAPI UChar32 U_EXPORT2 1.348 +utext_next32From(UText *ut, int64_t index) { 1.349 + UChar32 c = U_SENTINEL; 1.350 + 1.351 + if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { 1.352 + // Desired position is outside of the current chunk. 1.353 + if(!ut->pFuncs->access(ut, index, TRUE)) { 1.354 + // no chunk available here 1.355 + return U_SENTINEL; 1.356 + } 1.357 + } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { 1.358 + // Desired position is in chunk, with direct 1:1 native to UTF16 indexing 1.359 + ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 1.360 + } else { 1.361 + // Desired position is in chunk, with non-UTF16 indexing. 1.362 + ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index); 1.363 + } 1.364 + 1.365 + c = ut->chunkContents[ut->chunkOffset++]; 1.366 + if (U16_IS_SURROGATE(c)) { 1.367 + // Surrogates. Many edge cases. Use other functions that already 1.368 + // deal with the problems. 1.369 + utext_setNativeIndex(ut, index); 1.370 + c = utext_next32(ut); 1.371 + } 1.372 + return c; 1.373 +} 1.374 + 1.375 + 1.376 +U_CAPI UChar32 U_EXPORT2 1.377 +utext_previous32From(UText *ut, int64_t index) { 1.378 + // 1.379 + // Return the character preceding the specified index. 1.380 + // Leave the iteration position at the start of the character that was returned. 1.381 + // 1.382 + UChar32 cPrev; // The character preceding cCurr, which is what we will return. 
1.383 + 1.384 + // Address the chunk containg the position preceding the incoming index 1.385 + // A tricky edge case: 1.386 + // We try to test the requested native index against the chunkNativeStart to determine 1.387 + // whether the character preceding the one at the index is in the current chunk. 1.388 + // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the 1.389 + // requested index is on something other than the first position of the first char. 1.390 + // 1.391 + if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) { 1.392 + // Requested native index is outside of the current chunk. 1.393 + if(!ut->pFuncs->access(ut, index, FALSE)) { 1.394 + // no chunk available here 1.395 + return U_SENTINEL; 1.396 + } 1.397 + } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { 1.398 + // Direct UTF-16 indexing. 1.399 + ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 1.400 + } else { 1.401 + ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); 1.402 + if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) { 1.403 + // no chunk available here 1.404 + return U_SENTINEL; 1.405 + } 1.406 + } 1.407 + 1.408 + // 1.409 + // Simple case with no surrogates. 1.410 + // 1.411 + ut->chunkOffset--; 1.412 + cPrev = ut->chunkContents[ut->chunkOffset]; 1.413 + 1.414 + if (U16_IS_SURROGATE(cPrev)) { 1.415 + // Possible supplementary. Many edge cases. 1.416 + // Let other functions do the heavy lifting. 
1.417 + utext_setNativeIndex(ut, index); 1.418 + cPrev = utext_previous32(ut); 1.419 + } 1.420 + return cPrev; 1.421 +} 1.422 + 1.423 + 1.424 +U_CAPI int32_t U_EXPORT2 1.425 +utext_extract(UText *ut, 1.426 + int64_t start, int64_t limit, 1.427 + UChar *dest, int32_t destCapacity, 1.428 + UErrorCode *status) { 1.429 + return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status); 1.430 + } 1.431 + 1.432 + 1.433 + 1.434 +U_CAPI UBool U_EXPORT2 1.435 +utext_equals(const UText *a, const UText *b) { 1.436 + if (a==NULL || b==NULL || 1.437 + a->magic != UTEXT_MAGIC || 1.438 + b->magic != UTEXT_MAGIC) { 1.439 + // Null or invalid arguments don't compare equal to anything. 1.440 + return FALSE; 1.441 + } 1.442 + 1.443 + if (a->pFuncs != b->pFuncs) { 1.444 + // Different types of text providers. 1.445 + return FALSE; 1.446 + } 1.447 + 1.448 + if (a->context != b->context) { 1.449 + // Different sources (different strings) 1.450 + return FALSE; 1.451 + } 1.452 + if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) { 1.453 + // Different current position in the string. 1.454 + return FALSE; 1.455 + } 1.456 + 1.457 + return TRUE; 1.458 +} 1.459 + 1.460 +U_CAPI UBool U_EXPORT2 1.461 +utext_isWritable(const UText *ut) 1.462 +{ 1.463 + UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0; 1.464 + return b; 1.465 +} 1.466 + 1.467 + 1.468 +U_CAPI void U_EXPORT2 1.469 +utext_freeze(UText *ut) { 1.470 + // Zero out the WRITABLE flag. 
1.471 + ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE)); 1.472 +} 1.473 + 1.474 + 1.475 +U_CAPI UBool U_EXPORT2 1.476 +utext_hasMetaData(const UText *ut) 1.477 +{ 1.478 + UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0; 1.479 + return b; 1.480 +} 1.481 + 1.482 + 1.483 + 1.484 +U_CAPI int32_t U_EXPORT2 1.485 +utext_replace(UText *ut, 1.486 + int64_t nativeStart, int64_t nativeLimit, 1.487 + const UChar *replacementText, int32_t replacementLength, 1.488 + UErrorCode *status) 1.489 +{ 1.490 + if (U_FAILURE(*status)) { 1.491 + return 0; 1.492 + } 1.493 + if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { 1.494 + *status = U_NO_WRITE_PERMISSION; 1.495 + return 0; 1.496 + } 1.497 + int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status); 1.498 + return i; 1.499 +} 1.500 + 1.501 +U_CAPI void U_EXPORT2 1.502 +utext_copy(UText *ut, 1.503 + int64_t nativeStart, int64_t nativeLimit, 1.504 + int64_t destIndex, 1.505 + UBool move, 1.506 + UErrorCode *status) 1.507 +{ 1.508 + if (U_FAILURE(*status)) { 1.509 + return; 1.510 + } 1.511 + if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { 1.512 + *status = U_NO_WRITE_PERMISSION; 1.513 + return; 1.514 + } 1.515 + ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status); 1.516 +} 1.517 + 1.518 + 1.519 + 1.520 +U_CAPI UText * U_EXPORT2 1.521 +utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) { 1.522 + UText *result; 1.523 + result = src->pFuncs->clone(dest, src, deep, status); 1.524 + if (readOnly) { 1.525 + utext_freeze(result); 1.526 + } 1.527 + return result; 1.528 +} 1.529 + 1.530 + 1.531 + 1.532 +//------------------------------------------------------------------------------ 1.533 +// 1.534 +// UText common functions implementation 1.535 +// 1.536 +//------------------------------------------------------------------------------ 1.537 + 
1.538 +// 1.539 +// UText.flags bit definitions 1.540 +// 1.541 +enum { 1.542 + UTEXT_HEAP_ALLOCATED = 1, // 1 if ICU has allocated this UText struct on the heap. 1.543 + // 0 if caller provided storage for the UText. 1.544 + 1.545 + UTEXT_EXTRA_HEAP_ALLOCATED = 2, // 1 if ICU has allocated extra storage as a separate 1.546 + // heap block. 1.547 + // 0 if there is no separate allocation. Either no extra 1.548 + // storage was requested, or it is appended to the end 1.549 + // of the main UText storage. 1.550 + 1.551 + UTEXT_OPEN = 4 // 1 if this UText is currently open 1.552 + // 0 if this UText is not open. 1.553 +}; 1.554 + 1.555 + 1.556 +// 1.557 +// Extended form of a UText. The purpose is to aid in computing the total size required 1.558 +// when a provider asks for a UText to be allocated with extra storage. 1.559 + 1.560 +struct ExtendedUText { 1.561 + UText ut; 1.562 + UAlignedMemory extension; 1.563 +}; 1.564 + 1.565 +static const UText emptyText = UTEXT_INITIALIZER; 1.566 + 1.567 +U_CAPI UText * U_EXPORT2 1.568 +utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { 1.569 + if (U_FAILURE(*status)) { 1.570 + return ut; 1.571 + } 1.572 + 1.573 + if (ut == NULL) { 1.574 + // We need to heap-allocate storage for the new UText 1.575 + int32_t spaceRequired = sizeof(UText); 1.576 + if (extraSpace > 0) { 1.577 + spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory); 1.578 + } 1.579 + ut = (UText *)uprv_malloc(spaceRequired); 1.580 + if (ut == NULL) { 1.581 + *status = U_MEMORY_ALLOCATION_ERROR; 1.582 + return NULL; 1.583 + } else { 1.584 + *ut = emptyText; 1.585 + ut->flags |= UTEXT_HEAP_ALLOCATED; 1.586 + if (spaceRequired>0) { 1.587 + ut->extraSize = extraSpace; 1.588 + ut->pExtra = &((ExtendedUText *)ut)->extension; 1.589 + } 1.590 + } 1.591 + } else { 1.592 + // We have been supplied with an already existing UText. 1.593 + // Verify that it really appears to be a UText. 
1.594 + if (ut->magic != UTEXT_MAGIC) { 1.595 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.596 + return ut; 1.597 + } 1.598 + // If the ut is already open and there's a provider supplied close 1.599 + // function, call it. 1.600 + if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) { 1.601 + ut->pFuncs->close(ut); 1.602 + } 1.603 + ut->flags &= ~UTEXT_OPEN; 1.604 + 1.605 + // If extra space was requested by our caller, check whether 1.606 + // sufficient already exists, and allocate new if needed. 1.607 + if (extraSpace > ut->extraSize) { 1.608 + // Need more space. If there is existing separately allocated space, 1.609 + // delete it first, then allocate new space. 1.610 + if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 1.611 + uprv_free(ut->pExtra); 1.612 + ut->extraSize = 0; 1.613 + } 1.614 + ut->pExtra = uprv_malloc(extraSpace); 1.615 + if (ut->pExtra == NULL) { 1.616 + *status = U_MEMORY_ALLOCATION_ERROR; 1.617 + } else { 1.618 + ut->extraSize = extraSpace; 1.619 + ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED; 1.620 + } 1.621 + } 1.622 + } 1.623 + if (U_SUCCESS(*status)) { 1.624 + ut->flags |= UTEXT_OPEN; 1.625 + 1.626 + // Initialize all remaining fields of the UText. 
1.627 + // 1.628 + ut->context = NULL; 1.629 + ut->chunkContents = NULL; 1.630 + ut->p = NULL; 1.631 + ut->q = NULL; 1.632 + ut->r = NULL; 1.633 + ut->a = 0; 1.634 + ut->b = 0; 1.635 + ut->c = 0; 1.636 + ut->chunkOffset = 0; 1.637 + ut->chunkLength = 0; 1.638 + ut->chunkNativeStart = 0; 1.639 + ut->chunkNativeLimit = 0; 1.640 + ut->nativeIndexingLimit = 0; 1.641 + ut->providerProperties = 0; 1.642 + ut->privA = 0; 1.643 + ut->privB = 0; 1.644 + ut->privC = 0; 1.645 + ut->privP = NULL; 1.646 + if (ut->pExtra!=NULL && ut->extraSize>0) 1.647 + uprv_memset(ut->pExtra, 0, ut->extraSize); 1.648 + 1.649 + } 1.650 + return ut; 1.651 +} 1.652 + 1.653 + 1.654 +U_CAPI UText * U_EXPORT2 1.655 +utext_close(UText *ut) { 1.656 + if (ut==NULL || 1.657 + ut->magic != UTEXT_MAGIC || 1.658 + (ut->flags & UTEXT_OPEN) == 0) 1.659 + { 1.660 + // The supplied ut is not an open UText. 1.661 + // Do nothing. 1.662 + return ut; 1.663 + } 1.664 + 1.665 + // If the provider gave us a close function, call it now. 1.666 + // This will clean up anything allocated specifically by the provider. 1.667 + if (ut->pFuncs->close != NULL) { 1.668 + ut->pFuncs->close(ut); 1.669 + } 1.670 + ut->flags &= ~UTEXT_OPEN; 1.671 + 1.672 + // If we (the framework) allocated the UText or subsidiary storage, 1.673 + // delete it. 1.674 + if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { 1.675 + uprv_free(ut->pExtra); 1.676 + ut->pExtra = NULL; 1.677 + ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED; 1.678 + ut->extraSize = 0; 1.679 + } 1.680 + 1.681 + // Zero out function table of the closed UText. This is a defensive move, 1.682 + // inteded to cause applications that inadvertantly use a closed 1.683 + // utext to crash with null pointer errors. 1.684 + ut->pFuncs = NULL; 1.685 + 1.686 + if (ut->flags & UTEXT_HEAP_ALLOCATED) { 1.687 + // This UText was allocated by UText setup. We need to free it. 
1.688 + // Clear magic, so we can detect if the user messes up and immediately 1.689 + // tries to reopen another UText using the deleted storage. 1.690 + ut->magic = 0; 1.691 + uprv_free(ut); 1.692 + ut = NULL; 1.693 + } 1.694 + return ut; 1.695 +} 1.696 + 1.697 + 1.698 + 1.699 + 1.700 +// 1.701 +// invalidateChunk Reset a chunk to have no contents, so that the next call 1.702 +// to access will cause new data to load. 1.703 +// This is needed when copy/move/replace operate directly on the 1.704 +// backing text, potentially putting it out of sync with the 1.705 +// contents in the chunk. 1.706 +// 1.707 +static void 1.708 +invalidateChunk(UText *ut) { 1.709 + ut->chunkLength = 0; 1.710 + ut->chunkNativeLimit = 0; 1.711 + ut->chunkNativeStart = 0; 1.712 + ut->chunkOffset = 0; 1.713 + ut->nativeIndexingLimit = 0; 1.714 +} 1.715 + 1.716 +// 1.717 +// pinIndex Do range pinning on a native index parameter. 1.718 +// 64 bit pinning is done in place. 1.719 +// 32 bit truncated result is returned as a convenience for 1.720 +// use in providers that don't need 64 bits. 1.721 +static int32_t 1.722 +pinIndex(int64_t &index, int64_t limit) { 1.723 + if (index<0) { 1.724 + index = 0; 1.725 + } else if (index > limit) { 1.726 + index = limit; 1.727 + } 1.728 + return (int32_t)index; 1.729 +} 1.730 + 1.731 + 1.732 +U_CDECL_BEGIN 1.733 + 1.734 +// 1.735 +// Pointer relocation function, 1.736 +// a utility used by shallow clone. 1.737 +// Adjust a pointer that refers to something within one UText (the source) 1.738 +// to refer to the same relative offset within a another UText (the target) 1.739 +// 1.740 +static void adjustPointer(UText *dest, const void **destPtr, const UText *src) { 1.741 + // convert all pointers to (char *) so that byte address arithmetic will work. 
1.742 + char *dptr = (char *)*destPtr; 1.743 + char *dUText = (char *)dest; 1.744 + char *sUText = (char *)src; 1.745 + 1.746 + if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) { 1.747 + // target ptr was to something within the src UText's pExtra storage. 1.748 + // relocate it into the target UText's pExtra region. 1.749 + *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra); 1.750 + } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) { 1.751 + // target ptr was pointing to somewhere within the source UText itself. 1.752 + // Move it to the same offset within the target UText. 1.753 + *destPtr = dUText + (dptr-sUText); 1.754 + } 1.755 +} 1.756 + 1.757 + 1.758 +// 1.759 +// Clone. This is a generic copy-the-utext-by-value clone function that can be 1.760 +// used as-is with some utext types, and as a helper by other clones. 1.761 +// 1.762 +static UText * U_CALLCONV 1.763 +shallowTextClone(UText * dest, const UText * src, UErrorCode * status) { 1.764 + if (U_FAILURE(*status)) { 1.765 + return NULL; 1.766 + } 1.767 + int32_t srcExtraSize = src->extraSize; 1.768 + 1.769 + // 1.770 + // Use the generic text_setup to allocate storage if required. 1.771 + // 1.772 + dest = utext_setup(dest, srcExtraSize, status); 1.773 + if (U_FAILURE(*status)) { 1.774 + return dest; 1.775 + } 1.776 + 1.777 + // 1.778 + // flags (how the UText was allocated) and the pointer to the 1.779 + // extra storage must retain the values in the cloned utext that 1.780 + // were set up by utext_setup. Save them separately before 1.781 + // copying the whole struct. 1.782 + // 1.783 + void *destExtra = dest->pExtra; 1.784 + int32_t flags = dest->flags; 1.785 + 1.786 + 1.787 + // 1.788 + // Copy the whole UText struct by value. 1.789 + // Any "Extra" storage is copied also. 
1.790 + // 1.791 + int sizeToCopy = src->sizeOfStruct; 1.792 + if (sizeToCopy > dest->sizeOfStruct) { 1.793 + sizeToCopy = dest->sizeOfStruct; 1.794 + } 1.795 + uprv_memcpy(dest, src, sizeToCopy); 1.796 + dest->pExtra = destExtra; 1.797 + dest->flags = flags; 1.798 + if (srcExtraSize > 0) { 1.799 + uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize); 1.800 + } 1.801 + 1.802 + // 1.803 + // Relocate any pointers in the target that refer to the UText itself 1.804 + // to point to the cloned copy rather than the original source. 1.805 + // 1.806 + adjustPointer(dest, &dest->context, src); 1.807 + adjustPointer(dest, &dest->p, src); 1.808 + adjustPointer(dest, &dest->q, src); 1.809 + adjustPointer(dest, &dest->r, src); 1.810 + adjustPointer(dest, (const void **)&dest->chunkContents, src); 1.811 + 1.812 + return dest; 1.813 +} 1.814 + 1.815 + 1.816 +U_CDECL_END 1.817 + 1.818 + 1.819 + 1.820 +//------------------------------------------------------------------------------ 1.821 +// 1.822 +// UText implementation for UTF-8 char * strings (read-only) 1.823 +// Limitation: string length must be <= 0x7fffffff in length. 1.824 +// (length must for in an int32_t variable) 1.825 +// 1.826 +// Use of UText data members: 1.827 +// context pointer to UTF-8 string 1.828 +// utext.b is the input string length (bytes). 1.829 +// utext.c Length scanned so far in string 1.830 +// (for optimizing finding length of zero terminated strings.) 1.831 +// utext.p pointer to the current buffer 1.832 +// utext.q pointer to the other buffer. 1.833 +// 1.834 +//------------------------------------------------------------------------------ 1.835 + 1.836 +// Chunk size. 1.837 +// Must be less than 85, because of byte mapping from UChar indexes to native indexes. 1.838 +// Worst case is three native bytes to one UChar. (Supplemenaries are 4 native bytes 1.839 +// to two UChars.) 
//
enum { UTF8_TEXT_CHUNK_SIZE=32 };

//
// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
//          Each contains the UChar chunk buffer, the to and from native maps, and
//          header info.
//
//     because backwards iteration fills the buffers starting at the end and
//     working towards the front, the filled part of the buffers may not begin
//     at the start of the available storage for the buffers.
//
//     Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
//     the last character added being a supplementary, and thus requiring a surrogate
//     pair.  Doing this is simpler than checking for the edge case.
//
//     NOTE(review): the arrays below are actually declared with
//     UTF8_TEXT_CHUNK_SIZE+4 and UTF8_TEXT_CHUNK_SIZE*3+6 elements, i.e. more
//     head room than the "one bigger" described above.
//

struct UTF8Buf {
    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
    int32_t   bufNativeLimit;                        // Native index following last char in buf.
    int32_t   bufStartIdx;                           // First filled position in buf.
    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
    int32_t   bufNILimit;                            // Limit of native indexing part of buf
    int32_t   toUCharsMapStart;                      // Native index corresponding to
                                                     //   mapToUChars[0].
                                                     //   Set to bufNativeStart when filling forwards.
                                                     //   Set to computed value when filling backwards.

    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires one extra position beyond
                                                     //   the chunk size, to allow for surrogate at the end.
                                                     //   Length must be identical to mapToNative array, below,
                                                     //   because of the way indexing works when the array is
                                                     //   filled backwards during a reverse iteration.  Thus,
                                                     //   the additional extra size.
    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map UChar index in buf to
                                                     //  native offset from bufNativeStart.
                                                     //  Requires two extra slots,
                                                     //    one for a supplementary starting in the last normal position,
                                                     //    and one for an entry for the buffer limit position.
    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from bufNativeStart to
                                                     //   corresponding offset in filled part of buf.
    int32_t   align;
};

U_CDECL_BEGIN

//
//   utf8TextLength
//
//        Get the length of the string.  If we don't already know it,
//              we'll need to scan for the trailing  nul.
//
//        ut->b holds the length in bytes and is negative while the length is
//        still unknown; ut->c is the offset at which the scan for the nul starts.
//        Once found, the length is cached in ut->b (clipped to 0x7fffffff) and
//        the LENGTH_IS_EXPENSIVE provider property is cleared.
//
static int64_t U_CALLCONV
utf8TextLength(UText *ut) {
    if (ut->b < 0) {
        // Zero terminated string, and we haven't scanned to the end yet.
        // Scan it now.
        const char *r = (const char *)ut->context + ut->c;
        while (*r != 0) {
            r++;
        }
        if ((r - (const char *)ut->context) < 0x7fffffff) {
            ut->b = (int32_t)(r - (const char *)ut->context);
        } else {
            // Actual string was bigger (more than 2 gig) than we
            //   can handle.  Clip it to 2 GB.
            ut->b = 0x7fffffff;
        }
        ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
    }
    return ut->b;
}






//
//   utf8TextAccess
//
//        Position the UText chunk so that the text at native (UTF-8 byte)
//        index `index` can be accessed in the requested direction.
//
//        index     requested native index; negative values are treated as 0 and
//                  values beyond the string length are pinned to the length.
//        forward   TRUE:  the chunk must hold the text starting at index.
//                  FALSE: the chunk must hold the text preceding index.
//
//        Returns TRUE when the chunk was positioned on accessible text, FALSE
//        when the request is at the end (forward) or the start (backward) of the
//        string.  ut->chunkOffset is set in both cases.
//
//        State used:  ut->p  the current UTF8Buf,  ut->q  the alternate UTF8Buf,
//                     ut->b  string length in bytes, negative while unknown,
//                     ut->c  for nul terminated input, how far the string has
//                            been scanned so far.
//
static UBool U_CALLCONV
utf8TextAccess(UText *ut, int64_t index, UBool forward) {
    //
    //  Apologies to those who are allergic to goto statements.
    //    Consider each goto to a labelled block to be the equivalent of
    //         call the named block as if it were a function();
    //         return;
    //
    const uint8_t *s8=(const uint8_t *)ut->context;
    UTF8Buf *u8b = NULL;
    int32_t  length = ut->b;         // Length of original utf-8
    int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
    int32_t  mapIndex = 0;
    if (index<0) {
        ix=0;
    } else if (index > 0x7fffffff) {
        // Strings with 64 bit lengths not supported by this UTF-8 provider.
        ix = 0x7fffffff;
    }

    // Pin requested index to the string length.
    if (ix>length) {
        if (length>=0) {
            ix=length;
        } else if (ix>=ut->c) {
            // Zero terminated string, and requested index is beyond
            //   the region that has already been scanned.
            //   Scan up to either the end of the string or to the
            //   requested position, whichever comes first.
            while (ut->c<ix && s8[ut->c]!=0) {
                ut->c++;
            }
            //  TODO:  support for null terminated string length > 32 bits.
            if (s8[ut->c] == 0) {
                // We just found the actual length of the string.
                //  Trim the requested index back to that.
                ix     = ut->c;
                ut->b  = ut->c;
                length = ut->c;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
    }

    //
    // Dispatch to the appropriate action for a forward iteration request.
    //
    if (forward) {
        if (ix==ut->chunkNativeLimit) {
            // Check for normal sequential iteration cases first.
            if (ix==length) {
                // Just reached end of string
                // Don't swap buffers, but do set the
                //   current buffer position.
                ut->chunkOffset = ut->chunkLength;
                return FALSE;
            } else {
                // End of current buffer.
                //   check whether other buffer already has what we need.
                UTF8Buf *altB = (UTF8Buf *)ut->q;
                if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
                    goto swapBuffers;
                }
            }
        }

        // A random access.  Desired index could be in either or neither buf.
        // For optimizing the order of testing, first check for the index
        //    being in the other buffer.  This will be the case for uses that
        //    move back and forth over a fairly limited range
        {
            u8b = (UTF8Buf *)ut->q;   // the alternate buffer
            if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
                // Requested index is in the other buffer.
                goto swapBuffers;
            }
            if (ix == length) {
                // Requested index is end-of-string.
                //   (this is the case of randomly seeking to the end.
                //    The case of iterating off the end is handled earlier.)
                if (ix == ut->chunkNativeLimit) {
                    // Current buffer extends up to the end of the string.
                    //   Leave it as the current buffer.
                    ut->chunkOffset = ut->chunkLength;
                    return FALSE;
                }
                if (ix == u8b->bufNativeLimit) {
                    // Alternate buffer extends to the end of string.
                    //   Swap it in as the current buffer.
                    goto swapBuffersAndFail;
                }

                // Neither existing buffer extends to the end of the string.
                goto makeStubBuffer;
            }

            if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
                // Requested index is in neither buffer.
                goto fillForward;
            }

            // Requested index is in this buffer.
            u8b = (UTF8Buf *)ut->p;   // the current buffer
            mapIndex = ix - u8b->toUCharsMapStart;
            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
            return TRUE;

        }
    }


    //
    // Dispatch to the appropriate action for a
    //   Backwards Direction iteration request.
    //
    if (ix==ut->chunkNativeStart) {
        // Check for normal sequential iteration cases first.
        if (ix==0) {
            // Just reached the start of string
            // Don't swap buffers, but do set the
            //   current buffer position.
            ut->chunkOffset = 0;
            return FALSE;
        } else {
            // Start of current buffer.
            //   check whether other buffer already has what we need.
            UTF8Buf *altB = (UTF8Buf *)ut->q;
            if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
                goto swapBuffers;
            }
        }
    }

    // A random access.  Desired index could be in either or neither buf.
    // For optimizing the order of testing,
    //    Most likely case:  in the other buffer.
    //    Second most likely: in neither buffer.
    //    Unlikely, but must work:  in the current buffer.
    u8b = (UTF8Buf *)ut->q;   // the alternate buffer
    if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
        // Requested index is in the other buffer.
        goto swapBuffers;
    }
    // Requested index is start-of-string.
    //   (this is the case of randomly seeking to the start.
    //    The case of iterating off the start is handled earlier.)
    if (ix==0) {
        if (u8b->bufNativeStart==0) {
            // Alternate buffer contains the data for the start string.
            // Make it be the current buffer.
            goto swapBuffersAndFail;
        } else {
            // Request for data before the start of string,
            //   neither buffer is usable.
            //   set up a zero-length buffer.
            goto makeStubBuffer;
        }
    }

    if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
        // Requested index is in neither buffer.
        goto fillReverse;
    }

    // Requested index is in this buffer.
    //   Set the utf16 buffer index.
    u8b = (UTF8Buf *)ut->p;
    mapIndex = ix - u8b->toUCharsMapStart;
    ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
    if (ut->chunkOffset==0) {
        // This occurs when the first character in the text is
        //   a multi-byte UTF-8 char, and the requested index is to
        //   one of the trailing bytes.  Because there is no preceding
        //   character, this access fails.  We can't pick up on the
        //   situation sooner because the requested index is not zero.
        return FALSE;
    } else {
        return TRUE;
    }



swapBuffers:
    //  The alternate buffer (ut->q) has the string data that was requested.
    //  Swap the primary and alternate buffers, and set the
    //   chunk index into the new primary buffer.
    {
        u8b   = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;
        ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // Index into the (now current) chunk
        // Use the map to set the chunk index.  It's more trouble than it's worth
        //    to check whether native indexing can be used.
        U_ASSERT(ix>=u8b->bufNativeStart);
        U_ASSERT(ix<=u8b->bufNativeLimit);
        mapIndex = ix - u8b->toUCharsMapStart;
        U_ASSERT(mapIndex>=0);
        U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
        ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;

        return TRUE;
    }


 swapBuffersAndFail:
    // We got a request for either the start or end of the string,
    //  with iteration continuing in the out-of-bounds direction.
    // The alternate buffer already contains the data up to the
    //  start/end.
    // Swap the buffers, then return failure, indicating that we couldn't
    //  make things correct for continuing the iteration in the requested
    //  direction.  The position & buffer are correct should the
    //  user decide to iterate in the opposite direction.
    u8b   = (UTF8Buf *)ut->q;
    ut->q = ut->p;
    ut->p = u8b;
    ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
    ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
    ut->chunkNativeStart    = u8b->bufNativeStart;
    ut->chunkNativeLimit    = u8b->bufNativeLimit;
    ut->nativeIndexingLimit = u8b->bufNILimit;

    // Index into the (now current) chunk
    //  For this function  (swapBuffersAndFail), the requested index
    //    will always be at either the start or end of the chunk.
    if (ix==u8b->bufNativeLimit) {
        ut->chunkOffset = ut->chunkLength;
    } else  {
        ut->chunkOffset = 0;
        U_ASSERT(ix == u8b->bufNativeStart);
    }
    return FALSE;

makeStubBuffer:
    //   The user has done a seek/access past the start or end
    //   of the string.  Rather than loading data that is likely
    //   to never be used, just set up a zero-length buffer at
    //   the position.
    u8b = (UTF8Buf *)ut->q;
    u8b->bufNativeStart   = ix;
    u8b->bufNativeLimit   = ix;
    u8b->bufStartIdx      = 0;
    u8b->bufLimitIdx      = 0;
    u8b->bufNILimit       = 0;
    u8b->toUCharsMapStart = ix;
    u8b->mapToNative[0]   = 0;
    u8b->mapToUChars[0]   = 0;
    goto swapBuffersAndFail;



fillForward:
    {
        // Move the incoming index to a code point boundary.
        U8_SET_CP_START(s8, 0, ix);

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        int32_t strLen = ut->b;
        UBool   nulTerminated = FALSE;
        if (strLen < 0) {
            strLen = 0x7fffffff;
            nulTerminated = TRUE;
        }

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative  = u8b->mapToNative;
        uint8_t *mapToUChars  = u8b->mapToUChars;
        int32_t  destIx       = 0;
        int32_t  srcIx        = ix;
        UBool    seenNonAscii = FALSE;
        UChar32  c = 0;

        // Fill the chunk buffer and mapping arrays.
        while (destIx<UTF8_TEXT_CHUNK_SIZE) {
            c = s8[srcIx];
            if (c>0 && c<0x80) {
                // Special case ASCII range for speed.
                //   zero is excluded to simplify bounds checking.
                buf[destIx] = (UChar)c;
                mapToNative[destIx]    = (uint8_t)(srcIx - ix);
                mapToUChars[srcIx-ix]  = (uint8_t)destIx;
                srcIx++;
                destIx++;
            } else {
                // General case, handle everything.
                if (seenNonAscii == FALSE) {
                    seenNonAscii = TRUE;
                    u8b->bufNILimit = destIx;
                }

                int32_t  cIx      = srcIx;
                int32_t  dIx      = destIx;
                int32_t  dIxSaved = destIx;
                U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
                if (c==0 && nulTerminated) {
                    // Hit the terminating nul: back up so srcIx is the string length.
                    srcIx--;
                    break;
                }

                U16_APPEND_UNSAFE(buf, destIx, c);
                // Every UChar written for this character maps to its first native byte.
                do {
                    mapToNative[dIx++] = (uint8_t)(cIx - ix);
                } while (dIx < destIx);

                // Every native byte of this character maps to its first UChar.
                do {
                    mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
                } while (cIx < srcIx);
            }
            if (srcIx>=strLen) {
                break;
            }

        }

        //  store Native <--> Chunk Map entries for the end of the buffer.
        //    There is no actual character here, but the index position is valid.
        mapToNative[destIx]     = (uint8_t)(srcIx - ix);
        mapToUChars[srcIx - ix] = (uint8_t)destIx;

        //  fill in Buffer descriptor
        u8b->bufNativeStart     = ix;
        u8b->bufNativeLimit     = srcIx;
        u8b->bufStartIdx        = 0;
        u8b->bufLimitIdx        = destIx;
        if (seenNonAscii == FALSE) {
            u8b->bufNILimit     = destIx;
        }
        u8b->toUCharsMapStart   = u8b->bufNativeStart;

        // Set UText chunk to refer to this buffer.
        ut->chunkContents       = buf;
        ut->chunkOffset         = 0;
        ut->chunkLength         = u8b->bufLimitIdx;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;

        // For zero terminated strings, keep track of the maximum point
        //   scanned so far.
        if (nulTerminated && srcIx>ut->c) {
            ut->c = srcIx;
            if (c==0) {
                // We scanned to the end.
                //   Remember the actual length.
                ut->b = srcIx;
                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
            }
        }
        return TRUE;
    }


fillReverse:
    {
        // Move the incoming index to a code point boundary.
        // Can only do this if the incoming index is somewhere in the interior of the string.
        // If index is at the end, there is no character there to look at.
        if (ix != ut->b) {
            U8_SET_CP_START(s8, 0, ix);
        }

        // Swap the UText buffers.
        //  We want to fill what was previously the alternate buffer,
        //  and make what was the current buffer be the new alternate.
        UTF8Buf *u8b = (UTF8Buf *)ut->q;
        ut->q = ut->p;
        ut->p = u8b;

        UChar   *buf = u8b->buf;
        uint8_t *mapToNative = u8b->mapToNative;
        uint8_t *mapToUChars = u8b->mapToUChars;
        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
                                                    //   at end of buffer to leave room
                                                    //   for a surrogate pair at the
                                                    //   buffer start.
        int32_t  srcIx  = ix;
        int32_t  bufNILimit = destIx;
        UChar32   c;

        // Map to/from Native Indexes, fill in for the position at the end of
        //   the buffer.
        //
        mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
        mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;

        // Fill the chunk buffer
        // Work backwards, filling from the end of the buffer towards the front.
        //
        while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
            srcIx--;
            destIx--;

            // Get last byte of the UTF-8 character
            c = s8[srcIx];
            if (c<0x80) {
                // Special case ASCII range for speed.
                buf[destIx] = (UChar)c;
                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
            } else {
                // General case, handle everything non-ASCII.

                int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char

                // Get the full character from the UTF8 string.
                //   use code derived from the macros in utf8.h
                //   Leaves srcIx pointing at the first byte of the UTF-8 char.
                //
                c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
                // leaves srcIx at first byte of the multi-byte char.

                // Store the character in UTF-16 buffer.
                if (c<0x10000) {
                    buf[destIx] = (UChar)c;
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                } else {
                    buf[destIx]         = U16_TRAIL(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                    buf[--destIx]       = U16_LEAD(c);
                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
                }

                // Fill in the map from native indexes to UChars buf index.
                do {
                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
                } while (sIx >= srcIx);

                // Set native indexing limit to be the current position.
                //   We are processing a non-ascii, non-native-indexing char now;
                //     the limit will be here if the rest of the chars to be
                //     added to this buffer are ascii.
                bufNILimit = destIx;
            }
        }
        u8b->bufNativeStart     = srcIx;
        u8b->bufNativeLimit     = ix;
        u8b->bufStartIdx        = destIx;
        u8b->bufLimitIdx        = UTF8_TEXT_CHUNK_SIZE+2;
        u8b->bufNILimit         = bufNILimit - u8b->bufStartIdx;
        u8b->toUCharsMapStart   = toUCharsMapStart;

        ut->chunkContents       = &buf[u8b->bufStartIdx];
        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
        ut->chunkOffset         = ut->chunkLength;
        ut->chunkNativeStart    = u8b->bufNativeStart;
        ut->chunkNativeLimit    = u8b->bufNativeLimit;
        ut->nativeIndexingLimit = u8b->bufNILimit;
        return TRUE;
    }

}



//
//  This is a slightly modified copy of u_strFromUTF8,
//     Inserts a Replacement Char rather than failing on invalid UTF-8
//     Removes unnecessary features.
//
//  Converts srcLength bytes of src into UTF-16 in dest, writing at most
//  destCapacity UChars.  *pDestLength (when pDestLength is non-NULL) receives
//  the number of UChars needed for the whole input, which can be larger than
//  destCapacity: input that does not fit is only counted, not stored.
//  Returns dest.
//
static UChar*
utext_strFromUTF8(UChar *dest,
              int32_t destCapacity,
              int32_t *pDestLength,
              const char* src,
              int32_t srcLength,        // required.  NUL terminated not supported.
              UErrorCode *pErrorCode
              )
{

    UChar *pDest = dest;
    UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
    UChar32 ch=0;
    int32_t index = 0;
    int32_t reqLength = 0;
    uint8_t* pSrc = (uint8_t*) src;


    while((index < srcLength)&&(pDest<pDestLimit)){
        ch = pSrc[index++];
        if(ch <=0x7f){
            *pDest++=(UChar)ch;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
            if(U_IS_BMP(ch)){
                *(pDest++)=(UChar)ch;
            }else{
                *(pDest++)=U16_LEAD(ch);
                if(pDest<pDestLimit){
                    *(pDest++)=U16_TRAIL(ch);
                }else{
                    // No room for the trail surrogate: count it as required only.
                    reqLength++;
                    break;
                }
            }
        }
    }
    /* do not fill the dest buffer just count the UChars needed */
    while(index < srcLength){
        ch = pSrc[index++];
        if(ch <= 0x7f){
            reqLength++;
        }else{
            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
            reqLength+=U16_LENGTH(ch);
        }
    }

    reqLength+=(int32_t)(pDest - dest);

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);

    return dest;
}



//
//  utf8TextExtract
//
//     Extract the native range [start, limit) as UTF-16 into dest.
//     Returns the length reported by utext_strFromUTF8(), i.e. the number of
//     UChars needed for the whole range, or 0 when an error is detected here.
//     Sets U_ILLEGAL_ARGUMENT_ERROR for a bad dest/destCapacity combination and
//     U_INDEX_OUTOFBOUNDS_ERROR when start is beyond limit after pinning.
//     Finishes by calling utf8TextAccess(ut, limit32, TRUE).
//
//     NOTE(review): ut->b is negative for nul terminated text whose length has
//     not been determined yet (see utext_openUTF8 / utf8TextLength); confirm
//     that pinIndex() behaves sensibly when given that negative length.
//
static int32_t U_CALLCONV
utf8TextExtract(UText *ut,
                int64_t start, int64_t limit,
                UChar *dest, int32_t destCapacity,
                UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    int32_t  length  = ut->b;
    int32_t  start32 = pinIndex(start, length);
    int32_t  limit32 = pinIndex(limit, length);

    if(start32>limit32) {
        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }


    // adjust the incoming indexes to land on code point boundaries if needed.
    //    adjust by no more than three, because that is the largest number of trail bytes
    //    in a well formed UTF8 character.
    const uint8_t *buf = (const uint8_t *)ut->context;
    int i;
    if (start32 < ut->chunkNativeLimit) {
        for (i=0; i<3; i++) {
            if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
                break;
            }
            start32--;
        }
    }

    if (limit32 < ut->chunkNativeLimit) {
        for (i=0; i<3; i++) {
            if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
                break;
            }
            limit32--;
        }
    }

    // Do the actual extract.
    int32_t destLength=0;
    utext_strFromUTF8(dest, destCapacity, &destLength,
                    (const char *)ut->context+start32, limit32-start32,
                    pErrorCode);
    utf8TextAccess(ut, limit32, TRUE);
    return destLength;
}

//
// utf8TextMapOffsetToNative
//
// Map a chunk (UTF-16) offset to a native index.
1.1510 +static int64_t U_CALLCONV 1.1511 +utf8TextMapOffsetToNative(const UText *ut) { 1.1512 + // 1.1513 + UTF8Buf *u8b = (UTF8Buf *)ut->p; 1.1514 + U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength); 1.1515 + int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart; 1.1516 + U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit); 1.1517 + return nativeOffset; 1.1518 +} 1.1519 + 1.1520 +// 1.1521 +// Map a native index to the corrsponding chunk offset 1.1522 +// 1.1523 +static int32_t U_CALLCONV 1.1524 +utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) { 1.1525 + U_ASSERT(index64 <= 0x7fffffff); 1.1526 + int32_t index = (int32_t)index64; 1.1527 + UTF8Buf *u8b = (UTF8Buf *)ut->p; 1.1528 + U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit); 1.1529 + U_ASSERT(index<=ut->chunkNativeLimit); 1.1530 + int32_t mapIndex = index - u8b->toUCharsMapStart; 1.1531 + int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; 1.1532 + U_ASSERT(offset>=0 && offset<=ut->chunkLength); 1.1533 + return offset; 1.1534 +} 1.1535 + 1.1536 +static UText * U_CALLCONV 1.1537 +utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) 1.1538 +{ 1.1539 + // First do a generic shallow clone. Does everything needed for the UText struct itself. 1.1540 + dest = shallowTextClone(dest, src, status); 1.1541 + 1.1542 + // For deep clones, make a copy of the string. 1.1543 + // The copied storage is owned by the newly created clone. 1.1544 + // 1.1545 + // TODO: There is an isssue with using utext_nativeLength(). 1.1546 + // That function is non-const in cases where the input was NUL terminated 1.1547 + // and the length has not yet been determined. 1.1548 + // This function (clone()) is const. 1.1549 + // There potentially a thread safety issue lurking here. 
1.1550 + // 1.1551 + if (deep && U_SUCCESS(*status)) { 1.1552 + int32_t len = (int32_t)utext_nativeLength((UText *)src); 1.1553 + char *copyStr = (char *)uprv_malloc(len+1); 1.1554 + if (copyStr == NULL) { 1.1555 + *status = U_MEMORY_ALLOCATION_ERROR; 1.1556 + } else { 1.1557 + uprv_memcpy(copyStr, src->context, len+1); 1.1558 + dest->context = copyStr; 1.1559 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1.1560 + } 1.1561 + } 1.1562 + return dest; 1.1563 +} 1.1564 + 1.1565 + 1.1566 +static void U_CALLCONV 1.1567 +utf8TextClose(UText *ut) { 1.1568 + // Most of the work of close is done by the generic UText framework close. 1.1569 + // All that needs to be done here is to delete the UTF8 string if the UText 1.1570 + // owns it. This occurs if the UText was created by cloning. 1.1571 + if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1.1572 + char *s = (char *)ut->context; 1.1573 + uprv_free(s); 1.1574 + ut->context = NULL; 1.1575 + } 1.1576 +} 1.1577 + 1.1578 +U_CDECL_END 1.1579 + 1.1580 + 1.1581 +static const struct UTextFuncs utf8Funcs = 1.1582 +{ 1.1583 + sizeof(UTextFuncs), 1.1584 + 0, 0, 0, // Reserved alignment padding 1.1585 + utf8TextClone, 1.1586 + utf8TextLength, 1.1587 + utf8TextAccess, 1.1588 + utf8TextExtract, 1.1589 + NULL, /* replace*/ 1.1590 + NULL, /* copy */ 1.1591 + utf8TextMapOffsetToNative, 1.1592 + utf8TextMapIndexToUTF16, 1.1593 + utf8TextClose, 1.1594 + NULL, // spare 1 1.1595 + NULL, // spare 2 1.1596 + NULL // spare 3 1.1597 +}; 1.1598 + 1.1599 + 1.1600 +static const char gEmptyString[] = {0}; 1.1601 + 1.1602 +U_CAPI UText * U_EXPORT2 1.1603 +utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) { 1.1604 + if(U_FAILURE(*status)) { 1.1605 + return NULL; 1.1606 + } 1.1607 + if(s==NULL && length==0) { 1.1608 + s = gEmptyString; 1.1609 + } 1.1610 + 1.1611 + if(s==NULL || length<-1 || length>INT32_MAX) { 1.1612 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.1613 + return NULL; 1.1614 + } 1.1615 
+ 1.1616 + ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status); 1.1617 + if (U_FAILURE(*status)) { 1.1618 + return ut; 1.1619 + } 1.1620 + 1.1621 + ut->pFuncs = &utf8Funcs; 1.1622 + ut->context = s; 1.1623 + ut->b = (int32_t)length; 1.1624 + ut->c = (int32_t)length; 1.1625 + if (ut->c < 0) { 1.1626 + ut->c = 0; 1.1627 + ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1.1628 + } 1.1629 + ut->p = ut->pExtra; 1.1630 + ut->q = (char *)ut->pExtra + sizeof(UTF8Buf); 1.1631 + return ut; 1.1632 + 1.1633 +} 1.1634 + 1.1635 + 1.1636 + 1.1637 + 1.1638 + 1.1639 + 1.1640 + 1.1641 + 1.1642 +//------------------------------------------------------------------------------ 1.1643 +// 1.1644 +// UText implementation wrapper for Replaceable (read/write) 1.1645 +// 1.1646 +// Use of UText data members: 1.1647 +// context pointer to Replaceable. 1.1648 +// p pointer to Replaceable if it is owned by the UText. 1.1649 +// 1.1650 +//------------------------------------------------------------------------------ 1.1651 + 1.1652 + 1.1653 + 1.1654 +// minimum chunk size for this implementation: 3 1.1655 +// to allow for possible trimming for code point boundaries 1.1656 +enum { REP_TEXT_CHUNK_SIZE=10 }; 1.1657 + 1.1658 +struct ReplExtra { 1.1659 + /* 1.1660 + * Chunk UChars. 1.1661 + * +1 to simplify filling with surrogate pair at the end. 1.1662 + */ 1.1663 + UChar s[REP_TEXT_CHUNK_SIZE+1]; 1.1664 +}; 1.1665 + 1.1666 + 1.1667 +U_CDECL_BEGIN 1.1668 + 1.1669 +static UText * U_CALLCONV 1.1670 +repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 1.1671 + // First do a generic shallow clone. Does everything needed for the UText struct itself. 1.1672 + dest = shallowTextClone(dest, src, status); 1.1673 + 1.1674 + // For deep clones, make a copy of the Replaceable. 1.1675 + // The copied Replaceable storage is owned by the newly created UText clone. 
1.1676 + // A non-NULL pointer in UText.p is the signal to the close() function to delete 1.1677 + // it. 1.1678 + // 1.1679 + if (deep && U_SUCCESS(*status)) { 1.1680 + const Replaceable *replSrc = (const Replaceable *)src->context; 1.1681 + dest->context = replSrc->clone(); 1.1682 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1.1683 + 1.1684 + // with deep clone, the copy is writable, even when the source is not. 1.1685 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1.1686 + } 1.1687 + return dest; 1.1688 +} 1.1689 + 1.1690 + 1.1691 +static void U_CALLCONV 1.1692 +repTextClose(UText *ut) { 1.1693 + // Most of the work of close is done by the generic UText framework close. 1.1694 + // All that needs to be done here is delete the Replaceable if the UText 1.1695 + // owns it. This occurs if the UText was created by cloning. 1.1696 + if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1.1697 + Replaceable *rep = (Replaceable *)ut->context; 1.1698 + delete rep; 1.1699 + ut->context = NULL; 1.1700 + } 1.1701 +} 1.1702 + 1.1703 + 1.1704 +static int64_t U_CALLCONV 1.1705 +repTextLength(UText *ut) { 1.1706 + const Replaceable *replSrc = (const Replaceable *)ut->context; 1.1707 + int32_t len = replSrc->length(); 1.1708 + return len; 1.1709 +} 1.1710 + 1.1711 + 1.1712 +static UBool U_CALLCONV 1.1713 +repTextAccess(UText *ut, int64_t index, UBool forward) { 1.1714 + const Replaceable *rep=(const Replaceable *)ut->context; 1.1715 + int32_t length=rep->length(); // Full length of the input text (bigger than a chunk) 1.1716 + 1.1717 + // clip the requested index to the limits of the text. 1.1718 + int32_t index32 = pinIndex(index, length); 1.1719 + U_ASSERT(index<=INT32_MAX); 1.1720 + 1.1721 + 1.1722 + /* 1.1723 + * Compute start/limit boundaries around index, for a segment of text 1.1724 + * to be extracted. 
1.1725 + * To allow for the possibility that our user gave an index to the trailing 1.1726 + * half of a surrogate pair, we must request one extra preceding UChar when 1.1727 + * going in the forward direction. This will ensure that the buffer has the 1.1728 + * entire code point at the specified index. 1.1729 + */ 1.1730 + if(forward) { 1.1731 + 1.1732 + if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) { 1.1733 + // Buffer already contains the requested position. 1.1734 + ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); 1.1735 + return TRUE; 1.1736 + } 1.1737 + if (index32>=length && ut->chunkNativeLimit==length) { 1.1738 + // Request for end of string, and buffer already extends up to it. 1.1739 + // Can't get the data, but don't change the buffer. 1.1740 + ut->chunkOffset = length - (int32_t)ut->chunkNativeStart; 1.1741 + return FALSE; 1.1742 + } 1.1743 + 1.1744 + ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1; 1.1745 + // Going forward, so we want to have the buffer with stuff at and beyond 1.1746 + // the requested index. The -1 gets us one code point before the 1.1747 + // requested index also, to handle the case of the index being on 1.1748 + // a trail surrogate of a surrogate pair. 1.1749 + if(ut->chunkNativeLimit > length) { 1.1750 + ut->chunkNativeLimit = length; 1.1751 + } 1.1752 + // unless buffer ran off end, start is index-1. 1.1753 + ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE; 1.1754 + if(ut->chunkNativeStart < 0) { 1.1755 + ut->chunkNativeStart = 0; 1.1756 + } 1.1757 + } else { 1.1758 + // Reverse iteration. Fill buffer with data preceding the requested index. 1.1759 + if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) { 1.1760 + // Requested position already in buffer. 
1.1761 + ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart; 1.1762 + return TRUE; 1.1763 + } 1.1764 + if (index32==0 && ut->chunkNativeStart==0) { 1.1765 + // Request for start, buffer already begins at start. 1.1766 + // No data, but keep the buffer as is. 1.1767 + ut->chunkOffset = 0; 1.1768 + return FALSE; 1.1769 + } 1.1770 + 1.1771 + // Figure out the bounds of the chunk to extract for reverse iteration. 1.1772 + // Need to worry about chunk not splitting surrogate pairs, and while still 1.1773 + // containing the data we need. 1.1774 + // Fix by requesting a chunk that includes an extra UChar at the end. 1.1775 + // If this turns out to be a lead surrogate, we can lop it off and still have 1.1776 + // the data we wanted. 1.1777 + ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE; 1.1778 + if (ut->chunkNativeStart < 0) { 1.1779 + ut->chunkNativeStart = 0; 1.1780 + } 1.1781 + 1.1782 + ut->chunkNativeLimit = index32 + 1; 1.1783 + if (ut->chunkNativeLimit > length) { 1.1784 + ut->chunkNativeLimit = length; 1.1785 + } 1.1786 + } 1.1787 + 1.1788 + // Extract the new chunk of text from the Replaceable source. 1.1789 + ReplExtra *ex = (ReplExtra *)ut->pExtra; 1.1790 + // UnicodeString with its buffer a writable alias to the chunk buffer 1.1791 + UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/); 1.1792 + rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer); 1.1793 + 1.1794 + ut->chunkContents = ex->s; 1.1795 + ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart); 1.1796 + ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart); 1.1797 + 1.1798 + // Surrogate pairs from the input text must not span chunk boundaries. 1.1799 + // If end of chunk could be the start of a surrogate, trim it off. 
1.1800 + if (ut->chunkNativeLimit < length && 1.1801 + U16_IS_LEAD(ex->s[ut->chunkLength-1])) { 1.1802 + ut->chunkLength--; 1.1803 + ut->chunkNativeLimit--; 1.1804 + if (ut->chunkOffset > ut->chunkLength) { 1.1805 + ut->chunkOffset = ut->chunkLength; 1.1806 + } 1.1807 + } 1.1808 + 1.1809 + // if the first UChar in the chunk could be the trailing half of a surrogate pair, 1.1810 + // trim it off. 1.1811 + if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) { 1.1812 + ++(ut->chunkContents); 1.1813 + ++(ut->chunkNativeStart); 1.1814 + --(ut->chunkLength); 1.1815 + --(ut->chunkOffset); 1.1816 + } 1.1817 + 1.1818 + // adjust the index/chunkOffset to a code point boundary 1.1819 + U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset); 1.1820 + 1.1821 + // Use fast indexing for get/setNativeIndex() 1.1822 + ut->nativeIndexingLimit = ut->chunkLength; 1.1823 + 1.1824 + return TRUE; 1.1825 +} 1.1826 + 1.1827 + 1.1828 + 1.1829 +static int32_t U_CALLCONV 1.1830 +repTextExtract(UText *ut, 1.1831 + int64_t start, int64_t limit, 1.1832 + UChar *dest, int32_t destCapacity, 1.1833 + UErrorCode *status) { 1.1834 + const Replaceable *rep=(const Replaceable *)ut->context; 1.1835 + int32_t length=rep->length(); 1.1836 + 1.1837 + if(U_FAILURE(*status)) { 1.1838 + return 0; 1.1839 + } 1.1840 + if(destCapacity<0 || (dest==NULL && destCapacity>0)) { 1.1841 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.1842 + } 1.1843 + if(start>limit) { 1.1844 + *status=U_INDEX_OUTOFBOUNDS_ERROR; 1.1845 + return 0; 1.1846 + } 1.1847 + 1.1848 + int32_t start32 = pinIndex(start, length); 1.1849 + int32_t limit32 = pinIndex(limit, length); 1.1850 + 1.1851 + // adjust start, limit if they point to trail half of surrogates 1.1852 + if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) && 1.1853 + U_IS_SUPPLEMENTARY(rep->char32At(start32))){ 1.1854 + start32--; 1.1855 + } 1.1856 + if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) && 1.1857 + U_IS_SUPPLEMENTARY(rep->char32At(limit32))){ 1.1858 + limit32--; 
1.1859 + } 1.1860 + 1.1861 + length=limit32-start32; 1.1862 + if(length>destCapacity) { 1.1863 + limit32 = start32 + destCapacity; 1.1864 + } 1.1865 + UnicodeString buffer(dest, 0, destCapacity); // writable alias 1.1866 + rep->extractBetween(start32, limit32, buffer); 1.1867 + repTextAccess(ut, limit32, TRUE); 1.1868 + 1.1869 + return u_terminateUChars(dest, destCapacity, length, status); 1.1870 +} 1.1871 + 1.1872 +static int32_t U_CALLCONV 1.1873 +repTextReplace(UText *ut, 1.1874 + int64_t start, int64_t limit, 1.1875 + const UChar *src, int32_t length, 1.1876 + UErrorCode *status) { 1.1877 + Replaceable *rep=(Replaceable *)ut->context; 1.1878 + int32_t oldLength; 1.1879 + 1.1880 + if(U_FAILURE(*status)) { 1.1881 + return 0; 1.1882 + } 1.1883 + if(src==NULL && length!=0) { 1.1884 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.1885 + return 0; 1.1886 + } 1.1887 + oldLength=rep->length(); // will subtract from new length 1.1888 + if(start>limit ) { 1.1889 + *status=U_INDEX_OUTOFBOUNDS_ERROR; 1.1890 + return 0; 1.1891 + } 1.1892 + 1.1893 + int32_t start32 = pinIndex(start, oldLength); 1.1894 + int32_t limit32 = pinIndex(limit, oldLength); 1.1895 + 1.1896 + // Snap start & limit to code point boundaries. 1.1897 + if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) && 1.1898 + start32>0 && U16_IS_LEAD(rep->charAt(start32-1))) 1.1899 + { 1.1900 + start32--; 1.1901 + } 1.1902 + if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) && 1.1903 + U16_IS_TRAIL(rep->charAt(limit32))) 1.1904 + { 1.1905 + limit32++; 1.1906 + } 1.1907 + 1.1908 + // Do the actual replace operation using methods of the Replaceable class 1.1909 + UnicodeString replStr((UBool)(length<0), src, length); // read-only alias 1.1910 + rep->handleReplaceBetween(start32, limit32, replStr); 1.1911 + int32_t newLength = rep->length(); 1.1912 + int32_t lengthDelta = newLength - oldLength; 1.1913 + 1.1914 + // Is the UText chunk buffer OK? 
1.1915 + if (ut->chunkNativeLimit > start32) { 1.1916 + // this replace operation may have impacted the current chunk. 1.1917 + // invalidate it, which will force a reload on the next access. 1.1918 + invalidateChunk(ut); 1.1919 + } 1.1920 + 1.1921 + // set the iteration position to the end of the newly inserted replacement text. 1.1922 + int32_t newIndexPos = limit32 + lengthDelta; 1.1923 + repTextAccess(ut, newIndexPos, TRUE); 1.1924 + 1.1925 + return lengthDelta; 1.1926 +} 1.1927 + 1.1928 + 1.1929 +static void U_CALLCONV 1.1930 +repTextCopy(UText *ut, 1.1931 + int64_t start, int64_t limit, 1.1932 + int64_t destIndex, 1.1933 + UBool move, 1.1934 + UErrorCode *status) 1.1935 +{ 1.1936 + Replaceable *rep=(Replaceable *)ut->context; 1.1937 + int32_t length=rep->length(); 1.1938 + 1.1939 + if(U_FAILURE(*status)) { 1.1940 + return; 1.1941 + } 1.1942 + if (start>limit || (start<destIndex && destIndex<limit)) 1.1943 + { 1.1944 + *status=U_INDEX_OUTOFBOUNDS_ERROR; 1.1945 + return; 1.1946 + } 1.1947 + 1.1948 + int32_t start32 = pinIndex(start, length); 1.1949 + int32_t limit32 = pinIndex(limit, length); 1.1950 + int32_t destIndex32 = pinIndex(destIndex, length); 1.1951 + 1.1952 + // TODO: snap input parameters to code point boundaries. 1.1953 + 1.1954 + if(move) { 1.1955 + // move: copy to destIndex, then replace original with nothing 1.1956 + int32_t segLength=limit32-start32; 1.1957 + rep->copy(start32, limit32, destIndex32); 1.1958 + if(destIndex32<start32) { 1.1959 + start32+=segLength; 1.1960 + limit32+=segLength; 1.1961 + } 1.1962 + rep->handleReplaceBetween(start32, limit32, UnicodeString()); 1.1963 + } else { 1.1964 + // copy 1.1965 + rep->copy(start32, limit32, destIndex32); 1.1966 + } 1.1967 + 1.1968 + // If the change to the text touched the region in the chunk buffer, 1.1969 + // invalidate the buffer. 
1.1970 + int32_t firstAffectedIndex = destIndex32; 1.1971 + if (move && start32<firstAffectedIndex) { 1.1972 + firstAffectedIndex = start32; 1.1973 + } 1.1974 + if (firstAffectedIndex < ut->chunkNativeLimit) { 1.1975 + // changes may have affected range covered by the chunk 1.1976 + invalidateChunk(ut); 1.1977 + } 1.1978 + 1.1979 + // Put iteration position at the newly inserted (moved) block, 1.1980 + int32_t nativeIterIndex = destIndex32 + limit32 - start32; 1.1981 + if (move && destIndex32>start32) { 1.1982 + // moved a block of text towards the end of the string. 1.1983 + nativeIterIndex = destIndex32; 1.1984 + } 1.1985 + 1.1986 + // Set position, reload chunk if needed. 1.1987 + repTextAccess(ut, nativeIterIndex, TRUE); 1.1988 +} 1.1989 + 1.1990 +static const struct UTextFuncs repFuncs = 1.1991 +{ 1.1992 + sizeof(UTextFuncs), 1.1993 + 0, 0, 0, // Reserved alignment padding 1.1994 + repTextClone, 1.1995 + repTextLength, 1.1996 + repTextAccess, 1.1997 + repTextExtract, 1.1998 + repTextReplace, 1.1999 + repTextCopy, 1.2000 + NULL, // MapOffsetToNative, 1.2001 + NULL, // MapIndexToUTF16, 1.2002 + repTextClose, 1.2003 + NULL, // spare 1 1.2004 + NULL, // spare 2 1.2005 + NULL // spare 3 1.2006 +}; 1.2007 + 1.2008 + 1.2009 +U_CAPI UText * U_EXPORT2 1.2010 +utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) 1.2011 +{ 1.2012 + if(U_FAILURE(*status)) { 1.2013 + return NULL; 1.2014 + } 1.2015 + if(rep==NULL) { 1.2016 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.2017 + return NULL; 1.2018 + } 1.2019 + ut = utext_setup(ut, sizeof(ReplExtra), status); 1.2020 + 1.2021 + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1.2022 + if(rep->hasMetaData()) { 1.2023 + ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA); 1.2024 + } 1.2025 + 1.2026 + ut->pFuncs = &repFuncs; 1.2027 + ut->context = rep; 1.2028 + return ut; 1.2029 +} 1.2030 + 1.2031 +U_CDECL_END 1.2032 + 1.2033 + 1.2034 + 1.2035 + 1.2036 + 1.2037 + 1.2038 + 1.2039 + 1.2040 
+//------------------------------------------------------------------------------ 1.2041 +// 1.2042 +// UText implementation for UnicodeString (read/write) and 1.2043 +// for const UnicodeString (read only) 1.2044 +// (same implementation, only the flags are different) 1.2045 +// 1.2046 +// Use of UText data members: 1.2047 +// context pointer to UnicodeString 1.2048 +// p pointer to UnicodeString IF this UText owns the string 1.2049 +// and it must be deleted on close(). NULL otherwise. 1.2050 +// 1.2051 +//------------------------------------------------------------------------------ 1.2052 + 1.2053 +U_CDECL_BEGIN 1.2054 + 1.2055 + 1.2056 +static UText * U_CALLCONV 1.2057 +unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { 1.2058 + // First do a generic shallow clone. Does everything needed for the UText struct itself. 1.2059 + dest = shallowTextClone(dest, src, status); 1.2060 + 1.2061 + // For deep clones, make a copy of the UnicodeSring. 1.2062 + // The copied UnicodeString storage is owned by the newly created UText clone. 1.2063 + // A non-NULL pointer in UText.p is the signal to the close() function to delete 1.2064 + // the UText. 1.2065 + // 1.2066 + if (deep && U_SUCCESS(*status)) { 1.2067 + const UnicodeString *srcString = (const UnicodeString *)src->context; 1.2068 + dest->context = new UnicodeString(*srcString); 1.2069 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1.2070 + 1.2071 + // with deep clone, the copy is writable, even when the source is not. 1.2072 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1.2073 + } 1.2074 + return dest; 1.2075 +} 1.2076 + 1.2077 +static void U_CALLCONV 1.2078 +unistrTextClose(UText *ut) { 1.2079 + // Most of the work of close is done by the generic UText framework close. 1.2080 + // All that needs to be done here is delete the UnicodeString if the UText 1.2081 + // owns it. This occurs if the UText was created by cloning. 
1.2082 + if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1.2083 + UnicodeString *str = (UnicodeString *)ut->context; 1.2084 + delete str; 1.2085 + ut->context = NULL; 1.2086 + } 1.2087 +} 1.2088 + 1.2089 + 1.2090 +static int64_t U_CALLCONV 1.2091 +unistrTextLength(UText *t) { 1.2092 + return ((const UnicodeString *)t->context)->length(); 1.2093 +} 1.2094 + 1.2095 + 1.2096 +static UBool U_CALLCONV 1.2097 +unistrTextAccess(UText *ut, int64_t index, UBool forward) { 1.2098 + int32_t length = ut->chunkLength; 1.2099 + ut->chunkOffset = pinIndex(index, length); 1.2100 + 1.2101 + // Check whether request is at the start or end 1.2102 + UBool retVal = (forward && index<length) || (!forward && index>0); 1.2103 + return retVal; 1.2104 +} 1.2105 + 1.2106 + 1.2107 + 1.2108 +static int32_t U_CALLCONV 1.2109 +unistrTextExtract(UText *t, 1.2110 + int64_t start, int64_t limit, 1.2111 + UChar *dest, int32_t destCapacity, 1.2112 + UErrorCode *pErrorCode) { 1.2113 + const UnicodeString *us=(const UnicodeString *)t->context; 1.2114 + int32_t length=us->length(); 1.2115 + 1.2116 + if(U_FAILURE(*pErrorCode)) { 1.2117 + return 0; 1.2118 + } 1.2119 + if(destCapacity<0 || (dest==NULL && destCapacity>0)) { 1.2120 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.2121 + } 1.2122 + if(start<0 || start>limit) { 1.2123 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2124 + return 0; 1.2125 + } 1.2126 + 1.2127 + int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length; 1.2128 + int32_t limit32 = limit<length ? 
us->getChar32Start((int32_t)limit) : length; 1.2129 + 1.2130 + length=limit32-start32; 1.2131 + if (destCapacity>0 && dest!=NULL) { 1.2132 + int32_t trimmedLength = length; 1.2133 + if(trimmedLength>destCapacity) { 1.2134 + trimmedLength=destCapacity; 1.2135 + } 1.2136 + us->extract(start32, trimmedLength, dest); 1.2137 + t->chunkOffset = start32+trimmedLength; 1.2138 + } else { 1.2139 + t->chunkOffset = start32; 1.2140 + } 1.2141 + u_terminateUChars(dest, destCapacity, length, pErrorCode); 1.2142 + return length; 1.2143 +} 1.2144 + 1.2145 +static int32_t U_CALLCONV 1.2146 +unistrTextReplace(UText *ut, 1.2147 + int64_t start, int64_t limit, 1.2148 + const UChar *src, int32_t length, 1.2149 + UErrorCode *pErrorCode) { 1.2150 + UnicodeString *us=(UnicodeString *)ut->context; 1.2151 + int32_t oldLength; 1.2152 + 1.2153 + if(U_FAILURE(*pErrorCode)) { 1.2154 + return 0; 1.2155 + } 1.2156 + if(src==NULL && length!=0) { 1.2157 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.2158 + } 1.2159 + if(start>limit) { 1.2160 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2161 + return 0; 1.2162 + } 1.2163 + oldLength=us->length(); 1.2164 + int32_t start32 = pinIndex(start, oldLength); 1.2165 + int32_t limit32 = pinIndex(limit, oldLength); 1.2166 + if (start32 < oldLength) { 1.2167 + start32 = us->getChar32Start(start32); 1.2168 + } 1.2169 + if (limit32 < oldLength) { 1.2170 + limit32 = us->getChar32Start(limit32); 1.2171 + } 1.2172 + 1.2173 + // replace 1.2174 + us->replace(start32, limit32-start32, src, length); 1.2175 + int32_t newLength = us->length(); 1.2176 + 1.2177 + // Update the chunk description. 1.2178 + ut->chunkContents = us->getBuffer(); 1.2179 + ut->chunkLength = newLength; 1.2180 + ut->chunkNativeLimit = newLength; 1.2181 + ut->nativeIndexingLimit = newLength; 1.2182 + 1.2183 + // Set iteration position to the point just following the newly inserted text. 
1.2184 + int32_t lengthDelta = newLength - oldLength; 1.2185 + ut->chunkOffset = limit32 + lengthDelta; 1.2186 + 1.2187 + return lengthDelta; 1.2188 +} 1.2189 + 1.2190 +static void U_CALLCONV 1.2191 +unistrTextCopy(UText *ut, 1.2192 + int64_t start, int64_t limit, 1.2193 + int64_t destIndex, 1.2194 + UBool move, 1.2195 + UErrorCode *pErrorCode) { 1.2196 + UnicodeString *us=(UnicodeString *)ut->context; 1.2197 + int32_t length=us->length(); 1.2198 + 1.2199 + if(U_FAILURE(*pErrorCode)) { 1.2200 + return; 1.2201 + } 1.2202 + int32_t start32 = pinIndex(start, length); 1.2203 + int32_t limit32 = pinIndex(limit, length); 1.2204 + int32_t destIndex32 = pinIndex(destIndex, length); 1.2205 + 1.2206 + if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) { 1.2207 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.2208 + return; 1.2209 + } 1.2210 + 1.2211 + if(move) { 1.2212 + // move: copy to destIndex, then replace original with nothing 1.2213 + int32_t segLength=limit32-start32; 1.2214 + us->copy(start32, limit32, destIndex32); 1.2215 + if(destIndex32<start32) { 1.2216 + start32+=segLength; 1.2217 + } 1.2218 + us->replace(start32, segLength, NULL, 0); 1.2219 + } else { 1.2220 + // copy 1.2221 + us->copy(start32, limit32, destIndex32); 1.2222 + } 1.2223 + 1.2224 + // update chunk description, set iteration position. 1.2225 + ut->chunkContents = us->getBuffer(); 1.2226 + if (move==FALSE) { 1.2227 + // copy operation, string length grows 1.2228 + ut->chunkLength += limit32-start32; 1.2229 + ut->chunkNativeLimit = ut->chunkLength; 1.2230 + ut->nativeIndexingLimit = ut->chunkLength; 1.2231 + } 1.2232 + 1.2233 + // Iteration position to end of the newly inserted text. 
1.2234 + ut->chunkOffset = destIndex32+limit32-start32; 1.2235 + if (move && destIndex32>start32) { 1.2236 + ut->chunkOffset = destIndex32; 1.2237 + } 1.2238 + 1.2239 +} 1.2240 + 1.2241 +static const struct UTextFuncs unistrFuncs = 1.2242 +{ 1.2243 + sizeof(UTextFuncs), 1.2244 + 0, 0, 0, // Reserved alignment padding 1.2245 + unistrTextClone, 1.2246 + unistrTextLength, 1.2247 + unistrTextAccess, 1.2248 + unistrTextExtract, 1.2249 + unistrTextReplace, 1.2250 + unistrTextCopy, 1.2251 + NULL, // MapOffsetToNative, 1.2252 + NULL, // MapIndexToUTF16, 1.2253 + unistrTextClose, 1.2254 + NULL, // spare 1 1.2255 + NULL, // spare 2 1.2256 + NULL // spare 3 1.2257 +}; 1.2258 + 1.2259 + 1.2260 + 1.2261 +U_CDECL_END 1.2262 + 1.2263 + 1.2264 +U_CAPI UText * U_EXPORT2 1.2265 +utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { 1.2266 + ut = utext_openConstUnicodeString(ut, s, status); 1.2267 + if (U_SUCCESS(*status)) { 1.2268 + ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); 1.2269 + } 1.2270 + return ut; 1.2271 +} 1.2272 + 1.2273 + 1.2274 + 1.2275 +U_CAPI UText * U_EXPORT2 1.2276 +utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) { 1.2277 + if (U_SUCCESS(*status) && s->isBogus()) { 1.2278 + // The UnicodeString is bogus, but we still need to detach the UText 1.2279 + // from whatever it was hooked to before, if anything. 1.2280 + utext_openUChars(ut, NULL, 0, status); 1.2281 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2282 + return ut; 1.2283 + } 1.2284 + ut = utext_setup(ut, 0, status); 1.2285 + // note: use the standard (writable) function table for UnicodeString. 1.2286 + // The flag settings disable writing, so having the functions in 1.2287 + // the table is harmless. 
1.2288 + if (U_SUCCESS(*status)) { 1.2289 + ut->pFuncs = &unistrFuncs; 1.2290 + ut->context = s; 1.2291 + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 1.2292 + ut->chunkContents = s->getBuffer(); 1.2293 + ut->chunkLength = s->length(); 1.2294 + ut->chunkNativeStart = 0; 1.2295 + ut->chunkNativeLimit = ut->chunkLength; 1.2296 + ut->nativeIndexingLimit = ut->chunkLength; 1.2297 + } 1.2298 + return ut; 1.2299 +} 1.2300 + 1.2301 +//------------------------------------------------------------------------------ 1.2302 +// 1.2303 +// UText implementation for const UChar * strings 1.2304 +// 1.2305 +// Use of UText data members: 1.2306 +// context pointer to UnicodeString 1.2307 +// a length. -1 if not yet known. 1.2308 +// 1.2309 +// TODO: support 64 bit lengths. 1.2310 +// 1.2311 +//------------------------------------------------------------------------------ 1.2312 + 1.2313 +U_CDECL_BEGIN 1.2314 + 1.2315 + 1.2316 +static UText * U_CALLCONV 1.2317 +ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) { 1.2318 + // First do a generic shallow clone. 1.2319 + dest = shallowTextClone(dest, src, status); 1.2320 + 1.2321 + // For deep clones, make a copy of the string. 1.2322 + // The copied storage is owned by the newly created clone. 1.2323 + // A non-NULL pointer in UText.p is the signal to the close() function to delete 1.2324 + // it. 1.2325 + // 1.2326 + if (deep && U_SUCCESS(*status)) { 1.2327 + U_ASSERT(utext_nativeLength(dest) < INT32_MAX); 1.2328 + int32_t len = (int32_t)utext_nativeLength(dest); 1.2329 + 1.2330 + // The cloned string IS going to be NUL terminated, whether or not the original was. 
1.2331 + const UChar *srcStr = (const UChar *)src->context; 1.2332 + UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar)); 1.2333 + if (copyStr == NULL) { 1.2334 + *status = U_MEMORY_ALLOCATION_ERROR; 1.2335 + } else { 1.2336 + int64_t i; 1.2337 + for (i=0; i<len; i++) { 1.2338 + copyStr[i] = srcStr[i]; 1.2339 + } 1.2340 + copyStr[len] = 0; 1.2341 + dest->context = copyStr; 1.2342 + dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); 1.2343 + } 1.2344 + } 1.2345 + return dest; 1.2346 +} 1.2347 + 1.2348 + 1.2349 +static void U_CALLCONV 1.2350 +ucstrTextClose(UText *ut) { 1.2351 + // Most of the work of close is done by the generic UText framework close. 1.2352 + // All that needs to be done here is delete the string if the UText 1.2353 + // owns it. This occurs if the UText was created by cloning. 1.2354 + if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { 1.2355 + UChar *s = (UChar *)ut->context; 1.2356 + uprv_free(s); 1.2357 + ut->context = NULL; 1.2358 + } 1.2359 +} 1.2360 + 1.2361 + 1.2362 + 1.2363 +static int64_t U_CALLCONV 1.2364 +ucstrTextLength(UText *ut) { 1.2365 + if (ut->a < 0) { 1.2366 + // null terminated, we don't yet know the length. Scan for it. 1.2367 + // Access is not convenient for doing this 1.2368 + // because the current interation postion can't be changed. 
1.2369 + const UChar *str = (const UChar *)ut->context; 1.2370 + for (;;) { 1.2371 + if (str[ut->chunkNativeLimit] == 0) { 1.2372 + break; 1.2373 + } 1.2374 + ut->chunkNativeLimit++; 1.2375 + } 1.2376 + ut->a = ut->chunkNativeLimit; 1.2377 + ut->chunkLength = (int32_t)ut->chunkNativeLimit; 1.2378 + ut->nativeIndexingLimit = ut->chunkLength; 1.2379 + ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1.2380 + } 1.2381 + return ut->a; 1.2382 +} 1.2383 + 1.2384 + 1.2385 +static UBool U_CALLCONV 1.2386 +ucstrTextAccess(UText *ut, int64_t index, UBool forward) { 1.2387 + const UChar *str = (const UChar *)ut->context; 1.2388 + 1.2389 + // pin the requested index to the bounds of the string, 1.2390 + // and set current iteration position. 1.2391 + if (index<0) { 1.2392 + index = 0; 1.2393 + } else if (index < ut->chunkNativeLimit) { 1.2394 + // The request data is within the chunk as it is known so far. 1.2395 + // Put index on a code point boundary. 1.2396 + U16_SET_CP_START(str, 0, index); 1.2397 + } else if (ut->a >= 0) { 1.2398 + // We know the length of this string, and the user is requesting something 1.2399 + // at or beyond the length. Pin the requested index to the length. 1.2400 + index = ut->a; 1.2401 + } else { 1.2402 + // Null terminated string, length not yet known, and the requested index 1.2403 + // is beyond where we have scanned so far. 1.2404 + // Scan to 32 UChars beyond the requested index. The strategy here is 1.2405 + // to avoid fully scanning a long string when the caller only wants to 1.2406 + // see a few characters at its beginning. 1.2407 + int32_t scanLimit = (int32_t)index + 32; 1.2408 + if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression 1.2409 + scanLimit = INT32_MAX; 1.2410 + } 1.2411 + 1.2412 + int32_t chunkLimit = (int32_t)ut->chunkNativeLimit; 1.2413 + for (; chunkLimit<scanLimit; chunkLimit++) { 1.2414 + if (str[chunkLimit] == 0) { 1.2415 + // We found the end of the string. 
Remember it, pin the requested index to it, 1.2416 + // and bail out of here. 1.2417 + ut->a = chunkLimit; 1.2418 + ut->chunkLength = chunkLimit; 1.2419 + ut->nativeIndexingLimit = chunkLimit; 1.2420 + if (index >= chunkLimit) { 1.2421 + index = chunkLimit; 1.2422 + } else { 1.2423 + U16_SET_CP_START(str, 0, index); 1.2424 + } 1.2425 + 1.2426 + ut->chunkNativeLimit = chunkLimit; 1.2427 + ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1.2428 + goto breakout; 1.2429 + } 1.2430 + } 1.2431 + // We scanned through the next batch of UChars without finding the end. 1.2432 + U16_SET_CP_START(str, 0, index); 1.2433 + if (chunkLimit == INT32_MAX) { 1.2434 + // Scanned to the limit of a 32 bit length. 1.2435 + // Forceably trim the overlength string back so length fits in int32 1.2436 + // TODO: add support for 64 bit strings. 1.2437 + ut->a = chunkLimit; 1.2438 + ut->chunkLength = chunkLimit; 1.2439 + ut->nativeIndexingLimit = chunkLimit; 1.2440 + if (index > chunkLimit) { 1.2441 + index = chunkLimit; 1.2442 + } 1.2443 + ut->chunkNativeLimit = chunkLimit; 1.2444 + ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1.2445 + } else { 1.2446 + // The endpoint of a chunk must not be left in the middle of a surrogate pair. 1.2447 + // If the current end is on a lead surrogate, back the end up by one. 1.2448 + // It doesn't matter if the end char happens to be an unpaired surrogate, 1.2449 + // and it's simpler not to worry about it. 1.2450 + if (U16_IS_LEAD(str[chunkLimit-1])) { 1.2451 + --chunkLimit; 1.2452 + } 1.2453 + // Null-terminated chunk with end still unknown. 1.2454 + // Update the chunk length to reflect what has been scanned thus far. 1.2455 + // That the full length is still unknown is (still) flagged by 1.2456 + // ut->a being < 0. 
1.2457 + ut->chunkNativeLimit = chunkLimit; 1.2458 + ut->nativeIndexingLimit = chunkLimit; 1.2459 + ut->chunkLength = chunkLimit; 1.2460 + } 1.2461 + 1.2462 + } 1.2463 +breakout: 1.2464 + U_ASSERT(index<=INT32_MAX); 1.2465 + ut->chunkOffset = (int32_t)index; 1.2466 + 1.2467 + // Check whether request is at the start or end 1.2468 + UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0); 1.2469 + return retVal; 1.2470 +} 1.2471 + 1.2472 + 1.2473 + 1.2474 +static int32_t U_CALLCONV 1.2475 +ucstrTextExtract(UText *ut, 1.2476 + int64_t start, int64_t limit, 1.2477 + UChar *dest, int32_t destCapacity, 1.2478 + UErrorCode *pErrorCode) 1.2479 +{ 1.2480 + if(U_FAILURE(*pErrorCode)) { 1.2481 + return 0; 1.2482 + } 1.2483 + if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { 1.2484 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.2485 + return 0; 1.2486 + } 1.2487 + 1.2488 + //const UChar *s=(const UChar *)ut->context; 1.2489 + int32_t si, di; 1.2490 + 1.2491 + int32_t start32; 1.2492 + int32_t limit32; 1.2493 + 1.2494 + // Access the start. Does two things we need: 1.2495 + // Pins 'start' to the length of the string, if it came in out-of-bounds. 1.2496 + // Snaps 'start' to the beginning of a code point. 1.2497 + ucstrTextAccess(ut, start, TRUE); 1.2498 + const UChar *s=ut->chunkContents; 1.2499 + start32 = ut->chunkOffset; 1.2500 + 1.2501 + int32_t strLength=(int32_t)ut->a; 1.2502 + if (strLength >= 0) { 1.2503 + limit32 = pinIndex(limit, strLength); 1.2504 + } else { 1.2505 + limit32 = pinIndex(limit, INT32_MAX); 1.2506 + } 1.2507 + di = 0; 1.2508 + for (si=start32; si<limit32; si++) { 1.2509 + if (strLength<0 && s[si]==0) { 1.2510 + // Just hit the end of a null-terminated string. 
1.2511 + ut->a = si; // set string length for this UText 1.2512 + ut->chunkNativeLimit = si; 1.2513 + ut->chunkLength = si; 1.2514 + ut->nativeIndexingLimit = si; 1.2515 + strLength = si; 1.2516 + break; 1.2517 + } 1.2518 + U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */ 1.2519 + if (di<destCapacity) { 1.2520 + // only store if there is space. 1.2521 + dest[di] = s[si]; 1.2522 + } else { 1.2523 + if (strLength>=0) { 1.2524 + // We have filled the destination buffer, and the string length is known. 1.2525 + // Cut the loop short. There is no need to scan string termination. 1.2526 + di = limit32 - start32; 1.2527 + si = limit32; 1.2528 + break; 1.2529 + } 1.2530 + } 1.2531 + di++; 1.2532 + } 1.2533 + 1.2534 + // If the limit index points to a lead surrogate of a pair, 1.2535 + // add the corresponding trail surrogate to the destination. 1.2536 + if (si>0 && U16_IS_LEAD(s[si-1]) && 1.2537 + ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si]))) 1.2538 + { 1.2539 + if (di<destCapacity) { 1.2540 + // store only if there is space in the output buffer. 1.2541 + dest[di++] = s[si++]; 1.2542 + } 1.2543 + } 1.2544 + 1.2545 + // Put iteration position at the point just following the extracted text 1.2546 + ut->chunkOffset = uprv_min(strLength, start32 + destCapacity); 1.2547 + 1.2548 + // Add a terminating NUL if space in the buffer permits, 1.2549 + // and set the error status as required. 
1.2550 + u_terminateUChars(dest, destCapacity, di, pErrorCode); 1.2551 + return di; 1.2552 +} 1.2553 + 1.2554 +static const struct UTextFuncs ucstrFuncs = 1.2555 +{ 1.2556 + sizeof(UTextFuncs), 1.2557 + 0, 0, 0, // Reserved alignment padding 1.2558 + ucstrTextClone, 1.2559 + ucstrTextLength, 1.2560 + ucstrTextAccess, 1.2561 + ucstrTextExtract, 1.2562 + NULL, // Replace 1.2563 + NULL, // Copy 1.2564 + NULL, // MapOffsetToNative, 1.2565 + NULL, // MapIndexToUTF16, 1.2566 + ucstrTextClose, 1.2567 + NULL, // spare 1 1.2568 + NULL, // spare 2 1.2569 + NULL, // spare 3 1.2570 +}; 1.2571 + 1.2572 +U_CDECL_END 1.2573 + 1.2574 +static const UChar gEmptyUString[] = {0}; 1.2575 + 1.2576 +U_CAPI UText * U_EXPORT2 1.2577 +utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) { 1.2578 + if (U_FAILURE(*status)) { 1.2579 + return NULL; 1.2580 + } 1.2581 + if(s==NULL && length==0) { 1.2582 + s = gEmptyUString; 1.2583 + } 1.2584 + if (s==NULL || length < -1 || length>INT32_MAX) { 1.2585 + *status = U_ILLEGAL_ARGUMENT_ERROR; 1.2586 + return NULL; 1.2587 + } 1.2588 + ut = utext_setup(ut, 0, status); 1.2589 + if (U_SUCCESS(*status)) { 1.2590 + ut->pFuncs = &ucstrFuncs; 1.2591 + ut->context = s; 1.2592 + ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); 1.2593 + if (length==-1) { 1.2594 + ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); 1.2595 + } 1.2596 + ut->a = length; 1.2597 + ut->chunkContents = s; 1.2598 + ut->chunkNativeStart = 0; 1.2599 + ut->chunkNativeLimit = length>=0? 
length : 0; 1.2600 + ut->chunkLength = (int32_t)ut->chunkNativeLimit; 1.2601 + ut->chunkOffset = 0; 1.2602 + ut->nativeIndexingLimit = ut->chunkLength; 1.2603 + } 1.2604 + return ut; 1.2605 +} 1.2606 + 1.2607 + 1.2608 +//------------------------------------------------------------------------------ 1.2609 +// 1.2610 +// UText implementation for text from ICU CharacterIterators 1.2611 +// 1.2612 +// Use of UText data members: 1.2613 +// context pointer to the CharacterIterator 1.2614 +// a length of the full text. 1.2615 +// p pointer to buffer 1 1.2616 +// b start index of local buffer 1 contents 1.2617 +// q pointer to buffer 2 1.2618 +// c start index of local buffer 2 contents 1.2619 +// r pointer to the character iterator if the UText owns it. 1.2620 +// Null otherwise. 1.2621 +// 1.2622 +//------------------------------------------------------------------------------ 1.2623 +#define CIBufSize 16 1.2624 + 1.2625 +U_CDECL_BEGIN 1.2626 +static void U_CALLCONV 1.2627 +charIterTextClose(UText *ut) { 1.2628 + // Most of the work of close is done by the generic UText framework close. 1.2629 + // All that needs to be done here is delete the CharacterIterator if the UText 1.2630 + // owns it. This occurs if the UText was created by cloning. 
1.2631 + CharacterIterator *ci = (CharacterIterator *)ut->r; 1.2632 + delete ci; 1.2633 + ut->r = NULL; 1.2634 +} 1.2635 + 1.2636 +static int64_t U_CALLCONV 1.2637 +charIterTextLength(UText *ut) { 1.2638 + return (int32_t)ut->a; 1.2639 +} 1.2640 + 1.2641 +static UBool U_CALLCONV 1.2642 +charIterTextAccess(UText *ut, int64_t index, UBool forward) { 1.2643 + CharacterIterator *ci = (CharacterIterator *)ut->context; 1.2644 + 1.2645 + int32_t clippedIndex = (int32_t)index; 1.2646 + if (clippedIndex<0) { 1.2647 + clippedIndex=0; 1.2648 + } else if (clippedIndex>=ut->a) { 1.2649 + clippedIndex=(int32_t)ut->a; 1.2650 + } 1.2651 + int32_t neededIndex = clippedIndex; 1.2652 + if (!forward && neededIndex>0) { 1.2653 + // reverse iteration, want the position just before what was asked for. 1.2654 + neededIndex--; 1.2655 + } else if (forward && neededIndex==ut->a && neededIndex>0) { 1.2656 + // Forward iteration, don't ask for something past the end of the text. 1.2657 + neededIndex--; 1.2658 + } 1.2659 + 1.2660 + // Find the native index of the start of the buffer containing what we want. 1.2661 + neededIndex -= neededIndex % CIBufSize; 1.2662 + 1.2663 + UChar *buf = NULL; 1.2664 + UBool needChunkSetup = TRUE; 1.2665 + int i; 1.2666 + if (ut->chunkNativeStart == neededIndex) { 1.2667 + // The buffer we want is already the current chunk. 1.2668 + needChunkSetup = FALSE; 1.2669 + } else if (ut->b == neededIndex) { 1.2670 + // The first buffer (buffer p) has what we need. 1.2671 + buf = (UChar *)ut->p; 1.2672 + } else if (ut->c == neededIndex) { 1.2673 + // The second buffer (buffer q) has what we need. 1.2674 + buf = (UChar *)ut->q; 1.2675 + } else { 1.2676 + // Neither buffer already has what we need. 1.2677 + // Load new data from the character iterator. 1.2678 + // Use the buf that is not the current buffer. 
1.2679 + buf = (UChar *)ut->p; 1.2680 + if (ut->p == ut->chunkContents) { 1.2681 + buf = (UChar *)ut->q; 1.2682 + } 1.2683 + ci->setIndex(neededIndex); 1.2684 + for (i=0; i<CIBufSize; i++) { 1.2685 + buf[i] = ci->nextPostInc(); 1.2686 + if (i+neededIndex > ut->a) { 1.2687 + break; 1.2688 + } 1.2689 + } 1.2690 + } 1.2691 + 1.2692 + // We have a buffer with the data we need. 1.2693 + // Set it up as the current chunk, if it wasn't already. 1.2694 + if (needChunkSetup) { 1.2695 + ut->chunkContents = buf; 1.2696 + ut->chunkLength = CIBufSize; 1.2697 + ut->chunkNativeStart = neededIndex; 1.2698 + ut->chunkNativeLimit = neededIndex + CIBufSize; 1.2699 + if (ut->chunkNativeLimit > ut->a) { 1.2700 + ut->chunkNativeLimit = ut->a; 1.2701 + ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart); 1.2702 + } 1.2703 + ut->nativeIndexingLimit = ut->chunkLength; 1.2704 + U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize); 1.2705 + } 1.2706 + ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart; 1.2707 + UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0); 1.2708 + return success; 1.2709 +} 1.2710 + 1.2711 +static UText * U_CALLCONV 1.2712 +charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) { 1.2713 + if (U_FAILURE(*status)) { 1.2714 + return NULL; 1.2715 + } 1.2716 + 1.2717 + if (deep) { 1.2718 + // There is no CharacterIterator API for cloning the underlying text storage. 1.2719 + *status = U_UNSUPPORTED_ERROR; 1.2720 + return NULL; 1.2721 + } else { 1.2722 + CharacterIterator *srcCI =(CharacterIterator *)src->context; 1.2723 + srcCI = srcCI->clone(); 1.2724 + dest = utext_openCharacterIterator(dest, srcCI, status); 1.2725 + // cast off const on getNativeIndex. 1.2726 + // For CharacterIterator based UTexts, this is safe, the operation is const. 
1.2727 + int64_t ix = utext_getNativeIndex((UText *)src); 1.2728 + utext_setNativeIndex(dest, ix); 1.2729 + dest->r = srcCI; // flags that this UText owns the CharacterIterator 1.2730 + } 1.2731 + return dest; 1.2732 +} 1.2733 + 1.2734 +static int32_t U_CALLCONV 1.2735 +charIterTextExtract(UText *ut, 1.2736 + int64_t start, int64_t limit, 1.2737 + UChar *dest, int32_t destCapacity, 1.2738 + UErrorCode *status) 1.2739 +{ 1.2740 + if(U_FAILURE(*status)) { 1.2741 + return 0; 1.2742 + } 1.2743 + if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { 1.2744 + *status=U_ILLEGAL_ARGUMENT_ERROR; 1.2745 + return 0; 1.2746 + } 1.2747 + int32_t length = (int32_t)ut->a; 1.2748 + int32_t start32 = pinIndex(start, length); 1.2749 + int32_t limit32 = pinIndex(limit, length); 1.2750 + int32_t desti = 0; 1.2751 + int32_t srci; 1.2752 + int32_t copyLimit; 1.2753 + 1.2754 + CharacterIterator *ci = (CharacterIterator *)ut->context; 1.2755 + ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed. 
1.2756 + srci = ci->getIndex(); 1.2757 + copyLimit = srci; 1.2758 + while (srci<limit32) { 1.2759 + UChar32 c = ci->next32PostInc(); 1.2760 + int32_t len = U16_LENGTH(c); 1.2761 + U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */ 1.2762 + if (desti+len <= destCapacity) { 1.2763 + U16_APPEND_UNSAFE(dest, desti, c); 1.2764 + copyLimit = srci+len; 1.2765 + } else { 1.2766 + desti += len; 1.2767 + *status = U_BUFFER_OVERFLOW_ERROR; 1.2768 + } 1.2769 + srci += len; 1.2770 + } 1.2771 + 1.2772 + charIterTextAccess(ut, copyLimit, TRUE); 1.2773 + 1.2774 + u_terminateUChars(dest, destCapacity, desti, status); 1.2775 + return desti; 1.2776 +} 1.2777 + 1.2778 +static const struct UTextFuncs charIterFuncs = 1.2779 +{ 1.2780 + sizeof(UTextFuncs), 1.2781 + 0, 0, 0, // Reserved alignment padding 1.2782 + charIterTextClone, 1.2783 + charIterTextLength, 1.2784 + charIterTextAccess, 1.2785 + charIterTextExtract, 1.2786 + NULL, // Replace 1.2787 + NULL, // Copy 1.2788 + NULL, // MapOffsetToNative, 1.2789 + NULL, // MapIndexToUTF16, 1.2790 + charIterTextClose, 1.2791 + NULL, // spare 1 1.2792 + NULL, // spare 2 1.2793 + NULL // spare 3 1.2794 +}; 1.2795 +U_CDECL_END 1.2796 + 1.2797 + 1.2798 +U_CAPI UText * U_EXPORT2 1.2799 +utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) { 1.2800 + if (U_FAILURE(*status)) { 1.2801 + return NULL; 1.2802 + } 1.2803 + 1.2804 + if (ci->startIndex() > 0) { 1.2805 + // No support for CharacterIterators that do not start indexing from zero. 1.2806 + *status = U_UNSUPPORTED_ERROR; 1.2807 + return NULL; 1.2808 + } 1.2809 + 1.2810 + // Extra space in UText for 2 buffers of CIBufSize UChars each. 
1.2811 + int32_t extraSpace = 2 * CIBufSize * sizeof(UChar); 1.2812 + ut = utext_setup(ut, extraSpace, status); 1.2813 + if (U_SUCCESS(*status)) { 1.2814 + ut->pFuncs = &charIterFuncs; 1.2815 + ut->context = ci; 1.2816 + ut->providerProperties = 0; 1.2817 + ut->a = ci->endIndex(); // Length of text 1.2818 + ut->p = ut->pExtra; // First buffer 1.2819 + ut->b = -1; // Native index of first buffer contents 1.2820 + ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer 1.2821 + ut->c = -1; // Native index of second buffer contents 1.2822 + 1.2823 + // Initialize current chunk contents to be empty. 1.2824 + // First access will fault something in. 1.2825 + // Note: The initial nativeStart and chunkOffset must sum to zero 1.2826 + // so that getNativeIndex() will correctly compute to zero 1.2827 + // if no call to Access() has ever been made. They can't be both 1.2828 + // zero without Access() thinking that the chunk is valid. 1.2829 + ut->chunkContents = (UChar *)ut->p; 1.2830 + ut->chunkNativeStart = -1; 1.2831 + ut->chunkOffset = 1; 1.2832 + ut->chunkNativeLimit = 0; 1.2833 + ut->chunkLength = 0; 1.2834 + ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing 1.2835 + } 1.2836 + return ut; 1.2837 +} 1.2838 + 1.2839 + 1.2840 +