intl/icu/source/common/utext.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 *******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2005-2012, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 *******************************************************************************
michael@0 8 * file name: utext.cpp
michael@0 9 * encoding: US-ASCII
michael@0 10 * tab size: 8 (not used)
michael@0 11 * indentation:4
michael@0 12 *
michael@0 13 * created on: 2005apr12
michael@0 14 * created by: Markus W. Scherer
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18 #include "unicode/ustring.h"
michael@0 19 #include "unicode/unistr.h"
michael@0 20 #include "unicode/chariter.h"
michael@0 21 #include "unicode/utext.h"
michael@0 22 #include "unicode/utf.h"
michael@0 23 #include "unicode/utf8.h"
michael@0 24 #include "unicode/utf16.h"
michael@0 25 #include "ustr_imp.h"
michael@0 26 #include "cmemory.h"
michael@0 27 #include "cstring.h"
michael@0 28 #include "uassert.h"
michael@0 29 #include "putilimp.h"
michael@0 30
michael@0 31 U_NAMESPACE_USE
michael@0 32
// Build an int32_t mask with only bit number `bitIndex` set.
#define I32_FLAG(bitIndex) (((int32_t)1) << (bitIndex))
michael@0 34
michael@0 35
michael@0 36 static UBool
michael@0 37 utext_access(UText *ut, int64_t index, UBool forward) {
michael@0 38 return ut->pFuncs->access(ut, index, forward);
michael@0 39 }
michael@0 40
michael@0 41
michael@0 42
michael@0 43 U_CAPI UBool U_EXPORT2
michael@0 44 utext_moveIndex32(UText *ut, int32_t delta) {
michael@0 45 UChar32 c;
michael@0 46 if (delta > 0) {
michael@0 47 do {
michael@0 48 if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
michael@0 49 return FALSE;
michael@0 50 }
michael@0 51 c = ut->chunkContents[ut->chunkOffset];
michael@0 52 if (U16_IS_SURROGATE(c)) {
michael@0 53 c = utext_next32(ut);
michael@0 54 if (c == U_SENTINEL) {
michael@0 55 return FALSE;
michael@0 56 }
michael@0 57 } else {
michael@0 58 ut->chunkOffset++;
michael@0 59 }
michael@0 60 } while(--delta>0);
michael@0 61
michael@0 62 } else if (delta<0) {
michael@0 63 do {
michael@0 64 if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
michael@0 65 return FALSE;
michael@0 66 }
michael@0 67 c = ut->chunkContents[ut->chunkOffset-1];
michael@0 68 if (U16_IS_SURROGATE(c)) {
michael@0 69 c = utext_previous32(ut);
michael@0 70 if (c == U_SENTINEL) {
michael@0 71 return FALSE;
michael@0 72 }
michael@0 73 } else {
michael@0 74 ut->chunkOffset--;
michael@0 75 }
michael@0 76 } while(++delta<0);
michael@0 77 }
michael@0 78
michael@0 79 return TRUE;
michael@0 80 }
michael@0 81
michael@0 82
michael@0 83 U_CAPI int64_t U_EXPORT2
michael@0 84 utext_nativeLength(UText *ut) {
michael@0 85 return ut->pFuncs->nativeLength(ut);
michael@0 86 }
michael@0 87
michael@0 88
michael@0 89 U_CAPI UBool U_EXPORT2
michael@0 90 utext_isLengthExpensive(const UText *ut) {
michael@0 91 UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
michael@0 92 return r;
michael@0 93 }
michael@0 94
michael@0 95
michael@0 96 U_CAPI int64_t U_EXPORT2
michael@0 97 utext_getNativeIndex(const UText *ut) {
michael@0 98 if(ut->chunkOffset <= ut->nativeIndexingLimit) {
michael@0 99 return ut->chunkNativeStart+ut->chunkOffset;
michael@0 100 } else {
michael@0 101 return ut->pFuncs->mapOffsetToNative(ut);
michael@0 102 }
michael@0 103 }
michael@0 104
michael@0 105
michael@0 106 U_CAPI void U_EXPORT2
michael@0 107 utext_setNativeIndex(UText *ut, int64_t index) {
michael@0 108 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
michael@0 109 // The desired position is outside of the current chunk.
michael@0 110 // Access the new position. Assume a forward iteration from here,
michael@0 111 // which will also be optimimum for a single random access.
michael@0 112 // Reverse iterations may suffer slightly.
michael@0 113 ut->pFuncs->access(ut, index, TRUE);
michael@0 114 } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
michael@0 115 // utf-16 indexing.
michael@0 116 ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
michael@0 117 } else {
michael@0 118 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
michael@0 119 }
michael@0 120 // The convention is that the index must always be on a code point boundary.
michael@0 121 // Adjust the index position if it is in the middle of a surrogate pair.
michael@0 122 if (ut->chunkOffset<ut->chunkLength) {
michael@0 123 UChar c= ut->chunkContents[ut->chunkOffset];
michael@0 124 if (U16_IS_TRAIL(c)) {
michael@0 125 if (ut->chunkOffset==0) {
michael@0 126 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
michael@0 127 }
michael@0 128 if (ut->chunkOffset>0) {
michael@0 129 UChar lead = ut->chunkContents[ut->chunkOffset-1];
michael@0 130 if (U16_IS_LEAD(lead)) {
michael@0 131 ut->chunkOffset--;
michael@0 132 }
michael@0 133 }
michael@0 134 }
michael@0 135 }
michael@0 136 }
michael@0 137
michael@0 138
michael@0 139
michael@0 140 U_CAPI int64_t U_EXPORT2
michael@0 141 utext_getPreviousNativeIndex(UText *ut) {
michael@0 142 //
michael@0 143 // Fast-path the common case.
michael@0 144 // Common means current position is not at the beginning of a chunk
michael@0 145 // and the preceding character is not supplementary.
michael@0 146 //
michael@0 147 int32_t i = ut->chunkOffset - 1;
michael@0 148 int64_t result;
michael@0 149 if (i >= 0) {
michael@0 150 UChar c = ut->chunkContents[i];
michael@0 151 if (U16_IS_TRAIL(c) == FALSE) {
michael@0 152 if (i <= ut->nativeIndexingLimit) {
michael@0 153 result = ut->chunkNativeStart + i;
michael@0 154 } else {
michael@0 155 ut->chunkOffset = i;
michael@0 156 result = ut->pFuncs->mapOffsetToNative(ut);
michael@0 157 ut->chunkOffset++;
michael@0 158 }
michael@0 159 return result;
michael@0 160 }
michael@0 161 }
michael@0 162
michael@0 163 // If at the start of text, simply return 0.
michael@0 164 if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
michael@0 165 return 0;
michael@0 166 }
michael@0 167
michael@0 168 // Harder, less common cases. We are at a chunk boundary, or on a surrogate.
michael@0 169 // Keep it simple, use other functions to handle the edges.
michael@0 170 //
michael@0 171 utext_previous32(ut);
michael@0 172 result = UTEXT_GETNATIVEINDEX(ut);
michael@0 173 utext_next32(ut);
michael@0 174 return result;
michael@0 175 }
michael@0 176
michael@0 177
michael@0 178 //
michael@0 179 // utext_current32. Get the UChar32 at the current position.
michael@0 180 // UText iteration position is always on a code point boundary,
michael@0 181 // never on the trail half of a surrogate pair.
michael@0 182 //
michael@0 183 U_CAPI UChar32 U_EXPORT2
michael@0 184 utext_current32(UText *ut) {
michael@0 185 UChar32 c;
michael@0 186 if (ut->chunkOffset==ut->chunkLength) {
michael@0 187 // Current position is just off the end of the chunk.
michael@0 188 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
michael@0 189 // Off the end of the text.
michael@0 190 return U_SENTINEL;
michael@0 191 }
michael@0 192 }
michael@0 193
michael@0 194 c = ut->chunkContents[ut->chunkOffset];
michael@0 195 if (U16_IS_LEAD(c) == FALSE) {
michael@0 196 // Normal, non-supplementary case.
michael@0 197 return c;
michael@0 198 }
michael@0 199
michael@0 200 //
michael@0 201 // Possible supplementary char.
michael@0 202 //
michael@0 203 UChar32 trail = 0;
michael@0 204 UChar32 supplementaryC = c;
michael@0 205 if ((ut->chunkOffset+1) < ut->chunkLength) {
michael@0 206 // The trail surrogate is in the same chunk.
michael@0 207 trail = ut->chunkContents[ut->chunkOffset+1];
michael@0 208 } else {
michael@0 209 // The trail surrogate is in a different chunk.
michael@0 210 // Because we must maintain the iteration position, we need to switch forward
michael@0 211 // into the new chunk, get the trail surrogate, then revert the chunk back to the
michael@0 212 // original one.
michael@0 213 // An edge case to be careful of: the entire text may end with an unpaired
michael@0 214 // leading surrogate. The attempt to access the trail will fail, but
michael@0 215 // the original position before the unpaired lead still needs to be restored.
michael@0 216 int64_t nativePosition = ut->chunkNativeLimit;
michael@0 217 int32_t originalOffset = ut->chunkOffset;
michael@0 218 if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
michael@0 219 trail = ut->chunkContents[ut->chunkOffset];
michael@0 220 }
michael@0 221 UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk
michael@0 222 U_ASSERT(r==TRUE);
michael@0 223 ut->chunkOffset = originalOffset;
michael@0 224 if(!r) {
michael@0 225 return U_SENTINEL;
michael@0 226 }
michael@0 227 }
michael@0 228
michael@0 229 if (U16_IS_TRAIL(trail)) {
michael@0 230 supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
michael@0 231 }
michael@0 232 return supplementaryC;
michael@0 233
michael@0 234 }
michael@0 235
michael@0 236
michael@0 237 U_CAPI UChar32 U_EXPORT2
michael@0 238 utext_char32At(UText *ut, int64_t nativeIndex) {
michael@0 239 UChar32 c = U_SENTINEL;
michael@0 240
michael@0 241 // Fast path the common case.
michael@0 242 if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
michael@0 243 ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
michael@0 244 c = ut->chunkContents[ut->chunkOffset];
michael@0 245 if (U16_IS_SURROGATE(c) == FALSE) {
michael@0 246 return c;
michael@0 247 }
michael@0 248 }
michael@0 249
michael@0 250
michael@0 251 utext_setNativeIndex(ut, nativeIndex);
michael@0 252 if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
michael@0 253 c = ut->chunkContents[ut->chunkOffset];
michael@0 254 if (U16_IS_SURROGATE(c)) {
michael@0 255 // For surrogates, let current32() deal with the complications
michael@0 256 // of supplementaries that may span chunk boundaries.
michael@0 257 c = utext_current32(ut);
michael@0 258 }
michael@0 259 }
michael@0 260 return c;
michael@0 261 }
michael@0 262
michael@0 263
michael@0 264 U_CAPI UChar32 U_EXPORT2
michael@0 265 utext_next32(UText *ut) {
michael@0 266 UChar32 c;
michael@0 267
michael@0 268 if (ut->chunkOffset >= ut->chunkLength) {
michael@0 269 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
michael@0 270 return U_SENTINEL;
michael@0 271 }
michael@0 272 }
michael@0 273
michael@0 274 c = ut->chunkContents[ut->chunkOffset++];
michael@0 275 if (U16_IS_LEAD(c) == FALSE) {
michael@0 276 // Normal case, not supplementary.
michael@0 277 // (A trail surrogate seen here is just returned as is, as a surrogate value.
michael@0 278 // It cannot be part of a pair.)
michael@0 279 return c;
michael@0 280 }
michael@0 281
michael@0 282 if (ut->chunkOffset >= ut->chunkLength) {
michael@0 283 if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
michael@0 284 // c is an unpaired lead surrogate at the end of the text.
michael@0 285 // return it as it is.
michael@0 286 return c;
michael@0 287 }
michael@0 288 }
michael@0 289 UChar32 trail = ut->chunkContents[ut->chunkOffset];
michael@0 290 if (U16_IS_TRAIL(trail) == FALSE) {
michael@0 291 // c was an unpaired lead surrogate, not at the end of the text.
michael@0 292 // return it as it is (unpaired). Iteration position is on the
michael@0 293 // following character, possibly in the next chunk, where the
michael@0 294 // trail surrogate would have been if it had existed.
michael@0 295 return c;
michael@0 296 }
michael@0 297
michael@0 298 UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
michael@0 299 ut->chunkOffset++; // move iteration position over the trail surrogate.
michael@0 300 return supplementary;
michael@0 301 }
michael@0 302
michael@0 303
michael@0 304 U_CAPI UChar32 U_EXPORT2
michael@0 305 utext_previous32(UText *ut) {
michael@0 306 UChar32 c;
michael@0 307
michael@0 308 if (ut->chunkOffset <= 0) {
michael@0 309 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
michael@0 310 return U_SENTINEL;
michael@0 311 }
michael@0 312 }
michael@0 313 ut->chunkOffset--;
michael@0 314 c = ut->chunkContents[ut->chunkOffset];
michael@0 315 if (U16_IS_TRAIL(c) == FALSE) {
michael@0 316 // Normal case, not supplementary.
michael@0 317 // (A lead surrogate seen here is just returned as is, as a surrogate value.
michael@0 318 // It cannot be part of a pair.)
michael@0 319 return c;
michael@0 320 }
michael@0 321
michael@0 322 if (ut->chunkOffset <= 0) {
michael@0 323 if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
michael@0 324 // c is an unpaired trail surrogate at the start of the text.
michael@0 325 // return it as it is.
michael@0 326 return c;
michael@0 327 }
michael@0 328 }
michael@0 329
michael@0 330 UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
michael@0 331 if (U16_IS_LEAD(lead) == FALSE) {
michael@0 332 // c was an unpaired trail surrogate, not at the end of the text.
michael@0 333 // return it as it is (unpaired). Iteration position is at c
michael@0 334 return c;
michael@0 335 }
michael@0 336
michael@0 337 UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
michael@0 338 ut->chunkOffset--; // move iteration position over the lead surrogate.
michael@0 339 return supplementary;
michael@0 340 }
michael@0 341
michael@0 342
michael@0 343
michael@0 344 U_CAPI UChar32 U_EXPORT2
michael@0 345 utext_next32From(UText *ut, int64_t index) {
michael@0 346 UChar32 c = U_SENTINEL;
michael@0 347
michael@0 348 if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
michael@0 349 // Desired position is outside of the current chunk.
michael@0 350 if(!ut->pFuncs->access(ut, index, TRUE)) {
michael@0 351 // no chunk available here
michael@0 352 return U_SENTINEL;
michael@0 353 }
michael@0 354 } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
michael@0 355 // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
michael@0 356 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
michael@0 357 } else {
michael@0 358 // Desired position is in chunk, with non-UTF16 indexing.
michael@0 359 ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
michael@0 360 }
michael@0 361
michael@0 362 c = ut->chunkContents[ut->chunkOffset++];
michael@0 363 if (U16_IS_SURROGATE(c)) {
michael@0 364 // Surrogates. Many edge cases. Use other functions that already
michael@0 365 // deal with the problems.
michael@0 366 utext_setNativeIndex(ut, index);
michael@0 367 c = utext_next32(ut);
michael@0 368 }
michael@0 369 return c;
michael@0 370 }
michael@0 371
michael@0 372
michael@0 373 U_CAPI UChar32 U_EXPORT2
michael@0 374 utext_previous32From(UText *ut, int64_t index) {
michael@0 375 //
michael@0 376 // Return the character preceding the specified index.
michael@0 377 // Leave the iteration position at the start of the character that was returned.
michael@0 378 //
michael@0 379 UChar32 cPrev; // The character preceding cCurr, which is what we will return.
michael@0 380
michael@0 381 // Address the chunk containg the position preceding the incoming index
michael@0 382 // A tricky edge case:
michael@0 383 // We try to test the requested native index against the chunkNativeStart to determine
michael@0 384 // whether the character preceding the one at the index is in the current chunk.
michael@0 385 // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
michael@0 386 // requested index is on something other than the first position of the first char.
michael@0 387 //
michael@0 388 if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
michael@0 389 // Requested native index is outside of the current chunk.
michael@0 390 if(!ut->pFuncs->access(ut, index, FALSE)) {
michael@0 391 // no chunk available here
michael@0 392 return U_SENTINEL;
michael@0 393 }
michael@0 394 } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
michael@0 395 // Direct UTF-16 indexing.
michael@0 396 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
michael@0 397 } else {
michael@0 398 ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
michael@0 399 if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
michael@0 400 // no chunk available here
michael@0 401 return U_SENTINEL;
michael@0 402 }
michael@0 403 }
michael@0 404
michael@0 405 //
michael@0 406 // Simple case with no surrogates.
michael@0 407 //
michael@0 408 ut->chunkOffset--;
michael@0 409 cPrev = ut->chunkContents[ut->chunkOffset];
michael@0 410
michael@0 411 if (U16_IS_SURROGATE(cPrev)) {
michael@0 412 // Possible supplementary. Many edge cases.
michael@0 413 // Let other functions do the heavy lifting.
michael@0 414 utext_setNativeIndex(ut, index);
michael@0 415 cPrev = utext_previous32(ut);
michael@0 416 }
michael@0 417 return cPrev;
michael@0 418 }
michael@0 419
michael@0 420
michael@0 421 U_CAPI int32_t U_EXPORT2
michael@0 422 utext_extract(UText *ut,
michael@0 423 int64_t start, int64_t limit,
michael@0 424 UChar *dest, int32_t destCapacity,
michael@0 425 UErrorCode *status) {
michael@0 426 return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
michael@0 427 }
michael@0 428
michael@0 429
michael@0 430
michael@0 431 U_CAPI UBool U_EXPORT2
michael@0 432 utext_equals(const UText *a, const UText *b) {
michael@0 433 if (a==NULL || b==NULL ||
michael@0 434 a->magic != UTEXT_MAGIC ||
michael@0 435 b->magic != UTEXT_MAGIC) {
michael@0 436 // Null or invalid arguments don't compare equal to anything.
michael@0 437 return FALSE;
michael@0 438 }
michael@0 439
michael@0 440 if (a->pFuncs != b->pFuncs) {
michael@0 441 // Different types of text providers.
michael@0 442 return FALSE;
michael@0 443 }
michael@0 444
michael@0 445 if (a->context != b->context) {
michael@0 446 // Different sources (different strings)
michael@0 447 return FALSE;
michael@0 448 }
michael@0 449 if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
michael@0 450 // Different current position in the string.
michael@0 451 return FALSE;
michael@0 452 }
michael@0 453
michael@0 454 return TRUE;
michael@0 455 }
michael@0 456
michael@0 457 U_CAPI UBool U_EXPORT2
michael@0 458 utext_isWritable(const UText *ut)
michael@0 459 {
michael@0 460 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
michael@0 461 return b;
michael@0 462 }
michael@0 463
michael@0 464
michael@0 465 U_CAPI void U_EXPORT2
michael@0 466 utext_freeze(UText *ut) {
michael@0 467 // Zero out the WRITABLE flag.
michael@0 468 ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
michael@0 469 }
michael@0 470
michael@0 471
michael@0 472 U_CAPI UBool U_EXPORT2
michael@0 473 utext_hasMetaData(const UText *ut)
michael@0 474 {
michael@0 475 UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
michael@0 476 return b;
michael@0 477 }
michael@0 478
michael@0 479
michael@0 480
michael@0 481 U_CAPI int32_t U_EXPORT2
michael@0 482 utext_replace(UText *ut,
michael@0 483 int64_t nativeStart, int64_t nativeLimit,
michael@0 484 const UChar *replacementText, int32_t replacementLength,
michael@0 485 UErrorCode *status)
michael@0 486 {
michael@0 487 if (U_FAILURE(*status)) {
michael@0 488 return 0;
michael@0 489 }
michael@0 490 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
michael@0 491 *status = U_NO_WRITE_PERMISSION;
michael@0 492 return 0;
michael@0 493 }
michael@0 494 int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
michael@0 495 return i;
michael@0 496 }
michael@0 497
michael@0 498 U_CAPI void U_EXPORT2
michael@0 499 utext_copy(UText *ut,
michael@0 500 int64_t nativeStart, int64_t nativeLimit,
michael@0 501 int64_t destIndex,
michael@0 502 UBool move,
michael@0 503 UErrorCode *status)
michael@0 504 {
michael@0 505 if (U_FAILURE(*status)) {
michael@0 506 return;
michael@0 507 }
michael@0 508 if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
michael@0 509 *status = U_NO_WRITE_PERMISSION;
michael@0 510 return;
michael@0 511 }
michael@0 512 ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
michael@0 513 }
michael@0 514
michael@0 515
michael@0 516
michael@0 517 U_CAPI UText * U_EXPORT2
michael@0 518 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
michael@0 519 UText *result;
michael@0 520 result = src->pFuncs->clone(dest, src, deep, status);
michael@0 521 if (readOnly) {
michael@0 522 utext_freeze(result);
michael@0 523 }
michael@0 524 return result;
michael@0 525 }
michael@0 526
michael@0 527
michael@0 528
michael@0 529 //------------------------------------------------------------------------------
michael@0 530 //
michael@0 531 // UText common functions implementation
michael@0 532 //
michael@0 533 //------------------------------------------------------------------------------
michael@0 534
//
//  Bit masks used in UText.flags
//
enum {
    // Set when ICU allocated the UText struct itself on the heap;
    // clear when the caller supplied the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 1 << 0,

    // Set when ICU allocated the extra storage as a separate heap block;
    // clear when there is no separate allocation, either because no extra
    // storage was requested or because it is appended to the end of the
    // main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 1 << 1,

    // Set while this UText is open; clear when it is not open.
    UTEXT_OPEN                 = 1 << 2
};
michael@0 551
michael@0 552
michael@0 553 //
michael@0 554 // Extended form of a UText. The purpose is to aid in computing the total size required
michael@0 555 // when a provider asks for a UText to be allocated with extra storage.
michael@0 556
michael@0 557 struct ExtendedUText {
michael@0 558 UText ut;
michael@0 559 UAlignedMemory extension;
michael@0 560 };
michael@0 561
michael@0 562 static const UText emptyText = UTEXT_INITIALIZER;
michael@0 563
michael@0 564 U_CAPI UText * U_EXPORT2
michael@0 565 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
michael@0 566 if (U_FAILURE(*status)) {
michael@0 567 return ut;
michael@0 568 }
michael@0 569
michael@0 570 if (ut == NULL) {
michael@0 571 // We need to heap-allocate storage for the new UText
michael@0 572 int32_t spaceRequired = sizeof(UText);
michael@0 573 if (extraSpace > 0) {
michael@0 574 spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
michael@0 575 }
michael@0 576 ut = (UText *)uprv_malloc(spaceRequired);
michael@0 577 if (ut == NULL) {
michael@0 578 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 579 return NULL;
michael@0 580 } else {
michael@0 581 *ut = emptyText;
michael@0 582 ut->flags |= UTEXT_HEAP_ALLOCATED;
michael@0 583 if (spaceRequired>0) {
michael@0 584 ut->extraSize = extraSpace;
michael@0 585 ut->pExtra = &((ExtendedUText *)ut)->extension;
michael@0 586 }
michael@0 587 }
michael@0 588 } else {
michael@0 589 // We have been supplied with an already existing UText.
michael@0 590 // Verify that it really appears to be a UText.
michael@0 591 if (ut->magic != UTEXT_MAGIC) {
michael@0 592 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 593 return ut;
michael@0 594 }
michael@0 595 // If the ut is already open and there's a provider supplied close
michael@0 596 // function, call it.
michael@0 597 if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) {
michael@0 598 ut->pFuncs->close(ut);
michael@0 599 }
michael@0 600 ut->flags &= ~UTEXT_OPEN;
michael@0 601
michael@0 602 // If extra space was requested by our caller, check whether
michael@0 603 // sufficient already exists, and allocate new if needed.
michael@0 604 if (extraSpace > ut->extraSize) {
michael@0 605 // Need more space. If there is existing separately allocated space,
michael@0 606 // delete it first, then allocate new space.
michael@0 607 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
michael@0 608 uprv_free(ut->pExtra);
michael@0 609 ut->extraSize = 0;
michael@0 610 }
michael@0 611 ut->pExtra = uprv_malloc(extraSpace);
michael@0 612 if (ut->pExtra == NULL) {
michael@0 613 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 614 } else {
michael@0 615 ut->extraSize = extraSpace;
michael@0 616 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
michael@0 617 }
michael@0 618 }
michael@0 619 }
michael@0 620 if (U_SUCCESS(*status)) {
michael@0 621 ut->flags |= UTEXT_OPEN;
michael@0 622
michael@0 623 // Initialize all remaining fields of the UText.
michael@0 624 //
michael@0 625 ut->context = NULL;
michael@0 626 ut->chunkContents = NULL;
michael@0 627 ut->p = NULL;
michael@0 628 ut->q = NULL;
michael@0 629 ut->r = NULL;
michael@0 630 ut->a = 0;
michael@0 631 ut->b = 0;
michael@0 632 ut->c = 0;
michael@0 633 ut->chunkOffset = 0;
michael@0 634 ut->chunkLength = 0;
michael@0 635 ut->chunkNativeStart = 0;
michael@0 636 ut->chunkNativeLimit = 0;
michael@0 637 ut->nativeIndexingLimit = 0;
michael@0 638 ut->providerProperties = 0;
michael@0 639 ut->privA = 0;
michael@0 640 ut->privB = 0;
michael@0 641 ut->privC = 0;
michael@0 642 ut->privP = NULL;
michael@0 643 if (ut->pExtra!=NULL && ut->extraSize>0)
michael@0 644 uprv_memset(ut->pExtra, 0, ut->extraSize);
michael@0 645
michael@0 646 }
michael@0 647 return ut;
michael@0 648 }
michael@0 649
michael@0 650
michael@0 651 U_CAPI UText * U_EXPORT2
michael@0 652 utext_close(UText *ut) {
michael@0 653 if (ut==NULL ||
michael@0 654 ut->magic != UTEXT_MAGIC ||
michael@0 655 (ut->flags & UTEXT_OPEN) == 0)
michael@0 656 {
michael@0 657 // The supplied ut is not an open UText.
michael@0 658 // Do nothing.
michael@0 659 return ut;
michael@0 660 }
michael@0 661
michael@0 662 // If the provider gave us a close function, call it now.
michael@0 663 // This will clean up anything allocated specifically by the provider.
michael@0 664 if (ut->pFuncs->close != NULL) {
michael@0 665 ut->pFuncs->close(ut);
michael@0 666 }
michael@0 667 ut->flags &= ~UTEXT_OPEN;
michael@0 668
michael@0 669 // If we (the framework) allocated the UText or subsidiary storage,
michael@0 670 // delete it.
michael@0 671 if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
michael@0 672 uprv_free(ut->pExtra);
michael@0 673 ut->pExtra = NULL;
michael@0 674 ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
michael@0 675 ut->extraSize = 0;
michael@0 676 }
michael@0 677
michael@0 678 // Zero out function table of the closed UText. This is a defensive move,
michael@0 679 // inteded to cause applications that inadvertantly use a closed
michael@0 680 // utext to crash with null pointer errors.
michael@0 681 ut->pFuncs = NULL;
michael@0 682
michael@0 683 if (ut->flags & UTEXT_HEAP_ALLOCATED) {
michael@0 684 // This UText was allocated by UText setup. We need to free it.
michael@0 685 // Clear magic, so we can detect if the user messes up and immediately
michael@0 686 // tries to reopen another UText using the deleted storage.
michael@0 687 ut->magic = 0;
michael@0 688 uprv_free(ut);
michael@0 689 ut = NULL;
michael@0 690 }
michael@0 691 return ut;
michael@0 692 }
michael@0 693
michael@0 694
michael@0 695
michael@0 696
michael@0 697 //
michael@0 698 // invalidateChunk Reset a chunk to have no contents, so that the next call
michael@0 699 // to access will cause new data to load.
michael@0 700 // This is needed when copy/move/replace operate directly on the
michael@0 701 // backing text, potentially putting it out of sync with the
michael@0 702 // contents in the chunk.
michael@0 703 //
michael@0 704 static void
michael@0 705 invalidateChunk(UText *ut) {
michael@0 706 ut->chunkLength = 0;
michael@0 707 ut->chunkNativeLimit = 0;
michael@0 708 ut->chunkNativeStart = 0;
michael@0 709 ut->chunkOffset = 0;
michael@0 710 ut->nativeIndexingLimit = 0;
michael@0 711 }
michael@0 712
//
// pinIndex        Clamp a native index parameter into the range [0, limit].
//                 The 64 bit argument is updated in place; a negative index
//                 is pinned to 0 before the upper limit is considered.
//                 The pinned value truncated to 32 bits is returned as a
//                 convenience for providers that don't need 64 bits.
//
static int32_t
pinIndex(int64_t &index, int64_t limit) {
    index = (index < 0) ? 0
          : (index > limit) ? limit
          : index;
    return (int32_t)index;
}
michael@0 727
michael@0 728
michael@0 729 U_CDECL_BEGIN
michael@0 730
michael@0 731 //
michael@0 732 // Pointer relocation function,
michael@0 733 // a utility used by shallow clone.
michael@0 734 // Adjust a pointer that refers to something within one UText (the source)
michael@0 735 // to refer to the same relative offset within a another UText (the target)
michael@0 736 //
michael@0 737 static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
michael@0 738 // convert all pointers to (char *) so that byte address arithmetic will work.
michael@0 739 char *dptr = (char *)*destPtr;
michael@0 740 char *dUText = (char *)dest;
michael@0 741 char *sUText = (char *)src;
michael@0 742
michael@0 743 if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
michael@0 744 // target ptr was to something within the src UText's pExtra storage.
michael@0 745 // relocate it into the target UText's pExtra region.
michael@0 746 *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
michael@0 747 } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
michael@0 748 // target ptr was pointing to somewhere within the source UText itself.
michael@0 749 // Move it to the same offset within the target UText.
michael@0 750 *destPtr = dUText + (dptr-sUText);
michael@0 751 }
michael@0 752 }
michael@0 753
michael@0 754
michael@0 755 //
michael@0 756 // Clone. This is a generic copy-the-utext-by-value clone function that can be
michael@0 757 // used as-is with some utext types, and as a helper by other clones.
michael@0 758 //
michael@0 759 static UText * U_CALLCONV
michael@0 760 shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
michael@0 761 if (U_FAILURE(*status)) {
michael@0 762 return NULL;
michael@0 763 }
michael@0 764 int32_t srcExtraSize = src->extraSize;
michael@0 765
michael@0 766 //
michael@0 767 // Use the generic text_setup to allocate storage if required.
michael@0 768 //
michael@0 769 dest = utext_setup(dest, srcExtraSize, status);
michael@0 770 if (U_FAILURE(*status)) {
michael@0 771 return dest;
michael@0 772 }
michael@0 773
michael@0 774 //
michael@0 775 // flags (how the UText was allocated) and the pointer to the
michael@0 776 // extra storage must retain the values in the cloned utext that
michael@0 777 // were set up by utext_setup. Save them separately before
michael@0 778 // copying the whole struct.
michael@0 779 //
michael@0 780 void *destExtra = dest->pExtra;
michael@0 781 int32_t flags = dest->flags;
michael@0 782
michael@0 783
michael@0 784 //
michael@0 785 // Copy the whole UText struct by value.
michael@0 786 // Any "Extra" storage is copied also.
michael@0 787 //
michael@0 788 int sizeToCopy = src->sizeOfStruct;
michael@0 789 if (sizeToCopy > dest->sizeOfStruct) {
michael@0 790 sizeToCopy = dest->sizeOfStruct;
michael@0 791 }
michael@0 792 uprv_memcpy(dest, src, sizeToCopy);
michael@0 793 dest->pExtra = destExtra;
michael@0 794 dest->flags = flags;
michael@0 795 if (srcExtraSize > 0) {
michael@0 796 uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
michael@0 797 }
michael@0 798
michael@0 799 //
michael@0 800 // Relocate any pointers in the target that refer to the UText itself
michael@0 801 // to point to the cloned copy rather than the original source.
michael@0 802 //
michael@0 803 adjustPointer(dest, &dest->context, src);
michael@0 804 adjustPointer(dest, &dest->p, src);
michael@0 805 adjustPointer(dest, &dest->q, src);
michael@0 806 adjustPointer(dest, &dest->r, src);
michael@0 807 adjustPointer(dest, (const void **)&dest->chunkContents, src);
michael@0 808
michael@0 809 return dest;
michael@0 810 }
michael@0 811
michael@0 812
michael@0 813 U_CDECL_END
michael@0 814
michael@0 815
michael@0 816
michael@0 817 //------------------------------------------------------------------------------
michael@0 818 //
michael@0 819 // UText implementation for UTF-8 char * strings (read-only)
michael@0 820 // Limitation: string length must be <= 0x7fffffff.
michael@0 821 // (length must fit in an int32_t variable)
michael@0 822 //
michael@0 823 // Use of UText data members:
michael@0 824 // context pointer to UTF-8 string
michael@0 825 // utext.b is the input string length (bytes).
michael@0 826 // utext.c Length scanned so far in string
michael@0 827 // (for optimizing finding length of zero terminated strings.)
michael@0 828 // utext.p pointer to the current buffer
michael@0 829 // utext.q pointer to the other buffer.
michael@0 830 //
michael@0 831 //------------------------------------------------------------------------------
michael@0 832
michael@0 833 // Chunk size: the number of UChars that a forward fill aims to place in one chunk buffer.
michael@0 834 // Must be less than 85, because of byte mapping from UChar indexes to native indexes.
michael@0 835 // Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes
michael@0 836 // to two UChars.)
michael@0 837 //
michael@0 838 enum { UTF8_TEXT_CHUNK_SIZE=32 };
michael@0 839
michael@0 840 //
michael@0 841 // UTF8Buf Two of these structs will be set up in the UText's extra allocated space.
michael@0 842 // Each contains the UChar chunk buffer, the to and from native maps, and
michael@0 843 // header info.
michael@0 844 //
michael@0 845 // Because backwards iteration fills the buffers starting at the end and
michael@0 846 // working towards the front, the filled part of the buffers may not begin
michael@0 847 // at the start of the available storage for the buffers.
michael@0 848 //
michael@0 849 // The buffers are declared larger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
michael@0 850 // the last character added being a supplementary, and thus requiring a surrogate
michael@0 851 // pair. Doing this is simpler than checking for the edge case.
michael@0 852 //
michael@0 853
michael@0 854 struct UTF8Buf {
michael@0 855 int32_t bufNativeStart; // Native index of first char in UChar buf
michael@0 856 int32_t bufNativeLimit; // Native index following last char in buf.
michael@0 857 int32_t bufStartIdx; // First filled position in buf.
michael@0 858 int32_t bufLimitIdx; // Limit of filled range in buf.
michael@0 859 int32_t bufNILimit; // Limit of native indexing part of buf
michael@0 860 int32_t toUCharsMapStart; // Native index corresponding to
michael@0 861 // mapToUChars[0].
michael@0 862 // Set to bufNativeStart when filling forwards.
michael@0 863 // Set to computed value when filling backwards.
michael@0 864
michael@0 865 UChar buf[UTF8_TEXT_CHUNK_SIZE+4]; // The UChar buffer. Requires extra positions beyond
michael@0 866 // the chunk size, to allow for a surrogate at the end.
michael@0 867 // Length must be identical to mapToNative array, below,
michael@0 868 // because of the way indexing works when the array is
michael@0 869 // filled backwards during a reverse iteration. Thus,
michael@0 870 // the additional extra size.
michael@0 871 uint8_t mapToNative[UTF8_TEXT_CHUNK_SIZE+4]; // map UChar index in buf to
michael@0 872 // native offset from bufNativeStart.
michael@0 873 // Requires extra slots, including
michael@0 874 // one for a supplementary starting in the last normal position,
michael@0 875 // and one for an entry for the buffer limit position.
michael@0 876 uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from toUCharsMapStart to
michael@0 877 // corresponding index in buf.
michael@0 878 int32_t align; // NOTE(review): not referenced in this section; presumably padding for alignment - confirm.
michael@0 879 };
michael@0 880
michael@0 881 U_CDECL_BEGIN
michael@0 882
michael@0 883 //
michael@0 884 // utf8TextLength
michael@0 885 //
michael@0 886 // Get the length of the string. If we don't already know it,
michael@0 887 // we'll need to scan for the trailing nul.
michael@0 888 //
michael@0 889 static int64_t U_CALLCONV
michael@0 890 utf8TextLength(UText *ut) {
michael@0 891 if (ut->b < 0) {
michael@0 892 // Zero terminated string, and we haven't scanned to the end yet.
michael@0 893 // Scan it now.
michael@0 894 const char *r = (const char *)ut->context + ut->c;
michael@0 895 while (*r != 0) {
michael@0 896 r++;
michael@0 897 }
michael@0 898 if ((r - (const char *)ut->context) < 0x7fffffff) {
michael@0 899 ut->b = (int32_t)(r - (const char *)ut->context);
michael@0 900 } else {
michael@0 901 // Actual string was bigger (more than 2 gig) than we
michael@0 902 // can handle. Clip it to 2 GB.
michael@0 903 ut->b = 0x7fffffff;
michael@0 904 }
michael@0 905 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 906 }
michael@0 907 return ut->b;
michael@0 908 }
michael@0 909
michael@0 910 //
michael@0 911 // utf8TextAccess: make the current chunk cover native (UTF-8 byte) index "index" for iteration in the direction given by "forward".
michael@0 912 // Reuses the current buffer (ut->p) or the alternate buffer (ut->q) when one of them already covers the index; otherwise refills the alternate buffer and makes it the current one.
michael@0 913 // Returns TRUE when there is text to iterate over in the requested direction, FALSE otherwise (start or end of string); ut->chunkOffset is set in both cases.
michael@0 914 //
michael@0 915 static UBool U_CALLCONV
michael@0 916 utf8TextAccess(UText *ut, int64_t index, UBool forward) {
michael@0 917 //
michael@0 918 // Apologies to those who are allergic to goto statements.
michael@0 919 // Consider each goto to a labelled block to be the equivalent of
michael@0 920 // call the named block as if it were a function();
michael@0 921 // return;
michael@0 922 //
michael@0 923 const uint8_t *s8=(const uint8_t *)ut->context;
michael@0 924 UTF8Buf *u8b = NULL;
michael@0 925 int32_t length = ut->b; // Length of original utf-8; negative if not yet known.
michael@0 926 int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits.
michael@0 927 int32_t mapIndex = 0;
michael@0 928 if (index<0) {
michael@0 929 ix=0;
michael@0 930 } else if (index > 0x7fffffff) {
michael@0 931 // Strings with 64 bit lengths not supported by this UTF-8 provider.
michael@0 932 ix = 0x7fffffff;
michael@0 933 }
michael@0 934
michael@0 935 // Pin requested index to the string length.
michael@0 936 if (ix>length) {
michael@0 937 if (length>=0) {
michael@0 938 ix=length;
michael@0 939 } else if (ix>=ut->c) {
michael@0 940 // Zero terminated string, and requested index is beyond
michael@0 941 // the region that has already been scanned.
michael@0 942 // Scan up to either the end of the string or to the
michael@0 943 // requested position, whichever comes first.
michael@0 944 while (ut->c<ix && s8[ut->c]!=0) {
michael@0 945 ut->c++;
michael@0 946 }
michael@0 947 // TODO: support for null terminated string length > 32 bits.
michael@0 948 if (s8[ut->c] == 0) {
michael@0 949 // We just found the actual length of the string.
michael@0 950 // Trim the requested index back to that.
michael@0 951 ix = ut->c;
michael@0 952 ut->b = ut->c;
michael@0 953 length = ut->c;
michael@0 954 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 955 }
michael@0 956 }
michael@0 957 }
michael@0 958
michael@0 959 //
michael@0 960 // Dispatch to the appropriate action for a forward iteration request.
michael@0 961 //
michael@0 962 if (forward) {
michael@0 963 if (ix==ut->chunkNativeLimit) {
michael@0 964 // Check for normal sequential iteration cases first.
michael@0 965 if (ix==length) {
michael@0 966 // Just reached end of string
michael@0 967 // Don't swap buffers, but do set the
michael@0 968 // current buffer position.
michael@0 969 ut->chunkOffset = ut->chunkLength;
michael@0 970 return FALSE;
michael@0 971 } else {
michael@0 972 // End of current buffer.
michael@0 973 // check whether other buffer already has what we need.
michael@0 974 UTF8Buf *altB = (UTF8Buf *)ut->q;
michael@0 975 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
michael@0 976 goto swapBuffers;
michael@0 977 }
michael@0 978 }
michael@0 979 }
michael@0 980
michael@0 981 // A random access. Desired index could be in either or neither buf.
michael@0 982 // For optimizing the order of testing, first check for the index
michael@0 983 // being in the other buffer. This will be the case for uses that
michael@0 984 // move back and forth over a fairly limited range
michael@0 985 {
michael@0 986 u8b = (UTF8Buf *)ut->q; // the alternate buffer
michael@0 987 if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
michael@0 988 // Requested index is in the other buffer.
michael@0 989 goto swapBuffers;
michael@0 990 }
michael@0 991 if (ix == length) {
michael@0 992 // Requested index is end-of-string.
michael@0 993 // (this is the case of randomly seeking to the end.
michael@0 994 // The case of iterating off the end is handled earlier.)
michael@0 995 if (ix == ut->chunkNativeLimit) {
michael@0 996 // Current buffer extends up to the end of the string.
michael@0 997 // Leave it as the current buffer.
michael@0 998 ut->chunkOffset = ut->chunkLength;
michael@0 999 return FALSE;
michael@0 1000 }
michael@0 1001 if (ix == u8b->bufNativeLimit) {
michael@0 1002 // Alternate buffer extends to the end of string.
michael@0 1003 // Swap it in as the current buffer.
michael@0 1004 goto swapBuffersAndFail;
michael@0 1005 }
michael@0 1006
michael@0 1007 // Neither existing buffer extends to the end of the string.
michael@0 1008 goto makeStubBuffer;
michael@0 1009 }
michael@0 1010
michael@0 1011 if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
michael@0 1012 // Requested index is in neither buffer.
michael@0 1013 goto fillForward;
michael@0 1014 }
michael@0 1015
michael@0 1016 // Requested index is in this buffer.
michael@0 1017 u8b = (UTF8Buf *)ut->p; // the current buffer
michael@0 1018 mapIndex = ix - u8b->toUCharsMapStart;
michael@0 1019 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
michael@0 1020 return TRUE;
michael@0 1021
michael@0 1022 }
michael@0 1023 }
michael@0 1024
michael@0 1025
michael@0 1026 //
michael@0 1027 // Dispatch to the appropriate action for a
michael@0 1028 // Backwards Direction iteration request.
michael@0 1029 //
michael@0 1030 if (ix==ut->chunkNativeStart) {
michael@0 1031 // Check for normal sequential iteration cases first.
michael@0 1032 if (ix==0) {
michael@0 1033 // Just reached the start of string
michael@0 1034 // Don't swap buffers, but do set the
michael@0 1035 // current buffer position.
michael@0 1036 ut->chunkOffset = 0;
michael@0 1037 return FALSE;
michael@0 1038 } else {
michael@0 1039 // Start of current buffer.
michael@0 1040 // check whether other buffer already has what we need.
michael@0 1041 UTF8Buf *altB = (UTF8Buf *)ut->q;
michael@0 1042 if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
michael@0 1043 goto swapBuffers;
michael@0 1044 }
michael@0 1045 }
michael@0 1046 }
michael@0 1047
michael@0 1048 // A random access. Desired index could be in either or neither buf.
michael@0 1049 // For optimizing the order of testing,
michael@0 1050 // Most likely case: in the other buffer.
michael@0 1051 // Second most likely: in neither buffer.
michael@0 1052 // Unlikely, but must work: in the current buffer.
michael@0 1053 u8b = (UTF8Buf *)ut->q; // the alternate buffer
michael@0 1054 if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
michael@0 1055 // Requested index is in the other buffer.
michael@0 1056 goto swapBuffers;
michael@0 1057 }
michael@0 1058 // Requested index is start-of-string.
michael@0 1059 // (this is the case of randomly seeking to the start.
michael@0 1060 // The case of iterating off the start is handled earlier.)
michael@0 1061 if (ix==0) {
michael@0 1062 if (u8b->bufNativeStart==0) {
michael@0 1063 // Alternate buffer contains the data for the start string.
michael@0 1064 // Make it be the current buffer.
michael@0 1065 goto swapBuffersAndFail;
michael@0 1066 } else {
michael@0 1067 // Request for data before the start of string,
michael@0 1068 // neither buffer is usable.
michael@0 1069 // set up a zero-length buffer.
michael@0 1070 goto makeStubBuffer;
michael@0 1071 }
michael@0 1072 }
michael@0 1073
michael@0 1074 if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
michael@0 1075 // Requested index is in neither buffer.
michael@0 1076 goto fillReverse;
michael@0 1077 }
michael@0 1078
michael@0 1079 // Requested index is in this buffer.
michael@0 1080 // Set the utf16 buffer index.
michael@0 1081 u8b = (UTF8Buf *)ut->p;
michael@0 1082 mapIndex = ix - u8b->toUCharsMapStart;
michael@0 1083 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
michael@0 1084 if (ut->chunkOffset==0) {
michael@0 1085 // This occurs when the first character in the text is
michael@0 1086 // a multi-byte UTF-8 char, and the requested index is to
michael@0 1087 // one of the trailing bytes. Because there is no preceding
michael@0 1088 // character, this access fails. We can't pick up on the
michael@0 1089 // situation sooner because the requested index is not zero.
michael@0 1090 return FALSE;
michael@0 1091 } else {
michael@0 1092 return TRUE;
michael@0 1093 }
michael@0 1094
michael@0 1095
michael@0 1096
michael@0 1097 swapBuffers:
michael@0 1098 // The alternate buffer (ut->q) has the string data that was requested.
michael@0 1099 // Swap the primary and alternate buffers, and set the
michael@0 1100 // chunk index into the new primary buffer.
michael@0 1101 {
michael@0 1102 u8b = (UTF8Buf *)ut->q;
michael@0 1103 ut->q = ut->p;
michael@0 1104 ut->p = u8b;
michael@0 1105 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
michael@0 1106 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
michael@0 1107 ut->chunkNativeStart = u8b->bufNativeStart;
michael@0 1108 ut->chunkNativeLimit = u8b->bufNativeLimit;
michael@0 1109 ut->nativeIndexingLimit = u8b->bufNILimit;
michael@0 1110
michael@0 1111 // Index into the (now current) chunk
michael@0 1112 // Use the map to set the chunk index. It's more trouble than it's worth
michael@0 1113 // to check whether native indexing can be used.
michael@0 1114 U_ASSERT(ix>=u8b->bufNativeStart);
michael@0 1115 U_ASSERT(ix<=u8b->bufNativeLimit);
michael@0 1116 mapIndex = ix - u8b->toUCharsMapStart;
michael@0 1117 U_ASSERT(mapIndex>=0);
michael@0 1118 U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
michael@0 1119 ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
michael@0 1120
michael@0 1121 return TRUE;
michael@0 1122 }
michael@0 1123
michael@0 1124
michael@0 1125 swapBuffersAndFail:
michael@0 1126 // We got a request for either the start or end of the string,
michael@0 1127 // with iteration continuing in the out-of-bounds direction.
michael@0 1128 // The alternate buffer already contains the data up to the
michael@0 1129 // start/end.
michael@0 1130 // Swap the buffers, then return failure, indicating that we couldn't
michael@0 1131 // make things correct for continuing the iteration in the requested
michael@0 1132 // direction. The position & buffer are correct should the
michael@0 1133 // user decide to iterate in the opposite direction.
michael@0 1134 u8b = (UTF8Buf *)ut->q;
michael@0 1135 ut->q = ut->p;
michael@0 1136 ut->p = u8b;
michael@0 1137 ut->chunkContents = &u8b->buf[u8b->bufStartIdx];
michael@0 1138 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
michael@0 1139 ut->chunkNativeStart = u8b->bufNativeStart;
michael@0 1140 ut->chunkNativeLimit = u8b->bufNativeLimit;
michael@0 1141 ut->nativeIndexingLimit = u8b->bufNILimit;
michael@0 1142
michael@0 1143 // Index into the (now current) chunk
michael@0 1144 // For this function (swapBuffersAndFail), the requested index
michael@0 1145 // will always be at either the start or end of the chunk.
michael@0 1146 if (ix==u8b->bufNativeLimit) {
michael@0 1147 ut->chunkOffset = ut->chunkLength;
michael@0 1148 } else {
michael@0 1149 ut->chunkOffset = 0;
michael@0 1150 U_ASSERT(ix == u8b->bufNativeStart);
michael@0 1151 }
michael@0 1152 return FALSE;
michael@0 1153
michael@0 1154 makeStubBuffer:
michael@0 1155 // The user has done a seek/access past the start or end
michael@0 1156 // of the string. Rather than loading data that is likely
michael@0 1157 // to never be used, just set up a zero-length buffer at
michael@0 1158 // the position.
michael@0 1159 u8b = (UTF8Buf *)ut->q;
michael@0 1160 u8b->bufNativeStart = ix;
michael@0 1161 u8b->bufNativeLimit = ix;
michael@0 1162 u8b->bufStartIdx = 0;
michael@0 1163 u8b->bufLimitIdx = 0;
michael@0 1164 u8b->bufNILimit = 0;
michael@0 1165 u8b->toUCharsMapStart = ix;
michael@0 1166 u8b->mapToNative[0] = 0;
michael@0 1167 u8b->mapToUChars[0] = 0;
michael@0 1168 goto swapBuffersAndFail;
michael@0 1169
michael@0 1170
michael@0 1171 // fillForward: the requested index is in neither buffer. Decode a new chunk that starts at ix into the alternate buffer and make it the current buffer.
michael@0 1172 fillForward:
michael@0 1173 {
michael@0 1174 // Move the incoming index to a code point boundary.
michael@0 1175 U8_SET_CP_START(s8, 0, ix);
michael@0 1176
michael@0 1177 // Swap the UText buffers.
michael@0 1178 // We want to fill what was previously the alternate buffer,
michael@0 1179 // and make what was the current buffer be the new alternate.
michael@0 1180 UTF8Buf *u8b = (UTF8Buf *)ut->q; // (shadows the function-level u8b)
michael@0 1181 ut->q = ut->p;
michael@0 1182 ut->p = u8b;
michael@0 1183
michael@0 1184 int32_t strLen = ut->b;
michael@0 1185 UBool nulTerminated = FALSE;
michael@0 1186 if (strLen < 0) {
michael@0 1187 strLen = 0x7fffffff;
michael@0 1188 nulTerminated = TRUE;
michael@0 1189 }
michael@0 1190
michael@0 1191 UChar *buf = u8b->buf;
michael@0 1192 uint8_t *mapToNative = u8b->mapToNative;
michael@0 1193 uint8_t *mapToUChars = u8b->mapToUChars;
michael@0 1194 int32_t destIx = 0;
michael@0 1195 int32_t srcIx = ix;
michael@0 1196 UBool seenNonAscii = FALSE;
michael@0 1197 UChar32 c = 0;
michael@0 1198
michael@0 1199 // Fill the chunk buffer and mapping arrays.
michael@0 1200 while (destIx<UTF8_TEXT_CHUNK_SIZE) {
michael@0 1201 c = s8[srcIx];
michael@0 1202 if (c>0 && c<0x80) {
michael@0 1203 // Special case ASCII range for speed.
michael@0 1204 // zero is excluded to simplify bounds checking.
michael@0 1205 buf[destIx] = (UChar)c;
michael@0 1206 mapToNative[destIx] = (uint8_t)(srcIx - ix);
michael@0 1207 mapToUChars[srcIx-ix] = (uint8_t)destIx;
michael@0 1208 srcIx++;
michael@0 1209 destIx++;
michael@0 1210 } else {
michael@0 1211 // General case, handle everything.
michael@0 1212 if (seenNonAscii == FALSE) {
michael@0 1213 seenNonAscii = TRUE;
michael@0 1214 u8b->bufNILimit = destIx;
michael@0 1215 }
michael@0 1216
michael@0 1217 int32_t cIx = srcIx;
michael@0 1218 int32_t dIx = destIx;
michael@0 1219 int32_t dIxSaved = destIx;
michael@0 1220 U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
michael@0 1221 if (c==0 && nulTerminated) {
michael@0 1222 srcIx--; // back up so the terminating NUL is not counted as part of the chunk
michael@0 1223 break;
michael@0 1224 }
michael@0 1225
michael@0 1226 U16_APPEND_UNSAFE(buf, destIx, c);
michael@0 1227 do {
michael@0 1228 mapToNative[dIx++] = (uint8_t)(cIx - ix);
michael@0 1229 } while (dIx < destIx);
michael@0 1230
michael@0 1231 do {
michael@0 1232 mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
michael@0 1233 } while (cIx < srcIx);
michael@0 1234 }
michael@0 1235 if (srcIx>=strLen) {
michael@0 1236 break;
michael@0 1237 }
michael@0 1238
michael@0 1239 }
michael@0 1240
michael@0 1241 // store Native <--> Chunk Map entries for the end of the buffer.
michael@0 1242 // There is no actual character here, but the index position is valid.
michael@0 1243 mapToNative[destIx] = (uint8_t)(srcIx - ix);
michael@0 1244 mapToUChars[srcIx - ix] = (uint8_t)destIx;
michael@0 1245
michael@0 1246 // fill in Buffer descriptor
michael@0 1247 u8b->bufNativeStart = ix;
michael@0 1248 u8b->bufNativeLimit = srcIx;
michael@0 1249 u8b->bufStartIdx = 0;
michael@0 1250 u8b->bufLimitIdx = destIx;
michael@0 1251 if (seenNonAscii == FALSE) {
michael@0 1252 u8b->bufNILimit = destIx;
michael@0 1253 }
michael@0 1254 u8b->toUCharsMapStart = u8b->bufNativeStart;
michael@0 1255
michael@0 1256 // Set UText chunk to refer to this buffer.
michael@0 1257 ut->chunkContents = buf;
michael@0 1258 ut->chunkOffset = 0;
michael@0 1259 ut->chunkLength = u8b->bufLimitIdx;
michael@0 1260 ut->chunkNativeStart = u8b->bufNativeStart;
michael@0 1261 ut->chunkNativeLimit = u8b->bufNativeLimit;
michael@0 1262 ut->nativeIndexingLimit = u8b->bufNILimit;
michael@0 1263
michael@0 1264 // For zero terminated strings, keep track of the maximum point
michael@0 1265 // scanned so far.
michael@0 1266 if (nulTerminated && srcIx>ut->c) {
michael@0 1267 ut->c = srcIx;
michael@0 1268 if (c==0) {
michael@0 1269 // We scanned to the end.
michael@0 1270 // Remember the actual length.
michael@0 1271 ut->b = srcIx;
michael@0 1272 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 1273 }
michael@0 1274 }
michael@0 1275 return TRUE;
michael@0 1276 }
michael@0 1277
michael@0 1278 // fillReverse: the requested index is in neither buffer. Decode the text that precedes ix, working back to front, into the alternate buffer and make it the current buffer.
michael@0 1279 fillReverse:
michael@0 1280 {
michael@0 1281 // Move the incoming index to a code point boundary.
michael@0 1282 // Can only do this if the incoming index is somewhere in the interior of the string.
michael@0 1283 // If index is at the end, there is no character there to look at.
michael@0 1284 if (ix != ut->b) {
michael@0 1285 U8_SET_CP_START(s8, 0, ix);
michael@0 1286 }
michael@0 1287
michael@0 1288 // Swap the UText buffers.
michael@0 1289 // We want to fill what was previously the alternate buffer,
michael@0 1290 // and make what was the current buffer be the new alternate.
michael@0 1291 UTF8Buf *u8b = (UTF8Buf *)ut->q;
michael@0 1292 ut->q = ut->p;
michael@0 1293 ut->p = u8b;
michael@0 1294
michael@0 1295 UChar *buf = u8b->buf;
michael@0 1296 uint8_t *mapToNative = u8b->mapToNative;
michael@0 1297 uint8_t *mapToUChars = u8b->mapToUChars;
michael@0 1298 int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
michael@0 1299 int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region
michael@0 1300 // at end of buffer to leave room
michael@0 1301 // for a surrogate pair at the
michael@0 1302 // buffer start.
michael@0 1303 int32_t srcIx = ix;
michael@0 1304 int32_t bufNILimit = destIx;
michael@0 1305 UChar32 c;
michael@0 1306
michael@0 1307 // Map to/from Native Indexes, fill in for the position at the end of
michael@0 1308 // the buffer.
michael@0 1309 //
michael@0 1310 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
michael@0 1311 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
michael@0 1312
michael@0 1313 // Fill the chunk buffer
michael@0 1314 // Work backwards, filling from the end of the buffer towards the front.
michael@0 1315 //
michael@0 1316 while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
michael@0 1317 srcIx--;
michael@0 1318 destIx--;
michael@0 1319
michael@0 1320 // Get last byte of the UTF-8 character
michael@0 1321 c = s8[srcIx];
michael@0 1322 if (c<0x80) {
michael@0 1323 // Special case ASCII range for speed.
michael@0 1324 buf[destIx] = (UChar)c;
michael@0 1325 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
michael@0 1326 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
michael@0 1327 } else {
michael@0 1328 // General case, handle everything non-ASCII.
michael@0 1329
michael@0 1330 int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char
michael@0 1331
michael@0 1332 // Get the full character from the UTF8 string.
michael@0 1333 // use code derived from the macros in utf8.h
michael@0 1334 // Leaves srcIx pointing at the first byte of the UTF-8 char.
michael@0 1335 //
michael@0 1336 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
michael@0 1337 // leaves srcIx at first byte of the multi-byte char.
michael@0 1338
michael@0 1339 // Store the character in UTF-16 buffer.
michael@0 1340 if (c<0x10000) {
michael@0 1341 buf[destIx] = (UChar)c;
michael@0 1342 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
michael@0 1343 } else {
michael@0 1344 buf[destIx] = U16_TRAIL(c);
michael@0 1345 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
michael@0 1346 buf[--destIx] = U16_LEAD(c);
michael@0 1347 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
michael@0 1348 }
michael@0 1349
michael@0 1350 // Fill in the map from native indexes to UChars buf index.
michael@0 1351 do {
michael@0 1352 mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
michael@0 1353 } while (sIx >= srcIx);
michael@0 1354
michael@0 1355 // Set native indexing limit to be the current position.
michael@0 1356 // We are processing a non-ascii, non-native-indexing char now;
michael@0 1357 // the limit will be here if the rest of the chars to be
michael@0 1358 // added to this buffer are ascii.
michael@0 1359 bufNILimit = destIx;
michael@0 1360 }
michael@0 1361 }
michael@0 1362 u8b->bufNativeStart = srcIx;
michael@0 1363 u8b->bufNativeLimit = ix;
michael@0 1364 u8b->bufStartIdx = destIx;
michael@0 1365 u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2;
michael@0 1366 u8b->bufNILimit = bufNILimit - u8b->bufStartIdx;
michael@0 1367 u8b->toUCharsMapStart = toUCharsMapStart;
michael@0 1368
michael@0 1369 ut->chunkContents = &buf[u8b->bufStartIdx];
michael@0 1370 ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx;
michael@0 1371 ut->chunkOffset = ut->chunkLength;
michael@0 1372 ut->chunkNativeStart = u8b->bufNativeStart;
michael@0 1373 ut->chunkNativeLimit = u8b->bufNativeLimit;
michael@0 1374 ut->nativeIndexingLimit = u8b->bufNILimit;
michael@0 1375 return TRUE;
michael@0 1376 }
michael@0 1377
michael@0 1378 }
michael@0 1379
michael@0 1380
michael@0 1381
michael@0 1382 //
michael@0 1383 // This is a slightly modified copy of u_strFromUTF8,
michael@0 1384 // Inserts a Replacement Char rather than failing on invalid UTF-8
michael@0 1385 // Removes unnecessary features.
michael@0 1386 //
michael@0 1387 static UChar*
michael@0 1388 utext_strFromUTF8(UChar *dest,
michael@0 1389 int32_t destCapacity,
michael@0 1390 int32_t *pDestLength,
michael@0 1391 const char* src,
michael@0 1392 int32_t srcLength, // required. NUL terminated not supported.
michael@0 1393 UErrorCode *pErrorCode
michael@0 1394 )
michael@0 1395 {
michael@0 1396
michael@0 1397 UChar *pDest = dest;
michael@0 1398 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
michael@0 1399 UChar32 ch=0;
michael@0 1400 int32_t index = 0;
michael@0 1401 int32_t reqLength = 0;
michael@0 1402 uint8_t* pSrc = (uint8_t*) src;
michael@0 1403
michael@0 1404
michael@0 1405 while((index < srcLength)&&(pDest<pDestLimit)){
michael@0 1406 ch = pSrc[index++];
michael@0 1407 if(ch <=0x7f){
michael@0 1408 *pDest++=(UChar)ch;
michael@0 1409 }else{
michael@0 1410 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
michael@0 1411 if(U_IS_BMP(ch)){
michael@0 1412 *(pDest++)=(UChar)ch;
michael@0 1413 }else{
michael@0 1414 *(pDest++)=U16_LEAD(ch);
michael@0 1415 if(pDest<pDestLimit){
michael@0 1416 *(pDest++)=U16_TRAIL(ch);
michael@0 1417 }else{
michael@0 1418 reqLength++;
michael@0 1419 break;
michael@0 1420 }
michael@0 1421 }
michael@0 1422 }
michael@0 1423 }
michael@0 1424 /* donot fill the dest buffer just count the UChars needed */
michael@0 1425 while(index < srcLength){
michael@0 1426 ch = pSrc[index++];
michael@0 1427 if(ch <= 0x7f){
michael@0 1428 reqLength++;
michael@0 1429 }else{
michael@0 1430 ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
michael@0 1431 reqLength+=U16_LENGTH(ch);
michael@0 1432 }
michael@0 1433 }
michael@0 1434
michael@0 1435 reqLength+=(int32_t)(pDest - dest);
michael@0 1436
michael@0 1437 if(pDestLength){
michael@0 1438 *pDestLength = reqLength;
michael@0 1439 }
michael@0 1440
michael@0 1441 /* Terminate the buffer */
michael@0 1442 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
michael@0 1443
michael@0 1444 return dest;
michael@0 1445 }
michael@0 1446
michael@0 1447
michael@0 1448
michael@0 1449 static int32_t U_CALLCONV
michael@0 1450 utf8TextExtract(UText *ut,
michael@0 1451 int64_t start, int64_t limit,
michael@0 1452 UChar *dest, int32_t destCapacity,
michael@0 1453 UErrorCode *pErrorCode) {
michael@0 1454 if(U_FAILURE(*pErrorCode)) {
michael@0 1455 return 0;
michael@0 1456 }
michael@0 1457 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
michael@0 1458 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1459 return 0;
michael@0 1460 }
michael@0 1461 int32_t length = ut->b;
michael@0 1462 int32_t start32 = pinIndex(start, length);
michael@0 1463 int32_t limit32 = pinIndex(limit, length);
michael@0 1464
michael@0 1465 if(start32>limit32) {
michael@0 1466 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1467 return 0;
michael@0 1468 }
michael@0 1469
michael@0 1470
michael@0 1471 // adjust the incoming indexes to land on code point boundaries if needed.
michael@0 1472 // adjust by no more than three, because that is the largest number of trail bytes
michael@0 1473 // in a well formed UTF8 character.
michael@0 1474 const uint8_t *buf = (const uint8_t *)ut->context;
michael@0 1475 int i;
michael@0 1476 if (start32 < ut->chunkNativeLimit) {
michael@0 1477 for (i=0; i<3; i++) {
michael@0 1478 if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
michael@0 1479 break;
michael@0 1480 }
michael@0 1481 start32--;
michael@0 1482 }
michael@0 1483 }
michael@0 1484
michael@0 1485 if (limit32 < ut->chunkNativeLimit) {
michael@0 1486 for (i=0; i<3; i++) {
michael@0 1487 if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
michael@0 1488 break;
michael@0 1489 }
michael@0 1490 limit32--;
michael@0 1491 }
michael@0 1492 }
michael@0 1493
michael@0 1494 // Do the actual extract.
michael@0 1495 int32_t destLength=0;
michael@0 1496 utext_strFromUTF8(dest, destCapacity, &destLength,
michael@0 1497 (const char *)ut->context+start32, limit32-start32,
michael@0 1498 pErrorCode);
michael@0 1499 utf8TextAccess(ut, limit32, TRUE);
michael@0 1500 return destLength;
michael@0 1501 }
michael@0 1502
michael@0 1503 //
michael@0 1504 // utf8TextMapOffsetToNative
michael@0 1505 //
michael@0 1506 // Map a chunk (UTF-16) offset to a native index.
michael@0 1507 static int64_t U_CALLCONV
michael@0 1508 utf8TextMapOffsetToNative(const UText *ut) {
michael@0 1509 //
michael@0 1510 UTF8Buf *u8b = (UTF8Buf *)ut->p;
michael@0 1511 U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
michael@0 1512 int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
michael@0 1513 U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
michael@0 1514 return nativeOffset;
michael@0 1515 }
michael@0 1516
michael@0 1517 //
michael@0 1518 // Map a native index to the corrsponding chunk offset
michael@0 1519 //
michael@0 1520 static int32_t U_CALLCONV
michael@0 1521 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
michael@0 1522 U_ASSERT(index64 <= 0x7fffffff);
michael@0 1523 int32_t index = (int32_t)index64;
michael@0 1524 UTF8Buf *u8b = (UTF8Buf *)ut->p;
michael@0 1525 U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
michael@0 1526 U_ASSERT(index<=ut->chunkNativeLimit);
michael@0 1527 int32_t mapIndex = index - u8b->toUCharsMapStart;
michael@0 1528 int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
michael@0 1529 U_ASSERT(offset>=0 && offset<=ut->chunkLength);
michael@0 1530 return offset;
michael@0 1531 }
michael@0 1532
michael@0 1533 static UText * U_CALLCONV
michael@0 1534 utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
michael@0 1535 {
michael@0 1536 // First do a generic shallow clone. Does everything needed for the UText struct itself.
michael@0 1537 dest = shallowTextClone(dest, src, status);
michael@0 1538
michael@0 1539 // For deep clones, make a copy of the string.
michael@0 1540 // The copied storage is owned by the newly created clone.
michael@0 1541 //
michael@0 1542 // TODO: There is an isssue with using utext_nativeLength().
michael@0 1543 // That function is non-const in cases where the input was NUL terminated
michael@0 1544 // and the length has not yet been determined.
michael@0 1545 // This function (clone()) is const.
michael@0 1546 // There potentially a thread safety issue lurking here.
michael@0 1547 //
michael@0 1548 if (deep && U_SUCCESS(*status)) {
michael@0 1549 int32_t len = (int32_t)utext_nativeLength((UText *)src);
michael@0 1550 char *copyStr = (char *)uprv_malloc(len+1);
michael@0 1551 if (copyStr == NULL) {
michael@0 1552 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 1553 } else {
michael@0 1554 uprv_memcpy(copyStr, src->context, len+1);
michael@0 1555 dest->context = copyStr;
michael@0 1556 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
michael@0 1557 }
michael@0 1558 }
michael@0 1559 return dest;
michael@0 1560 }
michael@0 1561
michael@0 1562
michael@0 1563 static void U_CALLCONV
michael@0 1564 utf8TextClose(UText *ut) {
michael@0 1565 // Most of the work of close is done by the generic UText framework close.
michael@0 1566 // All that needs to be done here is to delete the UTF8 string if the UText
michael@0 1567 // owns it. This occurs if the UText was created by cloning.
michael@0 1568 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
michael@0 1569 char *s = (char *)ut->context;
michael@0 1570 uprv_free(s);
michael@0 1571 ut->context = NULL;
michael@0 1572 }
michael@0 1573 }
michael@0 1574
michael@0 1575 U_CDECL_END
michael@0 1576
michael@0 1577
michael@0 1578 static const struct UTextFuncs utf8Funcs =
michael@0 1579 {
michael@0 1580 sizeof(UTextFuncs),
michael@0 1581 0, 0, 0, // Reserved alignment padding
michael@0 1582 utf8TextClone,
michael@0 1583 utf8TextLength,
michael@0 1584 utf8TextAccess,
michael@0 1585 utf8TextExtract,
michael@0 1586 NULL, /* replace*/
michael@0 1587 NULL, /* copy */
michael@0 1588 utf8TextMapOffsetToNative,
michael@0 1589 utf8TextMapIndexToUTF16,
michael@0 1590 utf8TextClose,
michael@0 1591 NULL, // spare 1
michael@0 1592 NULL, // spare 2
michael@0 1593 NULL // spare 3
michael@0 1594 };
michael@0 1595
michael@0 1596
michael@0 1597 static const char gEmptyString[] = {0};
michael@0 1598
michael@0 1599 U_CAPI UText * U_EXPORT2
michael@0 1600 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
michael@0 1601 if(U_FAILURE(*status)) {
michael@0 1602 return NULL;
michael@0 1603 }
michael@0 1604 if(s==NULL && length==0) {
michael@0 1605 s = gEmptyString;
michael@0 1606 }
michael@0 1607
michael@0 1608 if(s==NULL || length<-1 || length>INT32_MAX) {
michael@0 1609 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1610 return NULL;
michael@0 1611 }
michael@0 1612
michael@0 1613 ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
michael@0 1614 if (U_FAILURE(*status)) {
michael@0 1615 return ut;
michael@0 1616 }
michael@0 1617
michael@0 1618 ut->pFuncs = &utf8Funcs;
michael@0 1619 ut->context = s;
michael@0 1620 ut->b = (int32_t)length;
michael@0 1621 ut->c = (int32_t)length;
michael@0 1622 if (ut->c < 0) {
michael@0 1623 ut->c = 0;
michael@0 1624 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 1625 }
michael@0 1626 ut->p = ut->pExtra;
michael@0 1627 ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
michael@0 1628 return ut;
michael@0 1629
michael@0 1630 }
michael@0 1631
michael@0 1632
michael@0 1633
michael@0 1634
michael@0 1635
michael@0 1636
michael@0 1637
michael@0 1638
michael@0 1639 //------------------------------------------------------------------------------
michael@0 1640 //
michael@0 1641 // UText implementation wrapper for Replaceable (read/write)
michael@0 1642 //
//   Use of UText data members:
//      context    pointer to Replaceable. The UText owns it (and deletes it on
//                 close) when UTEXT_PROVIDER_OWNS_TEXT is set in providerProperties.
//      p          not used to signal ownership; see the flag above.
michael@0 1646 //
michael@0 1647 //------------------------------------------------------------------------------
michael@0 1648
michael@0 1649
michael@0 1650
michael@0 1651 // minimum chunk size for this implementation: 3
michael@0 1652 // to allow for possible trimming for code point boundaries
michael@0 1653 enum { REP_TEXT_CHUNK_SIZE=10 };
michael@0 1654
michael@0 1655 struct ReplExtra {
michael@0 1656 /*
michael@0 1657 * Chunk UChars.
michael@0 1658 * +1 to simplify filling with surrogate pair at the end.
michael@0 1659 */
michael@0 1660 UChar s[REP_TEXT_CHUNK_SIZE+1];
michael@0 1661 };
michael@0 1662
michael@0 1663
michael@0 1664 U_CDECL_BEGIN
michael@0 1665
michael@0 1666 static UText * U_CALLCONV
michael@0 1667 repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
michael@0 1668 // First do a generic shallow clone. Does everything needed for the UText struct itself.
michael@0 1669 dest = shallowTextClone(dest, src, status);
michael@0 1670
michael@0 1671 // For deep clones, make a copy of the Replaceable.
michael@0 1672 // The copied Replaceable storage is owned by the newly created UText clone.
michael@0 1673 // A non-NULL pointer in UText.p is the signal to the close() function to delete
michael@0 1674 // it.
michael@0 1675 //
michael@0 1676 if (deep && U_SUCCESS(*status)) {
michael@0 1677 const Replaceable *replSrc = (const Replaceable *)src->context;
michael@0 1678 dest->context = replSrc->clone();
michael@0 1679 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
michael@0 1680
michael@0 1681 // with deep clone, the copy is writable, even when the source is not.
michael@0 1682 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
michael@0 1683 }
michael@0 1684 return dest;
michael@0 1685 }
michael@0 1686
michael@0 1687
michael@0 1688 static void U_CALLCONV
michael@0 1689 repTextClose(UText *ut) {
michael@0 1690 // Most of the work of close is done by the generic UText framework close.
michael@0 1691 // All that needs to be done here is delete the Replaceable if the UText
michael@0 1692 // owns it. This occurs if the UText was created by cloning.
michael@0 1693 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
michael@0 1694 Replaceable *rep = (Replaceable *)ut->context;
michael@0 1695 delete rep;
michael@0 1696 ut->context = NULL;
michael@0 1697 }
michael@0 1698 }
michael@0 1699
michael@0 1700
michael@0 1701 static int64_t U_CALLCONV
michael@0 1702 repTextLength(UText *ut) {
michael@0 1703 const Replaceable *replSrc = (const Replaceable *)ut->context;
michael@0 1704 int32_t len = replSrc->length();
michael@0 1705 return len;
michael@0 1706 }
michael@0 1707
michael@0 1708
michael@0 1709 static UBool U_CALLCONV
michael@0 1710 repTextAccess(UText *ut, int64_t index, UBool forward) {
michael@0 1711 const Replaceable *rep=(const Replaceable *)ut->context;
michael@0 1712 int32_t length=rep->length(); // Full length of the input text (bigger than a chunk)
michael@0 1713
michael@0 1714 // clip the requested index to the limits of the text.
michael@0 1715 int32_t index32 = pinIndex(index, length);
michael@0 1716 U_ASSERT(index<=INT32_MAX);
michael@0 1717
michael@0 1718
michael@0 1719 /*
michael@0 1720 * Compute start/limit boundaries around index, for a segment of text
michael@0 1721 * to be extracted.
michael@0 1722 * To allow for the possibility that our user gave an index to the trailing
michael@0 1723 * half of a surrogate pair, we must request one extra preceding UChar when
michael@0 1724 * going in the forward direction. This will ensure that the buffer has the
michael@0 1725 * entire code point at the specified index.
michael@0 1726 */
michael@0 1727 if(forward) {
michael@0 1728
michael@0 1729 if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
michael@0 1730 // Buffer already contains the requested position.
michael@0 1731 ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
michael@0 1732 return TRUE;
michael@0 1733 }
michael@0 1734 if (index32>=length && ut->chunkNativeLimit==length) {
michael@0 1735 // Request for end of string, and buffer already extends up to it.
michael@0 1736 // Can't get the data, but don't change the buffer.
michael@0 1737 ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
michael@0 1738 return FALSE;
michael@0 1739 }
michael@0 1740
michael@0 1741 ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
michael@0 1742 // Going forward, so we want to have the buffer with stuff at and beyond
michael@0 1743 // the requested index. The -1 gets us one code point before the
michael@0 1744 // requested index also, to handle the case of the index being on
michael@0 1745 // a trail surrogate of a surrogate pair.
michael@0 1746 if(ut->chunkNativeLimit > length) {
michael@0 1747 ut->chunkNativeLimit = length;
michael@0 1748 }
michael@0 1749 // unless buffer ran off end, start is index-1.
michael@0 1750 ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
michael@0 1751 if(ut->chunkNativeStart < 0) {
michael@0 1752 ut->chunkNativeStart = 0;
michael@0 1753 }
michael@0 1754 } else {
michael@0 1755 // Reverse iteration. Fill buffer with data preceding the requested index.
michael@0 1756 if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
michael@0 1757 // Requested position already in buffer.
michael@0 1758 ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
michael@0 1759 return TRUE;
michael@0 1760 }
michael@0 1761 if (index32==0 && ut->chunkNativeStart==0) {
michael@0 1762 // Request for start, buffer already begins at start.
michael@0 1763 // No data, but keep the buffer as is.
michael@0 1764 ut->chunkOffset = 0;
michael@0 1765 return FALSE;
michael@0 1766 }
michael@0 1767
michael@0 1768 // Figure out the bounds of the chunk to extract for reverse iteration.
michael@0 1769 // Need to worry about chunk not splitting surrogate pairs, and while still
michael@0 1770 // containing the data we need.
michael@0 1771 // Fix by requesting a chunk that includes an extra UChar at the end.
michael@0 1772 // If this turns out to be a lead surrogate, we can lop it off and still have
michael@0 1773 // the data we wanted.
michael@0 1774 ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
michael@0 1775 if (ut->chunkNativeStart < 0) {
michael@0 1776 ut->chunkNativeStart = 0;
michael@0 1777 }
michael@0 1778
michael@0 1779 ut->chunkNativeLimit = index32 + 1;
michael@0 1780 if (ut->chunkNativeLimit > length) {
michael@0 1781 ut->chunkNativeLimit = length;
michael@0 1782 }
michael@0 1783 }
michael@0 1784
michael@0 1785 // Extract the new chunk of text from the Replaceable source.
michael@0 1786 ReplExtra *ex = (ReplExtra *)ut->pExtra;
michael@0 1787 // UnicodeString with its buffer a writable alias to the chunk buffer
michael@0 1788 UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
michael@0 1789 rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
michael@0 1790
michael@0 1791 ut->chunkContents = ex->s;
michael@0 1792 ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
michael@0 1793 ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart);
michael@0 1794
michael@0 1795 // Surrogate pairs from the input text must not span chunk boundaries.
michael@0 1796 // If end of chunk could be the start of a surrogate, trim it off.
michael@0 1797 if (ut->chunkNativeLimit < length &&
michael@0 1798 U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
michael@0 1799 ut->chunkLength--;
michael@0 1800 ut->chunkNativeLimit--;
michael@0 1801 if (ut->chunkOffset > ut->chunkLength) {
michael@0 1802 ut->chunkOffset = ut->chunkLength;
michael@0 1803 }
michael@0 1804 }
michael@0 1805
michael@0 1806 // if the first UChar in the chunk could be the trailing half of a surrogate pair,
michael@0 1807 // trim it off.
michael@0 1808 if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
michael@0 1809 ++(ut->chunkContents);
michael@0 1810 ++(ut->chunkNativeStart);
michael@0 1811 --(ut->chunkLength);
michael@0 1812 --(ut->chunkOffset);
michael@0 1813 }
michael@0 1814
michael@0 1815 // adjust the index/chunkOffset to a code point boundary
michael@0 1816 U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
michael@0 1817
michael@0 1818 // Use fast indexing for get/setNativeIndex()
michael@0 1819 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 1820
michael@0 1821 return TRUE;
michael@0 1822 }
michael@0 1823
michael@0 1824
michael@0 1825
michael@0 1826 static int32_t U_CALLCONV
michael@0 1827 repTextExtract(UText *ut,
michael@0 1828 int64_t start, int64_t limit,
michael@0 1829 UChar *dest, int32_t destCapacity,
michael@0 1830 UErrorCode *status) {
michael@0 1831 const Replaceable *rep=(const Replaceable *)ut->context;
michael@0 1832 int32_t length=rep->length();
michael@0 1833
michael@0 1834 if(U_FAILURE(*status)) {
michael@0 1835 return 0;
michael@0 1836 }
michael@0 1837 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
michael@0 1838 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1839 }
michael@0 1840 if(start>limit) {
michael@0 1841 *status=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1842 return 0;
michael@0 1843 }
michael@0 1844
michael@0 1845 int32_t start32 = pinIndex(start, length);
michael@0 1846 int32_t limit32 = pinIndex(limit, length);
michael@0 1847
michael@0 1848 // adjust start, limit if they point to trail half of surrogates
michael@0 1849 if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
michael@0 1850 U_IS_SUPPLEMENTARY(rep->char32At(start32))){
michael@0 1851 start32--;
michael@0 1852 }
michael@0 1853 if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
michael@0 1854 U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
michael@0 1855 limit32--;
michael@0 1856 }
michael@0 1857
michael@0 1858 length=limit32-start32;
michael@0 1859 if(length>destCapacity) {
michael@0 1860 limit32 = start32 + destCapacity;
michael@0 1861 }
michael@0 1862 UnicodeString buffer(dest, 0, destCapacity); // writable alias
michael@0 1863 rep->extractBetween(start32, limit32, buffer);
michael@0 1864 repTextAccess(ut, limit32, TRUE);
michael@0 1865
michael@0 1866 return u_terminateUChars(dest, destCapacity, length, status);
michael@0 1867 }
michael@0 1868
michael@0 1869 static int32_t U_CALLCONV
michael@0 1870 repTextReplace(UText *ut,
michael@0 1871 int64_t start, int64_t limit,
michael@0 1872 const UChar *src, int32_t length,
michael@0 1873 UErrorCode *status) {
michael@0 1874 Replaceable *rep=(Replaceable *)ut->context;
michael@0 1875 int32_t oldLength;
michael@0 1876
michael@0 1877 if(U_FAILURE(*status)) {
michael@0 1878 return 0;
michael@0 1879 }
michael@0 1880 if(src==NULL && length!=0) {
michael@0 1881 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1882 return 0;
michael@0 1883 }
michael@0 1884 oldLength=rep->length(); // will subtract from new length
michael@0 1885 if(start>limit ) {
michael@0 1886 *status=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1887 return 0;
michael@0 1888 }
michael@0 1889
michael@0 1890 int32_t start32 = pinIndex(start, oldLength);
michael@0 1891 int32_t limit32 = pinIndex(limit, oldLength);
michael@0 1892
michael@0 1893 // Snap start & limit to code point boundaries.
michael@0 1894 if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
michael@0 1895 start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
michael@0 1896 {
michael@0 1897 start32--;
michael@0 1898 }
michael@0 1899 if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
michael@0 1900 U16_IS_TRAIL(rep->charAt(limit32)))
michael@0 1901 {
michael@0 1902 limit32++;
michael@0 1903 }
michael@0 1904
michael@0 1905 // Do the actual replace operation using methods of the Replaceable class
michael@0 1906 UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
michael@0 1907 rep->handleReplaceBetween(start32, limit32, replStr);
michael@0 1908 int32_t newLength = rep->length();
michael@0 1909 int32_t lengthDelta = newLength - oldLength;
michael@0 1910
michael@0 1911 // Is the UText chunk buffer OK?
michael@0 1912 if (ut->chunkNativeLimit > start32) {
michael@0 1913 // this replace operation may have impacted the current chunk.
michael@0 1914 // invalidate it, which will force a reload on the next access.
michael@0 1915 invalidateChunk(ut);
michael@0 1916 }
michael@0 1917
michael@0 1918 // set the iteration position to the end of the newly inserted replacement text.
michael@0 1919 int32_t newIndexPos = limit32 + lengthDelta;
michael@0 1920 repTextAccess(ut, newIndexPos, TRUE);
michael@0 1921
michael@0 1922 return lengthDelta;
michael@0 1923 }
michael@0 1924
michael@0 1925
michael@0 1926 static void U_CALLCONV
michael@0 1927 repTextCopy(UText *ut,
michael@0 1928 int64_t start, int64_t limit,
michael@0 1929 int64_t destIndex,
michael@0 1930 UBool move,
michael@0 1931 UErrorCode *status)
michael@0 1932 {
michael@0 1933 Replaceable *rep=(Replaceable *)ut->context;
michael@0 1934 int32_t length=rep->length();
michael@0 1935
michael@0 1936 if(U_FAILURE(*status)) {
michael@0 1937 return;
michael@0 1938 }
michael@0 1939 if (start>limit || (start<destIndex && destIndex<limit))
michael@0 1940 {
michael@0 1941 *status=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1942 return;
michael@0 1943 }
michael@0 1944
michael@0 1945 int32_t start32 = pinIndex(start, length);
michael@0 1946 int32_t limit32 = pinIndex(limit, length);
michael@0 1947 int32_t destIndex32 = pinIndex(destIndex, length);
michael@0 1948
michael@0 1949 // TODO: snap input parameters to code point boundaries.
michael@0 1950
michael@0 1951 if(move) {
michael@0 1952 // move: copy to destIndex, then replace original with nothing
michael@0 1953 int32_t segLength=limit32-start32;
michael@0 1954 rep->copy(start32, limit32, destIndex32);
michael@0 1955 if(destIndex32<start32) {
michael@0 1956 start32+=segLength;
michael@0 1957 limit32+=segLength;
michael@0 1958 }
michael@0 1959 rep->handleReplaceBetween(start32, limit32, UnicodeString());
michael@0 1960 } else {
michael@0 1961 // copy
michael@0 1962 rep->copy(start32, limit32, destIndex32);
michael@0 1963 }
michael@0 1964
michael@0 1965 // If the change to the text touched the region in the chunk buffer,
michael@0 1966 // invalidate the buffer.
michael@0 1967 int32_t firstAffectedIndex = destIndex32;
michael@0 1968 if (move && start32<firstAffectedIndex) {
michael@0 1969 firstAffectedIndex = start32;
michael@0 1970 }
michael@0 1971 if (firstAffectedIndex < ut->chunkNativeLimit) {
michael@0 1972 // changes may have affected range covered by the chunk
michael@0 1973 invalidateChunk(ut);
michael@0 1974 }
michael@0 1975
michael@0 1976 // Put iteration position at the newly inserted (moved) block,
michael@0 1977 int32_t nativeIterIndex = destIndex32 + limit32 - start32;
michael@0 1978 if (move && destIndex32>start32) {
michael@0 1979 // moved a block of text towards the end of the string.
michael@0 1980 nativeIterIndex = destIndex32;
michael@0 1981 }
michael@0 1982
michael@0 1983 // Set position, reload chunk if needed.
michael@0 1984 repTextAccess(ut, nativeIterIndex, TRUE);
michael@0 1985 }
michael@0 1986
michael@0 1987 static const struct UTextFuncs repFuncs =
michael@0 1988 {
michael@0 1989 sizeof(UTextFuncs),
michael@0 1990 0, 0, 0, // Reserved alignment padding
michael@0 1991 repTextClone,
michael@0 1992 repTextLength,
michael@0 1993 repTextAccess,
michael@0 1994 repTextExtract,
michael@0 1995 repTextReplace,
michael@0 1996 repTextCopy,
michael@0 1997 NULL, // MapOffsetToNative,
michael@0 1998 NULL, // MapIndexToUTF16,
michael@0 1999 repTextClose,
michael@0 2000 NULL, // spare 1
michael@0 2001 NULL, // spare 2
michael@0 2002 NULL // spare 3
michael@0 2003 };
michael@0 2004
michael@0 2005
michael@0 2006 U_CAPI UText * U_EXPORT2
michael@0 2007 utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
michael@0 2008 {
michael@0 2009 if(U_FAILURE(*status)) {
michael@0 2010 return NULL;
michael@0 2011 }
michael@0 2012 if(rep==NULL) {
michael@0 2013 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2014 return NULL;
michael@0 2015 }
michael@0 2016 ut = utext_setup(ut, sizeof(ReplExtra), status);
michael@0 2017
michael@0 2018 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
michael@0 2019 if(rep->hasMetaData()) {
michael@0 2020 ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
michael@0 2021 }
michael@0 2022
michael@0 2023 ut->pFuncs = &repFuncs;
michael@0 2024 ut->context = rep;
michael@0 2025 return ut;
michael@0 2026 }
michael@0 2027
michael@0 2028 U_CDECL_END
michael@0 2029
michael@0 2030
michael@0 2031
michael@0 2032
michael@0 2033
michael@0 2034
michael@0 2035
michael@0 2036
michael@0 2037 //------------------------------------------------------------------------------
michael@0 2038 //
michael@0 2039 // UText implementation for UnicodeString (read/write) and
michael@0 2040 // for const UnicodeString (read only)
michael@0 2041 // (same implementation, only the flags are different)
michael@0 2042 //
//   Use of UText data members:
//      context    pointer to UnicodeString. The UText owns it (and deletes it on
//                 close) when UTEXT_PROVIDER_OWNS_TEXT is set in providerProperties.
//      p          not used to signal ownership; see the flag above.
michael@0 2047 //
michael@0 2048 //------------------------------------------------------------------------------
michael@0 2049
michael@0 2050 U_CDECL_BEGIN
michael@0 2051
michael@0 2052
michael@0 2053 static UText * U_CALLCONV
michael@0 2054 unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
michael@0 2055 // First do a generic shallow clone. Does everything needed for the UText struct itself.
michael@0 2056 dest = shallowTextClone(dest, src, status);
michael@0 2057
michael@0 2058 // For deep clones, make a copy of the UnicodeSring.
michael@0 2059 // The copied UnicodeString storage is owned by the newly created UText clone.
michael@0 2060 // A non-NULL pointer in UText.p is the signal to the close() function to delete
michael@0 2061 // the UText.
michael@0 2062 //
michael@0 2063 if (deep && U_SUCCESS(*status)) {
michael@0 2064 const UnicodeString *srcString = (const UnicodeString *)src->context;
michael@0 2065 dest->context = new UnicodeString(*srcString);
michael@0 2066 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
michael@0 2067
michael@0 2068 // with deep clone, the copy is writable, even when the source is not.
michael@0 2069 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
michael@0 2070 }
michael@0 2071 return dest;
michael@0 2072 }
michael@0 2073
michael@0 2074 static void U_CALLCONV
michael@0 2075 unistrTextClose(UText *ut) {
michael@0 2076 // Most of the work of close is done by the generic UText framework close.
michael@0 2077 // All that needs to be done here is delete the UnicodeString if the UText
michael@0 2078 // owns it. This occurs if the UText was created by cloning.
michael@0 2079 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
michael@0 2080 UnicodeString *str = (UnicodeString *)ut->context;
michael@0 2081 delete str;
michael@0 2082 ut->context = NULL;
michael@0 2083 }
michael@0 2084 }
michael@0 2085
michael@0 2086
michael@0 2087 static int64_t U_CALLCONV
michael@0 2088 unistrTextLength(UText *t) {
michael@0 2089 return ((const UnicodeString *)t->context)->length();
michael@0 2090 }
michael@0 2091
michael@0 2092
michael@0 2093 static UBool U_CALLCONV
michael@0 2094 unistrTextAccess(UText *ut, int64_t index, UBool forward) {
michael@0 2095 int32_t length = ut->chunkLength;
michael@0 2096 ut->chunkOffset = pinIndex(index, length);
michael@0 2097
michael@0 2098 // Check whether request is at the start or end
michael@0 2099 UBool retVal = (forward && index<length) || (!forward && index>0);
michael@0 2100 return retVal;
michael@0 2101 }
michael@0 2102
michael@0 2103
michael@0 2104
michael@0 2105 static int32_t U_CALLCONV
michael@0 2106 unistrTextExtract(UText *t,
michael@0 2107 int64_t start, int64_t limit,
michael@0 2108 UChar *dest, int32_t destCapacity,
michael@0 2109 UErrorCode *pErrorCode) {
michael@0 2110 const UnicodeString *us=(const UnicodeString *)t->context;
michael@0 2111 int32_t length=us->length();
michael@0 2112
michael@0 2113 if(U_FAILURE(*pErrorCode)) {
michael@0 2114 return 0;
michael@0 2115 }
michael@0 2116 if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
michael@0 2117 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2118 }
michael@0 2119 if(start<0 || start>limit) {
michael@0 2120 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2121 return 0;
michael@0 2122 }
michael@0 2123
michael@0 2124 int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
michael@0 2125 int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
michael@0 2126
michael@0 2127 length=limit32-start32;
michael@0 2128 if (destCapacity>0 && dest!=NULL) {
michael@0 2129 int32_t trimmedLength = length;
michael@0 2130 if(trimmedLength>destCapacity) {
michael@0 2131 trimmedLength=destCapacity;
michael@0 2132 }
michael@0 2133 us->extract(start32, trimmedLength, dest);
michael@0 2134 t->chunkOffset = start32+trimmedLength;
michael@0 2135 } else {
michael@0 2136 t->chunkOffset = start32;
michael@0 2137 }
michael@0 2138 u_terminateUChars(dest, destCapacity, length, pErrorCode);
michael@0 2139 return length;
michael@0 2140 }
michael@0 2141
michael@0 2142 static int32_t U_CALLCONV
michael@0 2143 unistrTextReplace(UText *ut,
michael@0 2144 int64_t start, int64_t limit,
michael@0 2145 const UChar *src, int32_t length,
michael@0 2146 UErrorCode *pErrorCode) {
michael@0 2147 UnicodeString *us=(UnicodeString *)ut->context;
michael@0 2148 int32_t oldLength;
michael@0 2149
michael@0 2150 if(U_FAILURE(*pErrorCode)) {
michael@0 2151 return 0;
michael@0 2152 }
michael@0 2153 if(src==NULL && length!=0) {
michael@0 2154 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2155 }
michael@0 2156 if(start>limit) {
michael@0 2157 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2158 return 0;
michael@0 2159 }
michael@0 2160 oldLength=us->length();
michael@0 2161 int32_t start32 = pinIndex(start, oldLength);
michael@0 2162 int32_t limit32 = pinIndex(limit, oldLength);
michael@0 2163 if (start32 < oldLength) {
michael@0 2164 start32 = us->getChar32Start(start32);
michael@0 2165 }
michael@0 2166 if (limit32 < oldLength) {
michael@0 2167 limit32 = us->getChar32Start(limit32);
michael@0 2168 }
michael@0 2169
michael@0 2170 // replace
michael@0 2171 us->replace(start32, limit32-start32, src, length);
michael@0 2172 int32_t newLength = us->length();
michael@0 2173
michael@0 2174 // Update the chunk description.
michael@0 2175 ut->chunkContents = us->getBuffer();
michael@0 2176 ut->chunkLength = newLength;
michael@0 2177 ut->chunkNativeLimit = newLength;
michael@0 2178 ut->nativeIndexingLimit = newLength;
michael@0 2179
michael@0 2180 // Set iteration position to the point just following the newly inserted text.
michael@0 2181 int32_t lengthDelta = newLength - oldLength;
michael@0 2182 ut->chunkOffset = limit32 + lengthDelta;
michael@0 2183
michael@0 2184 return lengthDelta;
michael@0 2185 }
michael@0 2186
michael@0 2187 static void U_CALLCONV
michael@0 2188 unistrTextCopy(UText *ut,
michael@0 2189 int64_t start, int64_t limit,
michael@0 2190 int64_t destIndex,
michael@0 2191 UBool move,
michael@0 2192 UErrorCode *pErrorCode) {
michael@0 2193 UnicodeString *us=(UnicodeString *)ut->context;
michael@0 2194 int32_t length=us->length();
michael@0 2195
michael@0 2196 if(U_FAILURE(*pErrorCode)) {
michael@0 2197 return;
michael@0 2198 }
michael@0 2199 int32_t start32 = pinIndex(start, length);
michael@0 2200 int32_t limit32 = pinIndex(limit, length);
michael@0 2201 int32_t destIndex32 = pinIndex(destIndex, length);
michael@0 2202
michael@0 2203 if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
michael@0 2204 *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 2205 return;
michael@0 2206 }
michael@0 2207
michael@0 2208 if(move) {
michael@0 2209 // move: copy to destIndex, then replace original with nothing
michael@0 2210 int32_t segLength=limit32-start32;
michael@0 2211 us->copy(start32, limit32, destIndex32);
michael@0 2212 if(destIndex32<start32) {
michael@0 2213 start32+=segLength;
michael@0 2214 }
michael@0 2215 us->replace(start32, segLength, NULL, 0);
michael@0 2216 } else {
michael@0 2217 // copy
michael@0 2218 us->copy(start32, limit32, destIndex32);
michael@0 2219 }
michael@0 2220
michael@0 2221 // update chunk description, set iteration position.
michael@0 2222 ut->chunkContents = us->getBuffer();
michael@0 2223 if (move==FALSE) {
michael@0 2224 // copy operation, string length grows
michael@0 2225 ut->chunkLength += limit32-start32;
michael@0 2226 ut->chunkNativeLimit = ut->chunkLength;
michael@0 2227 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 2228 }
michael@0 2229
michael@0 2230 // Iteration position to end of the newly inserted text.
michael@0 2231 ut->chunkOffset = destIndex32+limit32-start32;
michael@0 2232 if (move && destIndex32>start32) {
michael@0 2233 ut->chunkOffset = destIndex32;
michael@0 2234 }
michael@0 2235
michael@0 2236 }
michael@0 2237
michael@0 2238 static const struct UTextFuncs unistrFuncs =
michael@0 2239 {
michael@0 2240 sizeof(UTextFuncs),
michael@0 2241 0, 0, 0, // Reserved alignment padding
michael@0 2242 unistrTextClone,
michael@0 2243 unistrTextLength,
michael@0 2244 unistrTextAccess,
michael@0 2245 unistrTextExtract,
michael@0 2246 unistrTextReplace,
michael@0 2247 unistrTextCopy,
michael@0 2248 NULL, // MapOffsetToNative,
michael@0 2249 NULL, // MapIndexToUTF16,
michael@0 2250 unistrTextClose,
michael@0 2251 NULL, // spare 1
michael@0 2252 NULL, // spare 2
michael@0 2253 NULL // spare 3
michael@0 2254 };
michael@0 2255
michael@0 2256
michael@0 2257
michael@0 2258 U_CDECL_END
michael@0 2259
michael@0 2260
michael@0 2261 U_CAPI UText * U_EXPORT2
michael@0 2262 utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
michael@0 2263 ut = utext_openConstUnicodeString(ut, s, status);
michael@0 2264 if (U_SUCCESS(*status)) {
michael@0 2265 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
michael@0 2266 }
michael@0 2267 return ut;
michael@0 2268 }
michael@0 2269
michael@0 2270
michael@0 2271
michael@0 2272 U_CAPI UText * U_EXPORT2
michael@0 2273 utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
michael@0 2274 if (U_SUCCESS(*status) && s->isBogus()) {
michael@0 2275 // The UnicodeString is bogus, but we still need to detach the UText
michael@0 2276 // from whatever it was hooked to before, if anything.
michael@0 2277 utext_openUChars(ut, NULL, 0, status);
michael@0 2278 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2279 return ut;
michael@0 2280 }
michael@0 2281 ut = utext_setup(ut, 0, status);
michael@0 2282 // note: use the standard (writable) function table for UnicodeString.
michael@0 2283 // The flag settings disable writing, so having the functions in
michael@0 2284 // the table is harmless.
michael@0 2285 if (U_SUCCESS(*status)) {
michael@0 2286 ut->pFuncs = &unistrFuncs;
michael@0 2287 ut->context = s;
michael@0 2288 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
michael@0 2289 ut->chunkContents = s->getBuffer();
michael@0 2290 ut->chunkLength = s->length();
michael@0 2291 ut->chunkNativeStart = 0;
michael@0 2292 ut->chunkNativeLimit = ut->chunkLength;
michael@0 2293 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 2294 }
michael@0 2295 return ut;
michael@0 2296 }
michael@0 2297
michael@0 2298 //------------------------------------------------------------------------------
michael@0 2299 //
michael@0 2300 // UText implementation for const UChar * strings
michael@0 2301 //
//   Use of UText data members:
//      context    pointer to the UChar string
//      a          length.  -1 if not yet known.
michael@0 2305 //
michael@0 2306 // TODO: support 64 bit lengths.
michael@0 2307 //
michael@0 2308 //------------------------------------------------------------------------------
michael@0 2309
michael@0 2310 U_CDECL_BEGIN
michael@0 2311
michael@0 2312
michael@0 2313 static UText * U_CALLCONV
michael@0 2314 ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
michael@0 2315 // First do a generic shallow clone.
michael@0 2316 dest = shallowTextClone(dest, src, status);
michael@0 2317
michael@0 2318 // For deep clones, make a copy of the string.
michael@0 2319 // The copied storage is owned by the newly created clone.
michael@0 2320 // A non-NULL pointer in UText.p is the signal to the close() function to delete
michael@0 2321 // it.
michael@0 2322 //
michael@0 2323 if (deep && U_SUCCESS(*status)) {
michael@0 2324 U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
michael@0 2325 int32_t len = (int32_t)utext_nativeLength(dest);
michael@0 2326
michael@0 2327 // The cloned string IS going to be NUL terminated, whether or not the original was.
michael@0 2328 const UChar *srcStr = (const UChar *)src->context;
michael@0 2329 UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
michael@0 2330 if (copyStr == NULL) {
michael@0 2331 *status = U_MEMORY_ALLOCATION_ERROR;
michael@0 2332 } else {
michael@0 2333 int64_t i;
michael@0 2334 for (i=0; i<len; i++) {
michael@0 2335 copyStr[i] = srcStr[i];
michael@0 2336 }
michael@0 2337 copyStr[len] = 0;
michael@0 2338 dest->context = copyStr;
michael@0 2339 dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
michael@0 2340 }
michael@0 2341 }
michael@0 2342 return dest;
michael@0 2343 }
michael@0 2344
michael@0 2345
michael@0 2346 static void U_CALLCONV
michael@0 2347 ucstrTextClose(UText *ut) {
michael@0 2348 // Most of the work of close is done by the generic UText framework close.
michael@0 2349 // All that needs to be done here is delete the string if the UText
michael@0 2350 // owns it. This occurs if the UText was created by cloning.
michael@0 2351 if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
michael@0 2352 UChar *s = (UChar *)ut->context;
michael@0 2353 uprv_free(s);
michael@0 2354 ut->context = NULL;
michael@0 2355 }
michael@0 2356 }
michael@0 2357
michael@0 2358
michael@0 2359
michael@0 2360 static int64_t U_CALLCONV
michael@0 2361 ucstrTextLength(UText *ut) {
michael@0 2362 if (ut->a < 0) {
michael@0 2363 // null terminated, we don't yet know the length. Scan for it.
michael@0 2364 // Access is not convenient for doing this
michael@0 2365 // because the current interation postion can't be changed.
michael@0 2366 const UChar *str = (const UChar *)ut->context;
michael@0 2367 for (;;) {
michael@0 2368 if (str[ut->chunkNativeLimit] == 0) {
michael@0 2369 break;
michael@0 2370 }
michael@0 2371 ut->chunkNativeLimit++;
michael@0 2372 }
michael@0 2373 ut->a = ut->chunkNativeLimit;
michael@0 2374 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
michael@0 2375 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 2376 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 2377 }
michael@0 2378 return ut->a;
michael@0 2379 }
michael@0 2380
michael@0 2381
michael@0 2382 static UBool U_CALLCONV
michael@0 2383 ucstrTextAccess(UText *ut, int64_t index, UBool forward) {
michael@0 2384 const UChar *str = (const UChar *)ut->context;
michael@0 2385
michael@0 2386 // pin the requested index to the bounds of the string,
michael@0 2387 // and set current iteration position.
michael@0 2388 if (index<0) {
michael@0 2389 index = 0;
michael@0 2390 } else if (index < ut->chunkNativeLimit) {
michael@0 2391 // The request data is within the chunk as it is known so far.
michael@0 2392 // Put index on a code point boundary.
michael@0 2393 U16_SET_CP_START(str, 0, index);
michael@0 2394 } else if (ut->a >= 0) {
michael@0 2395 // We know the length of this string, and the user is requesting something
michael@0 2396 // at or beyond the length. Pin the requested index to the length.
michael@0 2397 index = ut->a;
michael@0 2398 } else {
michael@0 2399 // Null terminated string, length not yet known, and the requested index
michael@0 2400 // is beyond where we have scanned so far.
michael@0 2401 // Scan to 32 UChars beyond the requested index. The strategy here is
michael@0 2402 // to avoid fully scanning a long string when the caller only wants to
michael@0 2403 // see a few characters at its beginning.
michael@0 2404 int32_t scanLimit = (int32_t)index + 32;
michael@0 2405 if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression
michael@0 2406 scanLimit = INT32_MAX;
michael@0 2407 }
michael@0 2408
michael@0 2409 int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
michael@0 2410 for (; chunkLimit<scanLimit; chunkLimit++) {
michael@0 2411 if (str[chunkLimit] == 0) {
michael@0 2412 // We found the end of the string. Remember it, pin the requested index to it,
michael@0 2413 // and bail out of here.
michael@0 2414 ut->a = chunkLimit;
michael@0 2415 ut->chunkLength = chunkLimit;
michael@0 2416 ut->nativeIndexingLimit = chunkLimit;
michael@0 2417 if (index >= chunkLimit) {
michael@0 2418 index = chunkLimit;
michael@0 2419 } else {
michael@0 2420 U16_SET_CP_START(str, 0, index);
michael@0 2421 }
michael@0 2422
michael@0 2423 ut->chunkNativeLimit = chunkLimit;
michael@0 2424 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 2425 goto breakout;
michael@0 2426 }
michael@0 2427 }
michael@0 2428 // We scanned through the next batch of UChars without finding the end.
michael@0 2429 U16_SET_CP_START(str, 0, index);
michael@0 2430 if (chunkLimit == INT32_MAX) {
michael@0 2431 // Scanned to the limit of a 32 bit length.
michael@0 2432 // Forceably trim the overlength string back so length fits in int32
michael@0 2433 // TODO: add support for 64 bit strings.
michael@0 2434 ut->a = chunkLimit;
michael@0 2435 ut->chunkLength = chunkLimit;
michael@0 2436 ut->nativeIndexingLimit = chunkLimit;
michael@0 2437 if (index > chunkLimit) {
michael@0 2438 index = chunkLimit;
michael@0 2439 }
michael@0 2440 ut->chunkNativeLimit = chunkLimit;
michael@0 2441 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 2442 } else {
michael@0 2443 // The endpoint of a chunk must not be left in the middle of a surrogate pair.
michael@0 2444 // If the current end is on a lead surrogate, back the end up by one.
michael@0 2445 // It doesn't matter if the end char happens to be an unpaired surrogate,
michael@0 2446 // and it's simpler not to worry about it.
michael@0 2447 if (U16_IS_LEAD(str[chunkLimit-1])) {
michael@0 2448 --chunkLimit;
michael@0 2449 }
michael@0 2450 // Null-terminated chunk with end still unknown.
michael@0 2451 // Update the chunk length to reflect what has been scanned thus far.
michael@0 2452 // That the full length is still unknown is (still) flagged by
michael@0 2453 // ut->a being < 0.
michael@0 2454 ut->chunkNativeLimit = chunkLimit;
michael@0 2455 ut->nativeIndexingLimit = chunkLimit;
michael@0 2456 ut->chunkLength = chunkLimit;
michael@0 2457 }
michael@0 2458
michael@0 2459 }
michael@0 2460 breakout:
michael@0 2461 U_ASSERT(index<=INT32_MAX);
michael@0 2462 ut->chunkOffset = (int32_t)index;
michael@0 2463
michael@0 2464 // Check whether request is at the start or end
michael@0 2465 UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
michael@0 2466 return retVal;
michael@0 2467 }
michael@0 2468
michael@0 2469
michael@0 2470
michael@0 2471 static int32_t U_CALLCONV
michael@0 2472 ucstrTextExtract(UText *ut,
michael@0 2473 int64_t start, int64_t limit,
michael@0 2474 UChar *dest, int32_t destCapacity,
michael@0 2475 UErrorCode *pErrorCode)
michael@0 2476 {
michael@0 2477 if(U_FAILURE(*pErrorCode)) {
michael@0 2478 return 0;
michael@0 2479 }
michael@0 2480 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
michael@0 2481 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2482 return 0;
michael@0 2483 }
michael@0 2484
michael@0 2485 //const UChar *s=(const UChar *)ut->context;
michael@0 2486 int32_t si, di;
michael@0 2487
michael@0 2488 int32_t start32;
michael@0 2489 int32_t limit32;
michael@0 2490
michael@0 2491 // Access the start. Does two things we need:
michael@0 2492 // Pins 'start' to the length of the string, if it came in out-of-bounds.
michael@0 2493 // Snaps 'start' to the beginning of a code point.
michael@0 2494 ucstrTextAccess(ut, start, TRUE);
michael@0 2495 const UChar *s=ut->chunkContents;
michael@0 2496 start32 = ut->chunkOffset;
michael@0 2497
michael@0 2498 int32_t strLength=(int32_t)ut->a;
michael@0 2499 if (strLength >= 0) {
michael@0 2500 limit32 = pinIndex(limit, strLength);
michael@0 2501 } else {
michael@0 2502 limit32 = pinIndex(limit, INT32_MAX);
michael@0 2503 }
michael@0 2504 di = 0;
michael@0 2505 for (si=start32; si<limit32; si++) {
michael@0 2506 if (strLength<0 && s[si]==0) {
michael@0 2507 // Just hit the end of a null-terminated string.
michael@0 2508 ut->a = si; // set string length for this UText
michael@0 2509 ut->chunkNativeLimit = si;
michael@0 2510 ut->chunkLength = si;
michael@0 2511 ut->nativeIndexingLimit = si;
michael@0 2512 strLength = si;
michael@0 2513 break;
michael@0 2514 }
michael@0 2515 U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
michael@0 2516 if (di<destCapacity) {
michael@0 2517 // only store if there is space.
michael@0 2518 dest[di] = s[si];
michael@0 2519 } else {
michael@0 2520 if (strLength>=0) {
michael@0 2521 // We have filled the destination buffer, and the string length is known.
michael@0 2522 // Cut the loop short. There is no need to scan string termination.
michael@0 2523 di = limit32 - start32;
michael@0 2524 si = limit32;
michael@0 2525 break;
michael@0 2526 }
michael@0 2527 }
michael@0 2528 di++;
michael@0 2529 }
michael@0 2530
michael@0 2531 // If the limit index points to a lead surrogate of a pair,
michael@0 2532 // add the corresponding trail surrogate to the destination.
michael@0 2533 if (si>0 && U16_IS_LEAD(s[si-1]) &&
michael@0 2534 ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si])))
michael@0 2535 {
michael@0 2536 if (di<destCapacity) {
michael@0 2537 // store only if there is space in the output buffer.
michael@0 2538 dest[di++] = s[si++];
michael@0 2539 }
michael@0 2540 }
michael@0 2541
michael@0 2542 // Put iteration position at the point just following the extracted text
michael@0 2543 ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
michael@0 2544
michael@0 2545 // Add a terminating NUL if space in the buffer permits,
michael@0 2546 // and set the error status as required.
michael@0 2547 u_terminateUChars(dest, destCapacity, di, pErrorCode);
michael@0 2548 return di;
michael@0 2549 }
michael@0 2550
michael@0 2551 static const struct UTextFuncs ucstrFuncs =
michael@0 2552 {
michael@0 2553 sizeof(UTextFuncs),
michael@0 2554 0, 0, 0, // Reserved alignment padding
michael@0 2555 ucstrTextClone,
michael@0 2556 ucstrTextLength,
michael@0 2557 ucstrTextAccess,
michael@0 2558 ucstrTextExtract,
michael@0 2559 NULL, // Replace
michael@0 2560 NULL, // Copy
michael@0 2561 NULL, // MapOffsetToNative,
michael@0 2562 NULL, // MapIndexToUTF16,
michael@0 2563 ucstrTextClose,
michael@0 2564 NULL, // spare 1
michael@0 2565 NULL, // spare 2
michael@0 2566 NULL, // spare 3
michael@0 2567 };
michael@0 2568
michael@0 2569 U_CDECL_END
michael@0 2570
michael@0 2571 static const UChar gEmptyUString[] = {0};
michael@0 2572
michael@0 2573 U_CAPI UText * U_EXPORT2
michael@0 2574 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
michael@0 2575 if (U_FAILURE(*status)) {
michael@0 2576 return NULL;
michael@0 2577 }
michael@0 2578 if(s==NULL && length==0) {
michael@0 2579 s = gEmptyUString;
michael@0 2580 }
michael@0 2581 if (s==NULL || length < -1 || length>INT32_MAX) {
michael@0 2582 *status = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2583 return NULL;
michael@0 2584 }
michael@0 2585 ut = utext_setup(ut, 0, status);
michael@0 2586 if (U_SUCCESS(*status)) {
michael@0 2587 ut->pFuncs = &ucstrFuncs;
michael@0 2588 ut->context = s;
michael@0 2589 ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
michael@0 2590 if (length==-1) {
michael@0 2591 ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
michael@0 2592 }
michael@0 2593 ut->a = length;
michael@0 2594 ut->chunkContents = s;
michael@0 2595 ut->chunkNativeStart = 0;
michael@0 2596 ut->chunkNativeLimit = length>=0? length : 0;
michael@0 2597 ut->chunkLength = (int32_t)ut->chunkNativeLimit;
michael@0 2598 ut->chunkOffset = 0;
michael@0 2599 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 2600 }
michael@0 2601 return ut;
michael@0 2602 }
michael@0 2603
michael@0 2604
michael@0 2605 //------------------------------------------------------------------------------
michael@0 2606 //
michael@0 2607 // UText implementation for text from ICU CharacterIterators
michael@0 2608 //
michael@0 2609 // Use of UText data members:
michael@0 2610 // context pointer to the CharacterIterator
michael@0 2611 // a length of the full text.
michael@0 2612 // p pointer to buffer 1
michael@0 2613 // b start index of local buffer 1 contents
michael@0 2614 // q pointer to buffer 2
michael@0 2615 // c start index of local buffer 2 contents
michael@0 2616 // r pointer to the character iterator if the UText owns it.
michael@0 2617 // Null otherwise.
michael@0 2618 //
michael@0 2619 //------------------------------------------------------------------------------
michael@0 2620 #define CIBufSize 16
michael@0 2621
michael@0 2622 U_CDECL_BEGIN
michael@0 2623 static void U_CALLCONV
michael@0 2624 charIterTextClose(UText *ut) {
michael@0 2625 // Most of the work of close is done by the generic UText framework close.
michael@0 2626 // All that needs to be done here is delete the CharacterIterator if the UText
michael@0 2627 // owns it. This occurs if the UText was created by cloning.
michael@0 2628 CharacterIterator *ci = (CharacterIterator *)ut->r;
michael@0 2629 delete ci;
michael@0 2630 ut->r = NULL;
michael@0 2631 }
michael@0 2632
michael@0 2633 static int64_t U_CALLCONV
michael@0 2634 charIterTextLength(UText *ut) {
michael@0 2635 return (int32_t)ut->a;
michael@0 2636 }
michael@0 2637
michael@0 2638 static UBool U_CALLCONV
michael@0 2639 charIterTextAccess(UText *ut, int64_t index, UBool forward) {
michael@0 2640 CharacterIterator *ci = (CharacterIterator *)ut->context;
michael@0 2641
michael@0 2642 int32_t clippedIndex = (int32_t)index;
michael@0 2643 if (clippedIndex<0) {
michael@0 2644 clippedIndex=0;
michael@0 2645 } else if (clippedIndex>=ut->a) {
michael@0 2646 clippedIndex=(int32_t)ut->a;
michael@0 2647 }
michael@0 2648 int32_t neededIndex = clippedIndex;
michael@0 2649 if (!forward && neededIndex>0) {
michael@0 2650 // reverse iteration, want the position just before what was asked for.
michael@0 2651 neededIndex--;
michael@0 2652 } else if (forward && neededIndex==ut->a && neededIndex>0) {
michael@0 2653 // Forward iteration, don't ask for something past the end of the text.
michael@0 2654 neededIndex--;
michael@0 2655 }
michael@0 2656
michael@0 2657 // Find the native index of the start of the buffer containing what we want.
michael@0 2658 neededIndex -= neededIndex % CIBufSize;
michael@0 2659
michael@0 2660 UChar *buf = NULL;
michael@0 2661 UBool needChunkSetup = TRUE;
michael@0 2662 int i;
michael@0 2663 if (ut->chunkNativeStart == neededIndex) {
michael@0 2664 // The buffer we want is already the current chunk.
michael@0 2665 needChunkSetup = FALSE;
michael@0 2666 } else if (ut->b == neededIndex) {
michael@0 2667 // The first buffer (buffer p) has what we need.
michael@0 2668 buf = (UChar *)ut->p;
michael@0 2669 } else if (ut->c == neededIndex) {
michael@0 2670 // The second buffer (buffer q) has what we need.
michael@0 2671 buf = (UChar *)ut->q;
michael@0 2672 } else {
michael@0 2673 // Neither buffer already has what we need.
michael@0 2674 // Load new data from the character iterator.
michael@0 2675 // Use the buf that is not the current buffer.
michael@0 2676 buf = (UChar *)ut->p;
michael@0 2677 if (ut->p == ut->chunkContents) {
michael@0 2678 buf = (UChar *)ut->q;
michael@0 2679 }
michael@0 2680 ci->setIndex(neededIndex);
michael@0 2681 for (i=0; i<CIBufSize; i++) {
michael@0 2682 buf[i] = ci->nextPostInc();
michael@0 2683 if (i+neededIndex > ut->a) {
michael@0 2684 break;
michael@0 2685 }
michael@0 2686 }
michael@0 2687 }
michael@0 2688
michael@0 2689 // We have a buffer with the data we need.
michael@0 2690 // Set it up as the current chunk, if it wasn't already.
michael@0 2691 if (needChunkSetup) {
michael@0 2692 ut->chunkContents = buf;
michael@0 2693 ut->chunkLength = CIBufSize;
michael@0 2694 ut->chunkNativeStart = neededIndex;
michael@0 2695 ut->chunkNativeLimit = neededIndex + CIBufSize;
michael@0 2696 if (ut->chunkNativeLimit > ut->a) {
michael@0 2697 ut->chunkNativeLimit = ut->a;
michael@0 2698 ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
michael@0 2699 }
michael@0 2700 ut->nativeIndexingLimit = ut->chunkLength;
michael@0 2701 U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
michael@0 2702 }
michael@0 2703 ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
michael@0 2704 UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
michael@0 2705 return success;
michael@0 2706 }
michael@0 2707
michael@0 2708 static UText * U_CALLCONV
michael@0 2709 charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
michael@0 2710 if (U_FAILURE(*status)) {
michael@0 2711 return NULL;
michael@0 2712 }
michael@0 2713
michael@0 2714 if (deep) {
michael@0 2715 // There is no CharacterIterator API for cloning the underlying text storage.
michael@0 2716 *status = U_UNSUPPORTED_ERROR;
michael@0 2717 return NULL;
michael@0 2718 } else {
michael@0 2719 CharacterIterator *srcCI =(CharacterIterator *)src->context;
michael@0 2720 srcCI = srcCI->clone();
michael@0 2721 dest = utext_openCharacterIterator(dest, srcCI, status);
michael@0 2722 // cast off const on getNativeIndex.
michael@0 2723 // For CharacterIterator based UTexts, this is safe, the operation is const.
michael@0 2724 int64_t ix = utext_getNativeIndex((UText *)src);
michael@0 2725 utext_setNativeIndex(dest, ix);
michael@0 2726 dest->r = srcCI; // flags that this UText owns the CharacterIterator
michael@0 2727 }
michael@0 2728 return dest;
michael@0 2729 }
michael@0 2730
michael@0 2731 static int32_t U_CALLCONV
michael@0 2732 charIterTextExtract(UText *ut,
michael@0 2733 int64_t start, int64_t limit,
michael@0 2734 UChar *dest, int32_t destCapacity,
michael@0 2735 UErrorCode *status)
michael@0 2736 {
michael@0 2737 if(U_FAILURE(*status)) {
michael@0 2738 return 0;
michael@0 2739 }
michael@0 2740 if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
michael@0 2741 *status=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 2742 return 0;
michael@0 2743 }
michael@0 2744 int32_t length = (int32_t)ut->a;
michael@0 2745 int32_t start32 = pinIndex(start, length);
michael@0 2746 int32_t limit32 = pinIndex(limit, length);
michael@0 2747 int32_t desti = 0;
michael@0 2748 int32_t srci;
michael@0 2749 int32_t copyLimit;
michael@0 2750
michael@0 2751 CharacterIterator *ci = (CharacterIterator *)ut->context;
michael@0 2752 ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed.
michael@0 2753 srci = ci->getIndex();
michael@0 2754 copyLimit = srci;
michael@0 2755 while (srci<limit32) {
michael@0 2756 UChar32 c = ci->next32PostInc();
michael@0 2757 int32_t len = U16_LENGTH(c);
michael@0 2758 U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
michael@0 2759 if (desti+len <= destCapacity) {
michael@0 2760 U16_APPEND_UNSAFE(dest, desti, c);
michael@0 2761 copyLimit = srci+len;
michael@0 2762 } else {
michael@0 2763 desti += len;
michael@0 2764 *status = U_BUFFER_OVERFLOW_ERROR;
michael@0 2765 }
michael@0 2766 srci += len;
michael@0 2767 }
michael@0 2768
michael@0 2769 charIterTextAccess(ut, copyLimit, TRUE);
michael@0 2770
michael@0 2771 u_terminateUChars(dest, destCapacity, desti, status);
michael@0 2772 return desti;
michael@0 2773 }
michael@0 2774
michael@0 2775 static const struct UTextFuncs charIterFuncs =
michael@0 2776 {
michael@0 2777 sizeof(UTextFuncs),
michael@0 2778 0, 0, 0, // Reserved alignment padding
michael@0 2779 charIterTextClone,
michael@0 2780 charIterTextLength,
michael@0 2781 charIterTextAccess,
michael@0 2782 charIterTextExtract,
michael@0 2783 NULL, // Replace
michael@0 2784 NULL, // Copy
michael@0 2785 NULL, // MapOffsetToNative,
michael@0 2786 NULL, // MapIndexToUTF16,
michael@0 2787 charIterTextClose,
michael@0 2788 NULL, // spare 1
michael@0 2789 NULL, // spare 2
michael@0 2790 NULL // spare 3
michael@0 2791 };
michael@0 2792 U_CDECL_END
michael@0 2793
michael@0 2794
michael@0 2795 U_CAPI UText * U_EXPORT2
michael@0 2796 utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
michael@0 2797 if (U_FAILURE(*status)) {
michael@0 2798 return NULL;
michael@0 2799 }
michael@0 2800
michael@0 2801 if (ci->startIndex() > 0) {
michael@0 2802 // No support for CharacterIterators that do not start indexing from zero.
michael@0 2803 *status = U_UNSUPPORTED_ERROR;
michael@0 2804 return NULL;
michael@0 2805 }
michael@0 2806
michael@0 2807 // Extra space in UText for 2 buffers of CIBufSize UChars each.
michael@0 2808 int32_t extraSpace = 2 * CIBufSize * sizeof(UChar);
michael@0 2809 ut = utext_setup(ut, extraSpace, status);
michael@0 2810 if (U_SUCCESS(*status)) {
michael@0 2811 ut->pFuncs = &charIterFuncs;
michael@0 2812 ut->context = ci;
michael@0 2813 ut->providerProperties = 0;
michael@0 2814 ut->a = ci->endIndex(); // Length of text
michael@0 2815 ut->p = ut->pExtra; // First buffer
michael@0 2816 ut->b = -1; // Native index of first buffer contents
michael@0 2817 ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer
michael@0 2818 ut->c = -1; // Native index of second buffer contents
michael@0 2819
michael@0 2820 // Initialize current chunk contents to be empty.
michael@0 2821 // First access will fault something in.
michael@0 2822 // Note: The initial nativeStart and chunkOffset must sum to zero
michael@0 2823 // so that getNativeIndex() will correctly compute to zero
michael@0 2824 // if no call to Access() has ever been made. They can't be both
michael@0 2825 // zero without Access() thinking that the chunk is valid.
michael@0 2826 ut->chunkContents = (UChar *)ut->p;
michael@0 2827 ut->chunkNativeStart = -1;
michael@0 2828 ut->chunkOffset = 1;
michael@0 2829 ut->chunkNativeLimit = 0;
michael@0 2830 ut->chunkLength = 0;
michael@0 2831 ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing
michael@0 2832 }
michael@0 2833 return ut;
michael@0 2834 }
michael@0 2835
michael@0 2836
michael@0 2837

mercurial