The Tor Browser: diff intl/icu/source/common/utext.cpp

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/utext.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,2837 @@
     1.4 +/*
     1.5 +*******************************************************************************
     1.6 +*
     1.7 +*   Copyright (C) 2005-2012, International Business Machines
     1.8 +*   Corporation and others.  All Rights Reserved.
     1.9 +*
    1.10 +*******************************************************************************
    1.11 +*   file name:  utext.cpp
    1.12 +*   encoding:   US-ASCII
    1.13 +*   tab size:   8 (not used)
    1.14 +*   indentation:4
    1.15 +*
    1.16 +*   created on: 2005apr12
    1.17 +*   created by: Markus W. Scherer
    1.18 +*/
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +#include "unicode/ustring.h"
    1.22 +#include "unicode/unistr.h"
    1.23 +#include "unicode/chariter.h"
    1.24 +#include "unicode/utext.h"
    1.25 +#include "unicode/utf.h"
    1.26 +#include "unicode/utf8.h"
    1.27 +#include "unicode/utf16.h"
    1.28 +#include "ustr_imp.h"
    1.29 +#include "cmemory.h"
    1.30 +#include "cstring.h"
    1.31 +#include "uassert.h"
    1.32 +#include "putilimp.h"
    1.33 +
    1.34 +U_NAMESPACE_USE
    1.35 +// Mask with only bit number bitIndex set, as an int32_t; used below to test providerProperties bits.
    1.36 +#define I32_FLAG(bitIndex) ((int32_t)1<<(bitIndex))
    1.37 +// File-local shorthand: forward an access request to the provider's access
    1.38 +// function and return its result unchanged.
    1.39 +static UBool
    1.40 +utext_access(UText *ut, int64_t index, UBool forward) {
    1.41 +    return ut->pFuncs->access(ut, index, forward);
    1.42 +}
    1.43 +
    1.44 +// Move the iteration position by delta code points: forward for delta>0, backward for
    1.45 +// delta<0, not at all for 0.  Returns FALSE as soon as a chunk access or code point fetch fails.
    1.46 +U_CAPI UBool U_EXPORT2
    1.47 +utext_moveIndex32(UText *ut, int32_t delta) {
    1.48 +    UChar32  c;
    1.49 +    if (delta > 0) {
    1.50 +        do {
    1.51 +            if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
    1.52 +                return FALSE;
    1.53 +            }
    1.54 +            c = ut->chunkContents[ut->chunkOffset];
    1.55 +            if (U16_IS_SURROGATE(c)) {
    1.56 +                c = utext_next32(ut);   // surrogate: delegate the advance to utext_next32()
    1.57 +                if (c == U_SENTINEL) {
    1.58 +                    return FALSE;
    1.59 +                }
    1.60 +            } else {
    1.61 +                ut->chunkOffset++;      // non-surrogate: one code unit is one code point
    1.62 +            }
    1.63 +        } while(--delta>0);
    1.64 +
    1.65 +    } else if (delta<0) {
    1.66 +        do {
    1.67 +            if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
    1.68 +                return FALSE;
    1.69 +            }
    1.70 +            c = ut->chunkContents[ut->chunkOffset-1];
    1.71 +            if (U16_IS_SURROGATE(c)) {
    1.72 +                c = utext_previous32(ut);   // surrogate: delegate the retreat to utext_previous32()
    1.73 +                if (c == U_SENTINEL) {
    1.74 +                    return FALSE;
    1.75 +                }
    1.76 +            } else {
    1.77 +                ut->chunkOffset--;
    1.78 +            }
    1.79 +        } while(++delta<0);
    1.80 +    }
    1.81 +
    1.82 +    return TRUE;
    1.83 +}
    1.84 +// Return the length reported by the provider's nativeLength function for this UText.
    1.85 +// No caching or validation is done here.
    1.86 +U_CAPI int64_t U_EXPORT2
    1.87 +utext_nativeLength(UText *ut) {
    1.88 +    return ut->pFuncs->nativeLength(ut);
    1.89 +}
    1.90 +// Report whether the provider has flagged length computation as expensive,
    1.91 +// i.e. whether the UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE property bit is set.
    1.92 +U_CAPI UBool U_EXPORT2
    1.93 +utext_isLengthExpensive(const UText *ut) {
    1.94 +    const int32_t expensiveBit = I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
    1.95 +    return (UBool)((ut->providerProperties & expensiveBit) != 0);
    1.96 +}
    1.97 +// Return the native index of the current iteration position.  Chunk offsets at or below
    1.98 +// nativeIndexingLimit are simply added to chunkNativeStart; larger ones are mapped by the provider.
    1.99 +U_CAPI int64_t U_EXPORT2
   1.100 +utext_getNativeIndex(const UText *ut) {
   1.101 +    const int32_t offset = ut->chunkOffset;
   1.102 +    if (offset > ut->nativeIndexingLimit) {
   1.103 +        return ut->pFuncs->mapOffsetToNative(ut);
   1.104 +    }
   1.105 +    return ut->chunkNativeStart + offset;
   1.106 +}
   1.107 +// Set the iteration position to the given native index, loading a different chunk if needed.
   1.108 +// A position on the trail half of a surrogate pair is moved back to the lead surrogate.
   1.109 +U_CAPI void U_EXPORT2
   1.110 +utext_setNativeIndex(UText *ut, int64_t index) {
   1.111 +    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
   1.112 +        // The desired position is outside of the current chunk.
   1.113 +        // Access the new position.  Assume a forward iteration from here,
   1.114 +        // which will also be optimum for a single random access.
   1.115 +        // Reverse iterations may suffer slightly.
   1.116 +        ut->pFuncs->access(ut, index, TRUE);   // the return value is not checked here
   1.117 +    } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
   1.118 +        // utf-16 indexing.
   1.119 +        ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
   1.120 +    } else {
   1.121 +         ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   1.122 +    }
   1.123 +    // The convention is that the index must always be on a code point boundary.
   1.124 +    // Adjust the index position if it is in the middle of a surrogate pair.
   1.125 +    if (ut->chunkOffset<ut->chunkLength) {
   1.126 +        UChar c= ut->chunkContents[ut->chunkOffset];
   1.127 +        if (U16_IS_TRAIL(c)) {
   1.128 +            if (ut->chunkOffset==0) {
   1.129 +                ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);   // trail is first in chunk: ask for the preceding text
   1.130 +            }
   1.131 +            if (ut->chunkOffset>0) {
   1.132 +                UChar lead = ut->chunkContents[ut->chunkOffset-1];
   1.133 +                if (U16_IS_LEAD(lead)) {
   1.134 +                    ut->chunkOffset--;
   1.135 +                }
   1.136 +            }
   1.137 +        }
   1.138 +    }
   1.139 +}
   1.140 +
   1.141 +// Return the native index of the code point preceding the current position (0 at the very
   1.142 +// start of the text).  The fast path restores chunkOffset; the slow path steps back, then forward.
   1.143 +U_CAPI int64_t U_EXPORT2
   1.144 +utext_getPreviousNativeIndex(UText *ut) {
   1.145 +    //
   1.146 +    //  Fast-path the common case.
   1.147 +    //     Common means current position is not at the beginning of a chunk
   1.148 +    //     and the preceding character is not supplementary.
   1.149 +    //
   1.150 +    int32_t i = ut->chunkOffset - 1;
   1.151 +    int64_t result;
   1.152 +    if (i >= 0) {
   1.153 +        UChar c = ut->chunkContents[i];
   1.154 +        if (U16_IS_TRAIL(c) == FALSE) {
   1.155 +            if (i <= ut->nativeIndexingLimit) {
   1.156 +                result = ut->chunkNativeStart + i;
   1.157 +            } else {
   1.158 +                ut->chunkOffset = i;        // temporarily step back so the provider maps offset i
   1.159 +                result = ut->pFuncs->mapOffsetToNative(ut);
   1.160 +                ut->chunkOffset++;          // restore the original offset
   1.161 +            }
   1.162 +            return result;
   1.163 +        }
   1.164 +    }
   1.165 +
   1.166 +    // If at the start of text, simply return 0.
   1.167 +    if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
   1.168 +        return 0;
   1.169 +    }
   1.170 +
   1.171 +    // Harder, less common cases.  We are at a chunk boundary, or on a surrogate.
   1.172 +    //    Keep it simple, use other functions to handle the edges.
   1.173 +    //
   1.174 +    utext_previous32(ut);
   1.175 +    result = UTEXT_GETNATIVEINDEX(ut);
   1.176 +    utext_next32(ut);
   1.177 +    return result;
   1.178 +}
   1.179 +
   1.180 +//
   1.181 +//  utext_current32.  Get the UChar32 at the current position; the position is not advanced.
   1.182 +//                    Returns U_SENTINEL when positioned off the end of the text.
   1.183 +//                    UText iteration position is always on a code point boundary,
   1.184 +//                    never on the trail half of a surrogate pair.
   1.185 +//
   1.186 +U_CAPI UChar32 U_EXPORT2
   1.187 +utext_current32(UText *ut) {
   1.188 +    UChar32  c;
   1.189 +    if (ut->chunkOffset==ut->chunkLength) {
   1.190 +        // Current position is just off the end of the chunk.
   1.191 +        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   1.192 +            // Off the end of the text.
   1.193 +            return U_SENTINEL;
   1.194 +        }
   1.195 +    }
   1.196 +
   1.197 +    c = ut->chunkContents[ut->chunkOffset];
   1.198 +    if (U16_IS_LEAD(c) == FALSE) {
   1.199 +        // Normal, non-supplementary case.
   1.200 +        return c;
   1.201 +    }
   1.202 +
   1.203 +    //
   1.204 +    //  Possible supplementary char.
   1.205 +    //
   1.206 +    UChar32   trail = 0;
   1.207 +    UChar32   supplementaryC = c;
   1.208 +    if ((ut->chunkOffset+1) < ut->chunkLength) {
   1.209 +        // The trail surrogate is in the same chunk.
   1.210 +        trail = ut->chunkContents[ut->chunkOffset+1];
   1.211 +    } else {
   1.212 +        //  The trail surrogate is in a different chunk.
   1.213 +        //     Because we must maintain the iteration position, we need to switch forward
   1.214 +        //     into the new chunk, get the trail surrogate, then revert the chunk back to the
   1.215 +        //     original one.
   1.216 +        //     An edge case to be careful of:  the entire text may end with an unpaired
   1.217 +        //        leading surrogate.  The attempt to access the trail will fail, but
   1.218 +        //        the original position before the unpaired lead still needs to be restored.
   1.219 +        int64_t  nativePosition = ut->chunkNativeLimit;
   1.220 +        int32_t  originalOffset = ut->chunkOffset;
   1.221 +        if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
   1.222 +            trail = ut->chunkContents[ut->chunkOffset];
   1.223 +        }
   1.224 +        UBool r = ut->pFuncs->access(ut, nativePosition, FALSE);  // reverse iteration flag loads preceding chunk
   1.225 +        U_ASSERT(r==TRUE);
   1.226 +        ut->chunkOffset = originalOffset;
   1.227 +        if(!r) {
   1.228 +            return U_SENTINEL;   // the original chunk could not be restored
   1.229 +        }
   1.230 +    }
   1.231 +
   1.232 +    if (U16_IS_TRAIL(trail)) {
   1.233 +        supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
   1.234 +    }
   1.235 +    return supplementaryC;   // still the lone lead surrogate if no trail followed it
   1.236 +
   1.237 +}
   1.238 +// Return the code point at nativeIndex and set the iteration position to that index.
   1.239 +// U_SENTINEL is returned when no code unit is available at the resulting position.
   1.240 +U_CAPI UChar32 U_EXPORT2
   1.241 +utext_char32At(UText *ut, int64_t nativeIndex) {
   1.242 +    UChar32 c = U_SENTINEL;
   1.243 +
   1.244 +    // Fast path the common case.
   1.245 +    if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
   1.246 +        ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
   1.247 +        c = ut->chunkContents[ut->chunkOffset];
   1.248 +        if (U16_IS_SURROGATE(c) == FALSE) {
   1.249 +            return c;
   1.250 +        }
   1.251 +    }
   1.252 +
   1.253 +    // Slow path: index outside the directly indexable part of the chunk, or a surrogate found there.
   1.254 +    utext_setNativeIndex(ut, nativeIndex);
   1.255 +    if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
   1.256 +        c = ut->chunkContents[ut->chunkOffset];
   1.257 +        if (U16_IS_SURROGATE(c)) {
   1.258 +            // For surrogates, let current32() deal with the complications
   1.259 +            //    of supplementaries that may span chunk boundaries.
   1.260 +            c = utext_current32(ut);
   1.261 +        }
   1.262 +    }
   1.263 +    return c;
   1.264 +}
   1.265 +// Return the code point at the current position and advance past it.  Returns U_SENTINEL
   1.266 +// when the position is at the end of the chunk and no further chunk can be accessed.
   1.267 +U_CAPI UChar32 U_EXPORT2
   1.268 +utext_next32(UText *ut) {
   1.269 +    UChar32       c;
   1.270 +
   1.271 +    if (ut->chunkOffset >= ut->chunkLength) {
   1.272 +        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   1.273 +            return U_SENTINEL;
   1.274 +        }
   1.275 +    }
   1.276 +
   1.277 +    c = ut->chunkContents[ut->chunkOffset++];
   1.278 +    if (U16_IS_LEAD(c) == FALSE) {
   1.279 +        // Normal case, not supplementary.
   1.280 +        //   (A trail surrogate seen here is just returned as is, as a surrogate value.
   1.281 +        //    It cannot be part of a pair.)
   1.282 +        return c;
   1.283 +    }
   1.284 +
   1.285 +    if (ut->chunkOffset >= ut->chunkLength) {
   1.286 +        if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   1.287 +            // c is an unpaired lead surrogate at the end of the text.
   1.288 +            // return it as it is.
   1.289 +            return c;
   1.290 +        }
   1.291 +    }
   1.292 +    UChar32 trail = ut->chunkContents[ut->chunkOffset];
   1.293 +    if (U16_IS_TRAIL(trail) == FALSE) {
   1.294 +        // c was an unpaired lead surrogate, not at the end of the text.
   1.295 +        // return it as it is (unpaired).  Iteration position is on the
   1.296 +        // following character, possibly in the next chunk, where the
   1.297 +        //  trail surrogate would have been if it had existed.
   1.298 +        return c;
   1.299 +    }
   1.300 +
   1.301 +    UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
   1.302 +    ut->chunkOffset++;   // move iteration position over the trail surrogate.
   1.303 +    return supplementary;
   1.304 +    }
   1.305 +// Move back to the start of the preceding code point and return it.  Returns U_SENTINEL
   1.306 +// when the position is at the start of the chunk and no preceding chunk can be accessed.
   1.307 +U_CAPI UChar32 U_EXPORT2
   1.308 +utext_previous32(UText *ut) {
   1.309 +    UChar32       c;
   1.310 +
   1.311 +    if (ut->chunkOffset <= 0) {
   1.312 +        if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
   1.313 +            return U_SENTINEL;
   1.314 +        }
   1.315 +    }
   1.316 +    ut->chunkOffset--;
   1.317 +    c = ut->chunkContents[ut->chunkOffset];
   1.318 +    if (U16_IS_TRAIL(c) == FALSE) {
   1.319 +        // Normal case, not supplementary.
   1.320 +        //   (A lead surrogate seen here is just returned as is, as a surrogate value.
   1.321 +        //    It cannot be part of a pair.)
   1.322 +        return c;
   1.323 +    }
   1.324 +
   1.325 +    if (ut->chunkOffset <= 0) {
   1.326 +        if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
   1.327 +            // c is an unpaired trail surrogate at the start of the text.
   1.328 +            // return it as it is.
   1.329 +            return c;
   1.330 +        }
   1.331 +    }
   1.332 +
   1.333 +    UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
   1.334 +    if (U16_IS_LEAD(lead) == FALSE) {
   1.335 +        // c was an unpaired trail surrogate, not at the start of the text.
   1.336 +        // return it as it is (unpaired).  Iteration position is at c
   1.337 +        return c;
   1.338 +    }
   1.339 +
   1.340 +    UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
   1.341 +    ut->chunkOffset--;   // move iteration position over the lead surrogate.
   1.342 +    return supplementary;
   1.343 +}
   1.344 +
   1.345 +// Set the iteration position to index, then return the code point there and advance past it.
   1.346 +// Returns U_SENTINEL if index is outside the current chunk and the provider's access fails.
   1.347 +U_CAPI UChar32 U_EXPORT2
   1.348 +utext_next32From(UText *ut, int64_t index) {
   1.349 +    UChar32       c      = U_SENTINEL;
   1.350 +
   1.351 +    if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
   1.352 +        // Desired position is outside of the current chunk.
   1.353 +        if(!ut->pFuncs->access(ut, index, TRUE)) {
   1.354 +            // no chunk available here
   1.355 +            return U_SENTINEL;
   1.356 +        }
   1.357 +    } else if (index - ut->chunkNativeStart  <= (int64_t)ut->nativeIndexingLimit) {
   1.358 +        // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
   1.359 +        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
   1.360 +    } else {
   1.361 +        // Desired position is in chunk, with non-UTF16 indexing.
   1.362 +        ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   1.363 +    }
   1.364 +
   1.365 +    c = ut->chunkContents[ut->chunkOffset++];
   1.366 +    if (U16_IS_SURROGATE(c)) {
   1.367 +        // Surrogates.  Many edge cases.  Use other functions that already
   1.368 +        //              deal with the problems.
   1.369 +        utext_setNativeIndex(ut, index);
   1.370 +        c = utext_next32(ut);
   1.371 +    }
   1.372 +    return c;
   1.373 +}
   1.374 +// Set the iteration position to index, then move back over the preceding code point and return it.
   1.375 +// Returns U_SENTINEL when the provider's access function reports no chunk for the position.
   1.376 +U_CAPI UChar32 U_EXPORT2
   1.377 +utext_previous32From(UText *ut, int64_t index) {
   1.378 +    //
   1.379 +    //  Return the character preceding the specified index.
   1.380 +    //  Leave the iteration position at the start of the character that was returned.
   1.381 +    //
   1.382 +    UChar32     cPrev;    // The character preceding index, which is what we will return.
   1.383 +
   1.384 +    // Address the chunk containing the position preceding the incoming index
   1.385 +    // A tricky edge case:
   1.386 +    //   We try to test the requested native index against the chunkNativeStart to determine
   1.387 +    //    whether the character preceding the one at the index is in the current chunk.
   1.388 +    //    BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
   1.389 +    //    requested index is on something other than the first position of the first char.
   1.390 +    //
   1.391 +    if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
   1.392 +        // Requested native index is outside of the current chunk.
   1.393 +        if(!ut->pFuncs->access(ut, index, FALSE)) {
   1.394 +            // no chunk available here
   1.395 +            return U_SENTINEL;
   1.396 +        }
   1.397 +    } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
   1.398 +        // Direct UTF-16 indexing.
   1.399 +        ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
   1.400 +    } else {
   1.401 +        ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   1.402 +        if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
   1.403 +            // no chunk available here
   1.404 +            return U_SENTINEL;
   1.405 +        }
   1.406 +    }
   1.407 +
   1.408 +    //
   1.409 +    // Simple case with no surrogates.
   1.410 +    //
   1.411 +    ut->chunkOffset--;
   1.412 +    cPrev = ut->chunkContents[ut->chunkOffset];
   1.413 +
   1.414 +    if (U16_IS_SURROGATE(cPrev)) {
   1.415 +        // Possible supplementary.  Many edge cases.
   1.416 +        // Let other functions do the heavy lifting.
   1.417 +        utext_setNativeIndex(ut, index);
   1.418 +        cPrev = utext_previous32(ut);
   1.419 +    }
   1.420 +    return cPrev;
   1.421 +}
   1.422 +// Hand an extract request straight to the provider's extract function and
   1.423 +// return whatever it returns; all arguments are passed through unchanged.
   1.424 +U_CAPI int32_t U_EXPORT2
   1.425 +utext_extract(UText *ut,
   1.426 +              int64_t start, int64_t limit,
   1.427 +              UChar *dest, int32_t destCapacity,
   1.428 +              UErrorCode *status) {
   1.429 +    return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
   1.430 +}
   1.431 +
   1.432 +// Two UTexts are equal when both are valid, use the same provider function table,
   1.433 +// refer to the same context, and are positioned at the same native index.
   1.434 +U_CAPI UBool U_EXPORT2
   1.435 +utext_equals(const UText *a, const UText *b) {
   1.436 +    // Null or invalid arguments don't compare equal to anything.
   1.437 +    const UBool bothValid = a!=NULL && b!=NULL &&
   1.438 +                            a->magic == UTEXT_MAGIC &&
   1.439 +                            b->magic == UTEXT_MAGIC;
   1.440 +    if (!bothValid) {
   1.441 +        return FALSE;
   1.442 +    }
   1.443 +
   1.444 +    // Different function tables mean different types of text providers.
   1.445 +    // Different contexts mean different sources (different strings).
   1.446 +    const UBool sameText = a->pFuncs == b->pFuncs &&
   1.447 +                           a->context == b->context;
   1.448 +    if (!sameText) {
   1.449 +        return FALSE;
   1.450 +    }
   1.451 +
   1.452 +    // Same text: equal only if the current position in the string
   1.453 +    // is the same for both.
   1.454 +    const int64_t indexA = utext_getNativeIndex(a);
   1.455 +    const int64_t indexB = utext_getNativeIndex(b);
   1.456 +    const UBool samePosition = (indexA == indexB);
   1.457 +    return samePosition;
   1.458 +}
   1.459 +// Report whether the UTEXT_PROVIDER_WRITABLE property bit is set for this UText.
   1.460 +U_CAPI UBool U_EXPORT2
   1.461 +utext_isWritable(const UText *ut)
   1.462 +{
   1.463 +    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
   1.464 +    return (UBool)((ut->providerProperties & writableBit) != 0);
   1.465 +}
   1.466 +// Make the UText read-only by clearing the UTEXT_PROVIDER_WRITABLE property bit.
   1.467 +// Nothing in this function sets the bit again.
   1.468 +U_CAPI void U_EXPORT2
   1.469 +utext_freeze(UText *ut) {
   1.470 +    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
   1.471 +    ut->providerProperties &= ~writableBit;
   1.472 +}
   1.473 +// Report whether the UTEXT_PROVIDER_HAS_META_DATA property bit is set
   1.474 +// for this UText.
   1.475 +U_CAPI UBool U_EXPORT2
   1.476 +utext_hasMetaData(const UText *ut)
   1.477 +{
   1.478 +    const int32_t metaDataBit = I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
   1.479 +    return (UBool)((ut->providerProperties & metaDataBit) != 0);
   1.480 +}
   1.481 +// Replace text through the provider's replace function and return its result.  Returns 0 without
   1.482 +// calling it if *status is already a failure, or (setting U_NO_WRITE_PERMISSION) if not writable.
   1.483 +U_CAPI int32_t U_EXPORT2
   1.484 +utext_replace(UText *ut,
   1.485 +              int64_t nativeStart, int64_t nativeLimit,
   1.486 +              const UChar *replacementText, int32_t replacementLength,
   1.487 +              UErrorCode *status)
   1.488 +{
   1.489 +    if (U_FAILURE(*status)) {
   1.490 +        return 0;
   1.491 +    }
   1.492 +    const int32_t writableBit = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
   1.493 +    if (ut->providerProperties & writableBit) {
   1.494 +        return ut->pFuncs->replace(ut, nativeStart, nativeLimit,
   1.495 +                                   replacementText, replacementLength, status);
   1.496 +    }
   1.497 +    *status = U_NO_WRITE_PERMISSION;
   1.498 +    return 0;
   1.499 +}
   1.500 +// Copy or move (per the move flag) text via the provider; a non-writable UText yields U_NO_WRITE_PERMISSION.
   1.501 +U_CAPI void U_EXPORT2
   1.502 +utext_copy(UText *ut,
   1.503 +           int64_t nativeStart, int64_t nativeLimit,
   1.504 +           int64_t destIndex,
   1.505 +           UBool move,
   1.506 +           UErrorCode *status)
   1.507 +{
   1.508 +    if (U_FAILURE(*status)) {
   1.509 +        return;
   1.510 +    }
   1.511 +    if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0) {
   1.512 +        ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
   1.513 +    } else {
   1.514 +        *status = U_NO_WRITE_PERMISSION;
   1.515 +    }
   1.516 +}
   1.517 +// Clone src through its provider's clone function; if readOnly is set, freeze the clone.
   1.518 +// The provider's return value and *status are passed back to the caller unchanged.
   1.519 +// A failed clone may return NULL, so the result is only frozen after a successful clone.
   1.520 +U_CAPI UText * U_EXPORT2
   1.521 +utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
   1.522 +    UText *result = src->pFuncs->clone(dest, src, deep, status);
   1.523 +    // Never pass a NULL or failed clone to utext_freeze(): it dereferences its argument.
   1.524 +    if (readOnly && result != NULL && U_SUCCESS(*status)) {
   1.525 +        utext_freeze(result);
   1.526 +    }
   1.527 +    return result;
   1.528 +}
   1.529 +
   1.530 +
   1.531 +
   1.532 +//------------------------------------------------------------------------------
   1.533 +//
   1.534 +//   UText common functions implementation
   1.535 +//
   1.536 +//------------------------------------------------------------------------------
   1.537 +
   1.538 +//
   1.539 +//  UText.flags bit definitions.  Each enumerator is a one-bit mask; "1"/"0" below
   1.540 +//    refer to that bit being set/clear in ut->flags.
   1.541 +enum {
   1.542 +    UTEXT_HEAP_ALLOCATED  = 1,      //  1 if ICU has allocated this UText struct on the heap.
   1.543 +                                    //  0 if caller provided storage for the UText.
   1.544 +
   1.545 +    UTEXT_EXTRA_HEAP_ALLOCATED = 2, //  1 if ICU has allocated extra storage as a separate
   1.546 +                                    //     heap block.
   1.547 +                                    //  0 if there is no separate allocation.  Either no extra
   1.548 +                                    //     storage was requested, or it is appended to the end
   1.549 +                                    //     of the main UText storage.
   1.550 +
   1.551 +    UTEXT_OPEN = 4                  //  1 if this UText is currently open
   1.552 +                                    //  0 if this UText is not open.
   1.553 +};
   1.554 +
   1.555 +
   1.556 +//
   1.557 +//  Extended form of a UText.  The purpose is to aid in computing the total size required
   1.558 +//    when a provider asks for a UText to be allocated with extra storage.
   1.559 +//    utext_setup() points pExtra at 'extension' when it heap-allocates the UText itself.
   1.560 +struct ExtendedUText {
   1.561 +    UText          ut;          // the UText proper; must come first so a UText* can be cast
   1.562 +    UAlignedMemory extension;   // marks where the appended extra storage begins
   1.563 +};
   1.564 +// Pristine UText value; utext_setup() copies it into each UText struct it heap-allocates.
   1.565 +static const UText emptyText = UTEXT_INITIALIZER;
   1.566 +// Allocate (ut==NULL) or recycle a UText with at least extraSpace bytes of zeroed pExtra storage, and reset its fields.
   1.567 +U_CAPI UText * U_EXPORT2
   1.568 +utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
   1.569 +    if (U_FAILURE(*status)) {
   1.570 +        return ut;      // incoming failure: do nothing
   1.571 +    }
   1.572 +
   1.573 +    if (ut == NULL) {
   1.574 +        // We need to heap-allocate storage for the new UText
   1.575 +        int32_t spaceRequired = sizeof(UText);
   1.576 +        if (extraSpace > 0) {
   1.577 +            spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
   1.578 +        }
   1.579 +        ut = (UText *)uprv_malloc(spaceRequired);
   1.580 +        if (ut == NULL) {
   1.581 +            *status = U_MEMORY_ALLOCATION_ERROR;
   1.582 +            return NULL;
   1.583 +        } else {
   1.584 +            *ut = emptyText;
   1.585 +            ut->flags |= UTEXT_HEAP_ALLOCATED;
   1.586 +            if (extraSpace > 0) {   // only then was room for the extension allocated above
   1.587 +                ut->extraSize = extraSpace;
   1.588 +                ut->pExtra    = &((ExtendedUText *)ut)->extension;
   1.589 +            }
   1.590 +        }
   1.591 +    } else {
   1.592 +        // We have been supplied with an already existing UText.
   1.593 +        // Verify that it really appears to be a UText.
   1.594 +        if (ut->magic != UTEXT_MAGIC) {
   1.595 +            *status = U_ILLEGAL_ARGUMENT_ERROR;
   1.596 +            return ut;
   1.597 +        }
   1.598 +        // If the ut is already open and there's a provider supplied close
   1.599 +        //   function, call it.
   1.600 +        if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL)  {
   1.601 +            ut->pFuncs->close(ut);
   1.602 +        }
   1.603 +        ut->flags &= ~UTEXT_OPEN;
   1.604 +
   1.605 +        // If extra space was requested by our caller, check whether
   1.606 +        //   sufficient already exists, and allocate new if needed.
   1.607 +        if (extraSpace > ut->extraSize) {
   1.608 +            // Need more space.  Allocate the new block first, so that a failed
   1.609 +            //   allocation leaves pExtra, extraSize and flags consistent.
   1.610 +            void *newExtra = uprv_malloc(extraSpace);
   1.611 +            if (newExtra == NULL) {
   1.612 +                *status = U_MEMORY_ALLOCATION_ERROR;
   1.613 +            } else {
   1.614 +                if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
   1.615 +                    uprv_free(ut->pExtra);      // only a separately allocated block may be freed
   1.616 +                }
   1.617 +                ut->pExtra    = newExtra;
   1.618 +                ut->extraSize = extraSpace;
   1.619 +                ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
   1.620 +            }
   1.621 +        }
   1.622 +    }
   1.623 +    if (U_SUCCESS(*status)) {
   1.624 +        ut->flags |= UTEXT_OPEN;
   1.625 +
   1.626 +        // Initialize all remaining fields of the UText.
   1.627 +        //
   1.628 +        ut->context             = NULL;
   1.629 +        ut->chunkContents       = NULL;
   1.630 +        ut->p                   = NULL;
   1.631 +        ut->q                   = NULL;
   1.632 +        ut->r                   = NULL;
   1.633 +        ut->a                   = 0;
   1.634 +        ut->b                   = 0;
   1.635 +        ut->c                   = 0;
   1.636 +        ut->chunkOffset         = 0;
   1.637 +        ut->chunkLength         = 0;
   1.638 +        ut->chunkNativeStart    = 0;
   1.639 +        ut->chunkNativeLimit    = 0;
   1.640 +        ut->nativeIndexingLimit = 0;
   1.641 +        ut->providerProperties  = 0;
   1.642 +        ut->privA               = 0;
   1.643 +        ut->privB               = 0;
   1.644 +        ut->privC               = 0;
   1.645 +        ut->privP               = NULL;
   1.646 +        if (ut->pExtra!=NULL && ut->extraSize>0) {
   1.647 +            uprv_memset(ut->pExtra, 0, ut->extraSize);
   1.648 +        }
   1.649 +    }
   1.650 +    return ut;
   1.651 +}
   1.652 +// Close a UText: run the provider's close function, then free any storage flagged as allocated by the framework.
   1.653 +// Returns NULL if the UText struct itself was freed, otherwise ut (unchanged if it was not an open UText).
   1.654 +U_CAPI UText * U_EXPORT2
   1.655 +utext_close(UText *ut) {
   1.656 +    if (ut==NULL ||
   1.657 +        ut->magic != UTEXT_MAGIC ||
   1.658 +        (ut->flags & UTEXT_OPEN) == 0)
   1.659 +    {
   1.660 +        // The supplied ut is not an open UText.
   1.661 +        // Do nothing.
   1.662 +        return ut;
   1.663 +    }
   1.664 +
   1.665 +    // If the provider gave us a close function, call it now.
   1.666 +    // This will clean up anything allocated specifically by the provider.
   1.667 +    if (ut->pFuncs->close != NULL) {
   1.668 +        ut->pFuncs->close(ut);
   1.669 +    }
   1.670 +    ut->flags &= ~UTEXT_OPEN;
   1.671 +
   1.672 +    // If we (the framework) allocated the UText or subsidiary storage,
   1.673 +    //   delete it.
   1.674 +    if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
   1.675 +        uprv_free(ut->pExtra);
   1.676 +        ut->pExtra = NULL;
   1.677 +        ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
   1.678 +        ut->extraSize = 0;
   1.679 +    }
   1.680 +
   1.681 +    // Zero out function table of the closed UText.  This is a defensive move,
   1.682 +    //   intended to cause applications that inadvertently use a closed
   1.683 +    //   utext to crash with null pointer errors.
   1.684 +    ut->pFuncs        = NULL;
   1.685 +
   1.686 +    if (ut->flags & UTEXT_HEAP_ALLOCATED) {
   1.687 +        // This UText was allocated by UText setup.  We need to free it.
   1.688 +        // Clear magic, so we can detect if the user messes up and immediately
   1.689 +        //  tries to reopen another UText using the deleted storage.
   1.690 +        ut->magic = 0;
   1.691 +        uprv_free(ut);
   1.692 +        ut = NULL;
   1.693 +    }
   1.694 +    return ut;
   1.695 +}
   1.696 +
   1.697 +
   1.698 +
   1.699 +
   1.700 +//
   1.701 +// invalidateChunk   Empty the current chunk description so that the next call
   1.702 +//                   to access has to load fresh data.
   1.703 +//                   Needed after copy/move/replace have worked directly on the
   1.704 +//                   backing text, which may leave the cached chunk contents
   1.705 +//                   out of sync with it.
   1.706 +//
   1.707 +static void
   1.708 +invalidateChunk(UText *ut) {
   1.709 +    ut->chunkNativeStart    = 0;
   1.710 +    ut->chunkNativeLimit    = 0;
   1.711 +    ut->chunkLength         = 0;
   1.712 +    ut->chunkOffset         = 0;
   1.713 +    ut->nativeIndexingLimit = 0;
   1.714 +}
   1.715 +
   1.716 +//
   1.717 +// pinIndex        Pin a native index parameter: negative values become 0,
   1.718 +//                 values above limit become limit.  The 64 bit index is
   1.719 +//                 updated in place; the pinned value truncated to 32 bits is
   1.720 +//                 returned as a convenience for providers that don't need 64 bits.
   1.721 +static int32_t
   1.722 +pinIndex(int64_t &index, int64_t limit) {
   1.723 +    if (index < 0) {
   1.724 +        index = 0;
   1.725 +        return 0;
   1.726 +    }
   1.727 +    index = (index > limit) ? limit : index;
   1.728 +    return (int32_t)index;
   1.729 +}
   1.730 +
   1.731 +
   1.732 +U_CDECL_BEGIN
   1.733 +
   1.734 +//
   1.735 +// Pointer relocation function,
   1.736 +//   a utility used by shallow clone.
   1.737 +//   Adjust a pointer that refers to something within one UText (the source)
   1.738 +//   to refer to the same relative offset within another UText (the target).
   1.739 +//   A pointer into neither the source struct nor its pExtra storage is left unchanged.
   1.740 +static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
   1.741 +    // convert all pointers to (char *) so that byte address arithmetic will work.
   1.742 +    char  *dptr = (char *)*destPtr;
   1.743 +    char  *dUText = (char *)dest;
   1.744 +    char  *sUText = (char *)src;
   1.745 +
   1.746 +    if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
   1.747 +        // target ptr was to something within the src UText's pExtra storage.
   1.748 +        //   relocate it into the target UText's pExtra region.
   1.749 +        *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
   1.750 +    } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
   1.751 +        // target ptr was pointing to somewhere within the source UText itself.
   1.752 +        //   Move it to the same offset within the target UText.
   1.753 +        *destPtr = dUText + (dptr-sUText);
   1.754 +    }
   1.755 +}
   1.756 +
   1.757 +
   1.758 +//
   1.759 +//  Clone.  This is a generic copy-the-utext-by-value clone function that can be
   1.760 +//          used as-is with some utext types, and as a helper by other clones.
   1.761 +//          Returns NULL if *status already indicates failure on entry.
   1.762 +static UText * U_CALLCONV
   1.763 +shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
   1.764 +    if (U_FAILURE(*status)) {
   1.765 +        return NULL;
   1.766 +    }
   1.767 +    int32_t  srcExtraSize = src->extraSize;
   1.768 +
   1.769 +    //
   1.770 +    // Use the generic utext_setup to allocate storage if required.
   1.771 +    //
   1.772 +    dest = utext_setup(dest, srcExtraSize, status);
   1.773 +    if (U_FAILURE(*status)) {
   1.774 +        return dest;     // whatever utext_setup returned is handed back on failure
   1.775 +    }
   1.776 +
   1.777 +    //
   1.778 +    //  flags (how the UText was allocated) and the pointer to the
   1.779 +    //   extra storage must retain the values in the cloned utext that
   1.780 +    //   were set up by utext_setup.  Save them separately before
   1.781 +    //   copying the whole struct.
   1.782 +    //
   1.783 +    void *destExtra = dest->pExtra;
   1.784 +    int32_t flags   = dest->flags;
   1.785 +
   1.786 +
   1.787 +    //
   1.788 +    //  Copy the whole UText struct by value.
   1.789 +    //  Any "Extra" storage is copied also.
   1.790 +    //  The struct copy is limited to the smaller of the two sizeOfStruct values.
   1.791 +    int sizeToCopy = src->sizeOfStruct;
   1.792 +    if (sizeToCopy > dest->sizeOfStruct) {
   1.793 +        sizeToCopy = dest->sizeOfStruct;
   1.794 +    }
   1.795 +    uprv_memcpy(dest, src, sizeToCopy);
   1.796 +    dest->pExtra = destExtra;     // restore the values saved above, which the memcpy overwrote
   1.797 +    dest->flags  = flags;
   1.798 +    if (srcExtraSize > 0) {
   1.799 +        uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
   1.800 +    }
   1.801 +
   1.802 +    //
   1.803 +    // Relocate any pointers in the target that refer to the UText itself
   1.804 +    //   to point to the cloned copy rather than the original source.
   1.805 +    //   Pointers to anything outside the source UText and its pExtra are not changed.
   1.806 +    adjustPointer(dest, &dest->context, src);
   1.807 +    adjustPointer(dest, &dest->p, src);
   1.808 +    adjustPointer(dest, &dest->q, src);
   1.809 +    adjustPointer(dest, &dest->r, src);
   1.810 +    adjustPointer(dest, (const void **)&dest->chunkContents, src);
   1.811 +
   1.812 +    return dest;
   1.813 +}
   1.814 +
   1.815 +
   1.816 +U_CDECL_END
   1.817 +
   1.818 +
   1.819 +
   1.820 +//------------------------------------------------------------------------------
   1.821 +//
   1.822 +//     UText implementation for UTF-8 char * strings (read-only)
   1.823 +//     Limitation:  string length must be <= 0x7fffffff.
   1.824 +//                  (length must fit in an int32_t variable)
   1.825 +//
   1.826 +//         Use of UText data members:
   1.827 +//              context    pointer to UTF-8 string
   1.828 +//              utext.b    is the input string length (bytes).
   1.829 +//              utext.c    Length scanned so far in string
   1.830 +//                           (for optimizing finding length of zero terminated strings.)
   1.831 +//              utext.p    pointer to the current buffer
   1.832 +//              utext.q    pointer to the other buffer.
   1.833 +//
   1.834 +//------------------------------------------------------------------------------
   1.835 +
   1.836 +// Chunk size.
   1.837 +//     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
   1.838 +//     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
   1.839 +//     to two UChars.)
   1.840 +//     NOTE(review): presumably the bound comes from the maps storing offsets as uint8_t - confirm.
   1.841 +enum { UTF8_TEXT_CHUNK_SIZE=32 };
   1.842 +
   1.843 +//
   1.844 +// UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
   1.845 +//          Each contains the UChar chunk buffer, the to and from native maps, and
   1.846 +//          header info.
   1.847 +//
   1.848 +//     Because backwards iteration fills the buffers starting at the end and
   1.849 +//     working towards the front, the filled part of the buffers may not begin
   1.850 +//     at the start of the available storage for the buffers.
   1.851 +//
   1.852 +//     Buffer size is one bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
   1.853 +//     the last character added being a supplementary, and thus requiring a surrogate
   1.854 +//     pair.  Doing this is simpler than checking for the edge case.
   1.855 +//     NOTE(review): buf and mapToNative are actually declared with +4 slots; see the field comments.
   1.856 +
   1.857 +struct UTF8Buf {
   1.858 +    int32_t   bufNativeStart;                        // Native index of first char in UChar buf
   1.859 +    int32_t   bufNativeLimit;                        // Native index following last char in buf.
   1.860 +    int32_t   bufStartIdx;                           // First filled position in buf.
   1.861 +    int32_t   bufLimitIdx;                           // Limit of filled range in buf.
   1.862 +    int32_t   bufNILimit;                            // Limit of native indexing part of buf
   1.863 +    int32_t   toUCharsMapStart;                      // Native index corresponding to
   1.864 +                                                     //   mapToUChars[0].
   1.865 +                                                     //   Set to bufNativeStart when filling forwards.
   1.866 +                                                     //   Set to computed value when filling backwards.
   1.867 +
   1.868 +    UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires one extra position beyond
   1.869 +                                                     //   the chunk size, to allow for surrogate at the end.
   1.870 +                                                     //   Length must be identical to mapToNative array, below,
   1.871 +                                                     //   because of the way indexing works when the array is
   1.872 +                                                     //   filled backwards during a reverse iteration.  Thus,
   1.873 +                                                     //   the additional extra size.
   1.874 +    uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map UChar index in buf to
   1.875 +                                                     //  native offset from bufNativeStart.
   1.876 +                                                     //  Requires two extra slots,
   1.877 +                                                     //    one for a supplementary starting in the last normal position,
   1.878 +                                                     //    and one for an entry for the buffer limit position.
   1.879 +    uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from toUCharsMapStart to
   1.880 +                                                     //   corresponding index in buf (bufStartIdx is subtracted on use).
   1.881 +    int32_t   align;                                 // NOTE(review): not referenced in the code shown; presumably padding - confirm.
   1.882 +};
   1.883 +
   1.884 +U_CDECL_BEGIN
   1.885 +
   1.886 +//
   1.887 +//   utf8TextLength
   1.888 +//
   1.889 +//        Get the length of the string.  If we don't already know it,
   1.890 +//              we'll need to scan for the trailing  nul.
   1.891 +//        The result is cached in ut->b; a negative ut->b means "not yet known".
   1.892 +static int64_t U_CALLCONV
   1.893 +utf8TextLength(UText *ut) {
   1.894 +    if (ut->b < 0) {
   1.895 +        // Zero terminated string, and we haven't scanned to the end yet.
   1.896 +        // Scan it now, resuming from ut->c (the length scanned so far).
   1.897 +        const char *r = (const char *)ut->context + ut->c;
   1.898 +        while (*r != 0) {
   1.899 +            r++;
   1.900 +        }
   1.901 +        if ((r - (const char *)ut->context) < 0x7fffffff) {
   1.902 +            ut->b = (int32_t)(r - (const char *)ut->context);
   1.903 +        } else {
   1.904 +            // Actual string was bigger (more than 2 gig) than we
   1.905 +            //   can handle.  Clip it to 2 GB.
   1.906 +            ut->b = 0x7fffffff;
   1.907 +        }
   1.908 +        ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);   // length is now known
   1.909 +    }
   1.910 +    return ut->b;
   1.911 +}
   1.912 +
   1.913 +
   1.914 +
   1.915 +
   1.916 +
   1.917 +
   1.918 +static UBool U_CALLCONV
   1.919 +utf8TextAccess(UText *ut, int64_t index, UBool forward) {
   1.920 +    //  Make a chunk covering native index 'index' current and set ut->chunkOffset for it.
   1.921 +    //  Apologies to those who are allergic to goto statements.
   1.922 +    //    Consider each goto to a labelled block to be the equivalent of
   1.923 +    //         call the named block as if it were a function();
   1.924 +    //         return;
   1.925 +    //  FALSE is returned by the start/end-of-string paths below; every other path returns TRUE.
   1.926 +    const uint8_t *s8=(const uint8_t *)ut->context;
   1.927 +    UTF8Buf *u8b = NULL;
   1.928 +    int32_t  length = ut->b;         // Length of original utf-8 (negative if not yet known)
   1.929 +    int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
   1.930 +    int32_t  mapIndex = 0;
   1.931 +    if (index<0) {
   1.932 +        ix=0;
   1.933 +    } else if (index > 0x7fffffff) {
   1.934 +        // Strings with 64 bit lengths not supported by this UTF-8 provider.
   1.935 +        ix = 0x7fffffff;
   1.936 +    }
   1.937 +
   1.938 +    // Pin requested index to the string length.
   1.939 +    if (ix>length) {
   1.940 +        if (length>=0) {
   1.941 +            ix=length;
   1.942 +        } else if (ix>=ut->c) {
   1.943 +            // Zero terminated string, and requested index is beyond
   1.944 +            //   the region that has already been scanned.
   1.945 +            //   Scan up to either the end of the string or to the
   1.946 +            //   requested position, whichever comes first.
   1.947 +            while (ut->c<ix && s8[ut->c]!=0) {
   1.948 +                ut->c++;
   1.949 +            }
   1.950 +            //  TODO:  support for null terminated string length > 32 bits.
   1.951 +            if (s8[ut->c] == 0) {
   1.952 +                // We just found the actual length of the string.
   1.953 +                //  Trim the requested index back to that.
   1.954 +                ix     = ut->c;
   1.955 +                ut->b  = ut->c;
   1.956 +                length = ut->c;
   1.957 +                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
   1.958 +            }
   1.959 +        }
   1.960 +    }
   1.961 +
   1.962 +    //
   1.963 +    // Dispatch to the appropriate action for a forward iteration request.
   1.964 +    //
   1.965 +    if (forward) {
   1.966 +        if (ix==ut->chunkNativeLimit) {
   1.967 +            // Check for normal sequential iteration cases first.
   1.968 +            if (ix==length) {
   1.969 +                // Just reached end of string
   1.970 +                // Don't swap buffers, but do set the
   1.971 +                //   current buffer position.
   1.972 +                ut->chunkOffset = ut->chunkLength;
   1.973 +                return FALSE;
   1.974 +            } else {
   1.975 +                // End of current buffer.
   1.976 +                //   check whether other buffer already has what we need.
   1.977 +                UTF8Buf *altB = (UTF8Buf *)ut->q;
   1.978 +                if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
   1.979 +                    goto swapBuffers;
   1.980 +                }
   1.981 +            }
   1.982 +        }
   1.983 +
   1.984 +        // A random access.  Desired index could be in either or neither buf.
   1.985 +        // For optimizing the order of testing, first check for the index
   1.986 +        //    being in the other buffer.  This will be the case for uses that
   1.987 +        //    move back and forth over a fairly limited range
   1.988 +        {
   1.989 +            u8b = (UTF8Buf *)ut->q;   // the alternate buffer
   1.990 +            if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
   1.991 +                // Requested index is in the other buffer.
   1.992 +                goto swapBuffers;
   1.993 +            }
   1.994 +            if (ix == length) {
   1.995 +                // Requested index is end-of-string.
   1.996 +                //   (this is the case of randomly seeking to the end.
   1.997 +                //    The case of iterating off the end is handled earlier.)
   1.998 +                if (ix == ut->chunkNativeLimit) {
   1.999 +                    // Current buffer extends up to the end of the string.
  1.1000 +                    //   Leave it as the current buffer.
  1.1001 +                    ut->chunkOffset = ut->chunkLength;
  1.1002 +                    return FALSE;
  1.1003 +                }
  1.1004 +                if (ix == u8b->bufNativeLimit) {
  1.1005 +                    // Alternate buffer extends to the end of string.
  1.1006 +                    //   Swap it in as the current buffer.
  1.1007 +                    goto swapBuffersAndFail;
  1.1008 +                }
  1.1009 +
  1.1010 +                // Neither existing buffer extends to the end of the string.
  1.1011 +                goto makeStubBuffer;
  1.1012 +            }
  1.1013 +
  1.1014 +            if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
  1.1015 +                // Requested index is in neither buffer.
  1.1016 +                goto fillForward;
  1.1017 +            }
  1.1018 +
  1.1019 +            // Requested index is in this buffer.
  1.1020 +            u8b = (UTF8Buf *)ut->p;   // the current buffer
  1.1021 +            mapIndex = ix - u8b->toUCharsMapStart;
  1.1022 +            ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1.1023 +            return TRUE;
  1.1024 +
  1.1025 +        }
  1.1026 +    }
  1.1027 +
  1.1028 +
  1.1029 +    //
  1.1030 +    // Dispatch to the appropriate action for a
  1.1031 +    //   Backwards Direction iteration request.
  1.1032 +    //
  1.1033 +    if (ix==ut->chunkNativeStart) {
  1.1034 +        // Check for normal sequential iteration cases first.
  1.1035 +        if (ix==0) {
  1.1036 +            // Just reached the start of string
  1.1037 +            // Don't swap buffers, but do set the
  1.1038 +            //   current buffer position.
  1.1039 +            ut->chunkOffset = 0;
  1.1040 +            return FALSE;
  1.1041 +        } else {
  1.1042 +            // Start of current buffer.
  1.1043 +            //   check whether other buffer already has what we need.
  1.1044 +            UTF8Buf *altB = (UTF8Buf *)ut->q;
  1.1045 +            if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
  1.1046 +                goto swapBuffers;
  1.1047 +            }
  1.1048 +        }
  1.1049 +    }
  1.1050 +
  1.1051 +    // A random access.  Desired index could be in either or neither buf.
  1.1052 +    // For optimizing the order of testing,
  1.1053 +    //    Most likely case:  in the other buffer.
  1.1054 +    //    Second most likely: in neither buffer.
  1.1055 +    //    Unlikely, but must work:  in the current buffer.
  1.1056 +    u8b = (UTF8Buf *)ut->q;   // the alternate buffer
  1.1057 +    if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
  1.1058 +        // Requested index is in the other buffer.
  1.1059 +        goto swapBuffers;
  1.1060 +    }
  1.1061 +    // Requested index is start-of-string.
  1.1062 +    //   (this is the case of randomly seeking to the start.
  1.1063 +    //    The case of iterating off the start is handled earlier.)
  1.1064 +    if (ix==0) {
  1.1065 +        if (u8b->bufNativeStart==0) {
  1.1066 +            // Alternate buffer contains the data for the start string.
  1.1067 +            // Make it be the current buffer.
  1.1068 +            goto swapBuffersAndFail;
  1.1069 +        } else {
  1.1070 +            // Request for data before the start of string,
  1.1071 +            //   neither buffer is usable.
  1.1072 +            //   set up a zero-length buffer.
  1.1073 +            goto makeStubBuffer;
  1.1074 +        }
  1.1075 +    }
  1.1076 +
  1.1077 +    if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
  1.1078 +        // Requested index is in neither buffer.
  1.1079 +        goto fillReverse;
  1.1080 +    }
  1.1081 +
  1.1082 +    // Requested index is in this buffer.
  1.1083 +    //   Set the utf16 buffer index.
  1.1084 +    u8b = (UTF8Buf *)ut->p;
  1.1085 +    mapIndex = ix - u8b->toUCharsMapStart;
  1.1086 +    ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1.1087 +    if (ut->chunkOffset==0) {
  1.1088 +        // This occurs when the first character in the text is
  1.1089 +        //   a multi-byte UTF-8 char, and the requested index is to
  1.1090 +        //   one of the trailing bytes.  Because there is no preceding
  1.1091 +        //   character, this access fails.  We can't pick up on the
  1.1092 +        //   situation sooner because the requested index is not zero.
  1.1093 +        return FALSE;
  1.1094 +    } else {
  1.1095 +        return TRUE;
  1.1096 +    }
  1.1097 +
  1.1098 +
  1.1099 +
  1.1100 +swapBuffers:
  1.1101 +    //  The alternate buffer (ut->q) has the string data that was requested.
  1.1102 +    //  Swap the primary and alternate buffers, and set the
  1.1103 +    //   chunk index into the new primary buffer.
  1.1104 +    {
  1.1105 +        u8b   = (UTF8Buf *)ut->q;
  1.1106 +        ut->q = ut->p;
  1.1107 +        ut->p = u8b;
  1.1108 +        ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
  1.1109 +        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1.1110 +        ut->chunkNativeStart    = u8b->bufNativeStart;
  1.1111 +        ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1.1112 +        ut->nativeIndexingLimit = u8b->bufNILimit;
  1.1113 +
  1.1114 +        // Index into the (now current) chunk
  1.1115 +        // Use the map to set the chunk index.  It's more trouble than it's worth
  1.1116 +        //    to check whether native indexing can be used.
  1.1117 +        U_ASSERT(ix>=u8b->bufNativeStart);
  1.1118 +        U_ASSERT(ix<=u8b->bufNativeLimit);
  1.1119 +        mapIndex = ix - u8b->toUCharsMapStart;
  1.1120 +        U_ASSERT(mapIndex>=0);
  1.1121 +        U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
  1.1122 +        ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1.1123 +
  1.1124 +        return TRUE;
  1.1125 +    }
  1.1126 +
  1.1127 +
  1.1128 + swapBuffersAndFail:
  1.1129 +    // We got a request for either the start or end of the string,
  1.1130 +    //  with iteration continuing in the out-of-bounds direction.
  1.1131 +    // The alternate buffer already contains the data up to the
  1.1132 +    //  start/end.
  1.1133 +    // Swap the buffers, then return failure, indicating that we couldn't
  1.1134 +    //  make things correct for continuing the iteration in the requested
  1.1135 +    //  direction.  The position & buffer are correct should the
  1.1136 +    //  user decide to iterate in the opposite direction.
  1.1137 +    u8b   = (UTF8Buf *)ut->q;
  1.1138 +    ut->q = ut->p;
  1.1139 +    ut->p = u8b;
  1.1140 +    ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
  1.1141 +    ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1.1142 +    ut->chunkNativeStart    = u8b->bufNativeStart;
  1.1143 +    ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1.1144 +    ut->nativeIndexingLimit = u8b->bufNILimit;
  1.1145 +
  1.1146 +    // Index into the (now current) chunk
  1.1147 +    //  For this function  (swapBuffersAndFail), the requested index
  1.1148 +    //    will always be at either the start or end of the chunk.
  1.1149 +    if (ix==u8b->bufNativeLimit) {
  1.1150 +        ut->chunkOffset = ut->chunkLength;
  1.1151 +    } else  {
  1.1152 +        ut->chunkOffset = 0;
  1.1153 +        U_ASSERT(ix == u8b->bufNativeStart);
  1.1154 +    }
  1.1155 +    return FALSE;
  1.1156 +
  1.1157 +makeStubBuffer:
  1.1158 +    //   The user has done a seek/access past the start or end
  1.1159 +    //   of the string.  Rather than loading data that is likely
  1.1160 +    //   to never be used, just set up a zero-length buffer at
  1.1161 +    //   the position.
  1.1162 +    u8b = (UTF8Buf *)ut->q;          // the stub is built in the alternate buffer, then swapped in below
  1.1163 +    u8b->bufNativeStart   = ix;
  1.1164 +    u8b->bufNativeLimit   = ix;
  1.1165 +    u8b->bufStartIdx      = 0;
  1.1166 +    u8b->bufLimitIdx      = 0;
  1.1167 +    u8b->bufNILimit       = 0;
  1.1168 +    u8b->toUCharsMapStart = ix;
  1.1169 +    u8b->mapToNative[0]   = 0;
  1.1170 +    u8b->mapToUChars[0]   = 0;
  1.1171 +    goto swapBuffersAndFail;
  1.1172 +
  1.1173 +
  1.1174 +
  1.1175 +fillForward:
  1.1176 +    {
  1.1177 +        // Move the incoming index to a code point boundary.
  1.1178 +        U8_SET_CP_START(s8, 0, ix);
  1.1179 +
  1.1180 +        // Swap the UText buffers.
  1.1181 +        //  We want to fill what was previously the alternate buffer,
  1.1182 +        //  and make what was the current buffer be the new alternate.
  1.1183 +        UTF8Buf *u8b = (UTF8Buf *)ut->q;   // note: shadows the function-level u8b
  1.1184 +        ut->q = ut->p;
  1.1185 +        ut->p = u8b;
  1.1186 +
  1.1187 +        int32_t strLen = ut->b;
  1.1188 +        UBool   nulTerminated = FALSE;
  1.1189 +        if (strLen < 0) {
  1.1190 +            strLen = 0x7fffffff;           // length unknown: rely on the NUL check in the loop instead
  1.1191 +            nulTerminated = TRUE;
  1.1192 +        }
  1.1193 +
  1.1194 +        UChar   *buf = u8b->buf;
  1.1195 +        uint8_t *mapToNative  = u8b->mapToNative;
  1.1196 +        uint8_t *mapToUChars  = u8b->mapToUChars;
  1.1197 +        int32_t  destIx       = 0;
  1.1198 +        int32_t  srcIx        = ix;
  1.1199 +        UBool    seenNonAscii = FALSE;
  1.1200 +        UChar32  c = 0;
  1.1201 +
  1.1202 +        // Fill the chunk buffer and mapping arrays.
  1.1203 +        while (destIx<UTF8_TEXT_CHUNK_SIZE) {
  1.1204 +            c = s8[srcIx];
  1.1205 +            if (c>0 && c<0x80) {
  1.1206 +                // Special case ASCII range for speed.
  1.1207 +                //   zero is excluded to simplify bounds checking.
  1.1208 +                buf[destIx] = (UChar)c;
  1.1209 +                mapToNative[destIx]    = (uint8_t)(srcIx - ix);
  1.1210 +                mapToUChars[srcIx-ix]  = (uint8_t)destIx;
  1.1211 +                srcIx++;
  1.1212 +                destIx++;
  1.1213 +            } else {
  1.1214 +                // General case, handle everything.
  1.1215 +                if (seenNonAscii == FALSE) {
  1.1216 +                    seenNonAscii = TRUE;
  1.1217 +                    u8b->bufNILimit = destIx;   // native indexing limit is the first non-ASCII position
  1.1218 +                }
  1.1219 +
  1.1220 +                int32_t  cIx      = srcIx;
  1.1221 +                int32_t  dIx      = destIx;
  1.1222 +                int32_t  dIxSaved = destIx;
  1.1223 +                U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
  1.1224 +                if (c==0 && nulTerminated) {
  1.1225 +                    srcIx--;                    // back up to the NUL; it is not part of the text
  1.1226 +                    break;
  1.1227 +                }
  1.1228 +
  1.1229 +                U16_APPEND_UNSAFE(buf, destIx, c);
  1.1230 +                do {
  1.1231 +                    mapToNative[dIx++] = (uint8_t)(cIx - ix);       // every UChar of this char maps to its first byte
  1.1232 +                } while (dIx < destIx);
  1.1233 +
  1.1234 +                do {
  1.1235 +                    mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;    // every byte of this char maps to its first UChar
  1.1236 +                } while (cIx < srcIx);
  1.1237 +            }
  1.1238 +            if (srcIx>=strLen) {
  1.1239 +                break;
  1.1240 +            }
  1.1241 +
  1.1242 +        }
  1.1243 +
  1.1244 +        //  store Native <--> Chunk Map entries for the end of the buffer.
  1.1245 +        //    There is no actual character here, but the index position is valid.
  1.1246 +        mapToNative[destIx]     = (uint8_t)(srcIx - ix);
  1.1247 +        mapToUChars[srcIx - ix] = (uint8_t)destIx;
  1.1248 +
  1.1249 +        //  fill in Buffer descriptor
  1.1250 +        u8b->bufNativeStart     = ix;
  1.1251 +        u8b->bufNativeLimit     = srcIx;
  1.1252 +        u8b->bufStartIdx        = 0;
  1.1253 +        u8b->bufLimitIdx        = destIx;
  1.1254 +        if (seenNonAscii == FALSE) {
  1.1255 +            u8b->bufNILimit     = destIx;
  1.1256 +        }
  1.1257 +        u8b->toUCharsMapStart   = u8b->bufNativeStart;
  1.1258 +
  1.1259 +        // Set UText chunk to refer to this buffer.
  1.1260 +        ut->chunkContents       = buf;
  1.1261 +        ut->chunkOffset         = 0;
  1.1262 +        ut->chunkLength         = u8b->bufLimitIdx;
  1.1263 +        ut->chunkNativeStart    = u8b->bufNativeStart;
  1.1264 +        ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1.1265 +        ut->nativeIndexingLimit = u8b->bufNILimit;
  1.1266 +
  1.1267 +        // For zero terminated strings, keep track of the maximum point
  1.1268 +        //   scanned so far.
  1.1269 +        if (nulTerminated && srcIx>ut->c) {
  1.1270 +            ut->c = srcIx;
  1.1271 +            if (c==0) {
  1.1272 +                // We scanned to the end.
  1.1273 +                //   Remember the actual length.
  1.1274 +                ut->b = srcIx;
  1.1275 +                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1.1276 +            }
  1.1277 +        }
  1.1278 +        return TRUE;
  1.1279 +    }
  1.1280 +
  1.1281 +
  1.1282 +fillReverse:
  1.1283 +    {
  1.1284 +        // Move the incoming index to a code point boundary.
  1.1285 +        // Can only do this if the incoming index is somewhere in the interior of the string.
  1.1286 +        //   If index is at the end, there is no character there to look at.
  1.1287 +        if (ix != ut->b) {
  1.1288 +            U8_SET_CP_START(s8, 0, ix);
  1.1289 +        }
  1.1290 +
  1.1291 +        // Swap the UText buffers.
  1.1292 +        //  We want to fill what was previously the alternate buffer,
  1.1293 +        //  and make what was the current buffer be the new alternate.
  1.1294 +        UTF8Buf *u8b = (UTF8Buf *)ut->q;   // note: shadows the function-level u8b
  1.1295 +        ut->q = ut->p;
  1.1296 +        ut->p = u8b;
  1.1297 +
  1.1298 +        UChar   *buf = u8b->buf;
  1.1299 +        uint8_t *mapToNative = u8b->mapToNative;
  1.1300 +        uint8_t *mapToUChars = u8b->mapToUChars;
  1.1301 +        int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);   // native index that mapToUChars[0] stands for
  1.1302 +        int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
  1.1303 +                                                    //   at end of buffer to leave room
  1.1304 +                                                    //   for a surrogate pair at the
  1.1305 +                                                    //   buffer start.
  1.1306 +        int32_t  srcIx  = ix;
  1.1307 +        int32_t  bufNILimit = destIx;
  1.1308 +        UChar32   c;
  1.1309 +
  1.1310 +        // Map to/from Native Indexes, fill in for the position at the end of
  1.1311 +        //   the buffer.
  1.1312 +        //
  1.1313 +        mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1.1314 +        mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1.1315 +
  1.1316 +        // Fill the chunk buffer
  1.1317 +        // Work backwards, filling from the end of the buffer towards the front.
  1.1318 +        //   Stops when buf or the map is nearly used up, or the start of the string is reached.
  1.1319 +        while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
  1.1320 +            srcIx--;
  1.1321 +            destIx--;
  1.1322 +
  1.1323 +            // Get last byte of the UTF-8 character
  1.1324 +            c = s8[srcIx];
  1.1325 +            if (c<0x80) {
  1.1326 +                // Special case ASCII range for speed.
  1.1327 +                buf[destIx] = (UChar)c;
  1.1328 +                mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1.1329 +                mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1.1330 +            } else {
  1.1331 +                // General case, handle everything non-ASCII.
  1.1332 +
  1.1333 +                int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char
  1.1334 +
  1.1335 +                // Get the full character from the UTF8 string.
  1.1336 +                //   use code derived from the macros in utf8.h
  1.1337 +                //   Leaves srcIx pointing at the first byte of the UTF-8 char.
  1.1338 +                //
  1.1339 +                c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
  1.1340 +                // leaves srcIx at first byte of the multi-byte char.
  1.1341 +
  1.1342 +                // Store the character in UTF-16 buffer.
  1.1343 +                if (c<0x10000) {
  1.1344 +                    buf[destIx] = (UChar)c;
  1.1345 +                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1.1346 +                } else {
  1.1347 +                    buf[destIx]         = U16_TRAIL(c);
  1.1348 +                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1.1349 +                    buf[--destIx]       = U16_LEAD(c);
  1.1350 +                    mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1.1351 +                }
  1.1352 +
  1.1353 +                // Fill in the map from native indexes to UChars buf index.
  1.1354 +                do {
  1.1355 +                    mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
  1.1356 +                } while (sIx >= srcIx);
  1.1357 +
  1.1358 +                // Set native indexing limit to be the current position.
  1.1359 +                //   We are processing a non-ascii, non-native-indexing char now;
  1.1360 +                //     the limit will be here if the rest of the chars to be
  1.1361 +                //     added to this buffer are ascii.
  1.1362 +                bufNILimit = destIx;
  1.1363 +            }
  1.1364 +        }
  1.1365 +        u8b->bufNativeStart     = srcIx;
  1.1366 +        u8b->bufNativeLimit     = ix;
  1.1367 +        u8b->bufStartIdx        = destIx;
  1.1368 +        u8b->bufLimitIdx        = UTF8_TEXT_CHUNK_SIZE+2;
  1.1369 +        u8b->bufNILimit         = bufNILimit - u8b->bufStartIdx;   // stored relative to the filled start
  1.1370 +        u8b->toUCharsMapStart   = toUCharsMapStart;
  1.1371 +
  1.1372 +        ut->chunkContents       = &buf[u8b->bufStartIdx];
  1.1373 +        ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1.1374 +        ut->chunkOffset         = ut->chunkLength;                  // positioned at the end of the new chunk
  1.1375 +        ut->chunkNativeStart    = u8b->bufNativeStart;
  1.1376 +        ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1.1377 +        ut->nativeIndexingLimit = u8b->bufNILimit;
  1.1378 +        return TRUE;
  1.1379 +    }
  1.1380 +
  1.1381 +}
  1.1382 +
  1.1383 +
  1.1384 +
  1.1385 +//
  1.1386 +//  This is a slightly modified copy of u_strFromUTF8,
  1.1387 +//     Inserts a Replacement Char rather than failing on invalid UTF-8
  1.1388 +//     Removes unnecessary features.
  1.1389 +//     Returns dest; *pDestLength (if non-NULL) receives the UChar count needed for the whole input.
  1.1390 +static UChar*
  1.1391 +utext_strFromUTF8(UChar *dest,
  1.1392 +              int32_t destCapacity,
  1.1393 +              int32_t *pDestLength,
  1.1394 +              const char* src,
  1.1395 +              int32_t srcLength,        // required.  NUL terminated not supported.
  1.1396 +              UErrorCode *pErrorCode
  1.1397 +              )
  1.1398 +{
  1.1399 +
  1.1400 +    UChar *pDest = dest;
  1.1401 +    UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
  1.1402 +    UChar32 ch=0;
  1.1403 +    int32_t index = 0;
  1.1404 +    int32_t reqLength = 0;
  1.1405 +    uint8_t* pSrc = (uint8_t*) src;
  1.1406 +
  1.1407 +
  1.1408 +    while((index < srcLength)&&(pDest<pDestLimit)){   // convert while there is both input and room in dest
  1.1409 +        ch = pSrc[index++];
  1.1410 +        if(ch <=0x7f){
  1.1411 +            *pDest++=(UChar)ch;
  1.1412 +        }else{
  1.1413 +            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);   // NOTE(review): -3 presumably selects U+FFFD for bad input - confirm
  1.1414 +            if(U_IS_BMP(ch)){
  1.1415 +                *(pDest++)=(UChar)ch;
  1.1416 +            }else{
  1.1417 +                *(pDest++)=U16_LEAD(ch);
  1.1418 +                if(pDest<pDestLimit){
  1.1419 +                    *(pDest++)=U16_TRAIL(ch);
  1.1420 +                }else{
  1.1421 +                    reqLength++;          // trail surrogate did not fit; count it and stop writing
  1.1422 +                    break;
  1.1423 +                }
  1.1424 +            }
  1.1425 +        }
  1.1426 +    }
  1.1427 +    /* do not fill the dest buffer, just count the UChars needed */
  1.1428 +    while(index < srcLength){
  1.1429 +        ch = pSrc[index++];
  1.1430 +        if(ch <= 0x7f){
  1.1431 +            reqLength++;
  1.1432 +        }else{
  1.1433 +            ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
  1.1434 +            reqLength+=U16_LENGTH(ch);
  1.1435 +        }
  1.1436 +    }
  1.1437 +
  1.1438 +    reqLength+=(int32_t)(pDest - dest);   // add the UChars actually written
  1.1439 +
  1.1440 +    if(pDestLength){
  1.1441 +        *pDestLength = reqLength;
  1.1442 +    }
  1.1443 +
  1.1444 +    /* Terminate the buffer */
  1.1445 +    u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  1.1446 +
  1.1447 +    return dest;
  1.1448 +}
  1.1449 +
  1.1450 +// utf8TextExtract
  1.1451 +//    Extract the native (UTF-8 byte) range [start, limit) into dest as UTF-16.
  1.1452 +static int32_t U_CALLCONV
  1.1453 +utf8TextExtract(UText *ut,
  1.1454 +                int64_t start, int64_t limit,
  1.1455 +                UChar *dest, int32_t destCapacity,
  1.1456 +                UErrorCode *pErrorCode) {
  1.1457 +    if (U_FAILURE(*pErrorCode)) {
  1.1458 +        return 0;
  1.1459 +    }
  1.1460 +    const UBool missingDest = (dest==NULL && destCapacity>0);
  1.1461 +    if (destCapacity<0 || missingDest) {
  1.1462 +        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1.1463 +        return 0;
  1.1464 +    }
  1.1465 +
  1.1466 +    // Pin the requested range to the text, using the length kept in ut->b.
  1.1467 +    const int32_t textLength  = ut->b;
  1.1468 +    int32_t       nativeStart = pinIndex(start, textLength);
  1.1469 +    int32_t       nativeLimit = pinIndex(limit, textLength);
  1.1470 +    if (nativeStart > nativeLimit) {
  1.1471 +        *pErrorCode = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1472 +        return 0;
  1.1473 +    }
  1.1474 +
  1.1475 +    // Adjust each end of the range to land on a code point boundary if needed.
  1.1476 +    //    Back up by no more than three bytes, because that is the largest number
  1.1477 +    //    of trail bytes in a well formed UTF-8 character.  An end that is not
  1.1478 +    //    below the native limit of the current chunk is left unchanged.
  1.1479 +    const uint8_t *bytes = (const uint8_t *)ut->context;
  1.1480 +    int32_t *rangeEnds[2] = { &nativeStart, &nativeLimit };
  1.1481 +    for (int32_t e=0; e<2; e++) {
  1.1482 +        int32_t &pos = *rangeEnds[e];
  1.1483 +        if (pos >= ut->chunkNativeLimit) {
  1.1484 +            continue;
  1.1485 +        }
  1.1486 +        for (int32_t backedUp=0; backedUp<3; backedUp++) {
  1.1487 +            const uint8_t b = bytes[pos];
  1.1488 +            if (U8_IS_SINGLE(b) || U8_IS_LEAD(b) || pos==0) {
  1.1489 +                break;
  1.1490 +            }
  1.1491 +            pos--;
  1.1492 +        }
  1.1493 +    }
  1.1494 +
  1.1495 +    // Do the actual extract.  destLength receives the length of the complete
  1.1496 +    //   converted range, whether or not all of it fit into dest.
  1.1497 +    int32_t destLength = 0;
  1.1498 +    utext_strFromUTF8(dest, destCapacity, &destLength,
  1.1499 +                      (const char *)ut->context + nativeStart, nativeLimit - nativeStart,
  1.1500 +                      pErrorCode);
  1.1501 +    // Position the UText at the end of the extracted range.
  1.1502 +    utf8TextAccess(ut, nativeLimit, TRUE);
  1.1503 +    return destLength;
  1.1504 +}
  1.1505 +
  1.1506 +//
  1.1507 +// utf8TextMapOffsetToNative
  1.1508 +//
  1.1509 +// Translate the current chunk offset (a UTF-16 index) to a native index.
  1.1510 +static int64_t U_CALLCONV
  1.1511 +utf8TextMapOffsetToNative(const UText *ut) {
  1.1512 +    const UTF8Buf *buf      = (const UTF8Buf *)ut->p;
  1.1513 +    const int32_t  chunkOff = ut->chunkOffset;
  1.1514 +    U_ASSERT(chunkOff>ut->nativeIndexingLimit && chunkOff<=ut->chunkLength);
  1.1515 +    const int32_t  nativeIdx = buf->toUCharsMapStart + buf->mapToNative[buf->bufStartIdx + chunkOff];
  1.1516 +    U_ASSERT(nativeIdx >= ut->chunkNativeStart && nativeIdx <= ut->chunkNativeLimit);
  1.1517 +    return nativeIdx;
  1.1518 +}
  1.1519 +
  1.1520 +//
  1.1521 +// Map a native index to the corresponding chunk (UTF-16) offset.
  1.1522 +//
  1.1523 +static int32_t U_CALLCONV
  1.1524 +utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
  1.1525 +    U_ASSERT(index64 <= 0x7fffffff);
  1.1526 +    const int32_t  nativeIdx = (int32_t)index64;
  1.1527 +    const UTF8Buf *buf       = (const UTF8Buf *)ut->p;
  1.1528 +    U_ASSERT(nativeIdx>=ut->chunkNativeStart+ut->nativeIndexingLimit);
  1.1529 +    U_ASSERT(nativeIdx<=ut->chunkNativeLimit);
  1.1530 +    // The mapToUChars table is indexed relative to toUCharsMapStart.
  1.1531 +    const int32_t  chunkOff  = buf->mapToUChars[nativeIdx - buf->toUCharsMapStart] - buf->bufStartIdx;
  1.1532 +    U_ASSERT(chunkOff>=0 && chunkOff<=ut->chunkLength);
  1.1533 +    return chunkOff;
  1.1534 +}
  1.1535 +
  1.1536 +static UText * U_CALLCONV
  1.1537 +utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
  1.1538 +{
  1.1539 +    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  1.1540 +    dest = shallowTextClone(dest, src, status);
  1.1541 +
  1.1542 +    // For deep clones, make a copy of the string.
  1.1543 +    //  The copied storage is owned by the newly created clone.
  1.1544 +    // TODO:  There is an issue with using utext_nativeLength().  That function
  1.1545 +    //        is non-const in cases where the input was NUL terminated and the
  1.1546 +    //        length has not yet been determined, while this function (clone())
  1.1547 +    //        is const.  There is potentially a thread safety issue lurking here.
  1.1548 +    if (deep && U_SUCCESS(*status)) {
  1.1549 +        int32_t  len = (int32_t)utext_nativeLength((UText *)src);
  1.1550 +        char *copyStr = (char *)uprv_malloc(len+1);
  1.1551 +        if (copyStr == NULL) {
  1.1552 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.1553 +        } else {
  1.1554 +            // Copy only the len bytes of text and terminate the copy explicitly.
  1.1555 +            //  A source opened with an explicit length need not have a NUL at [len].
  1.1556 +            uprv_memcpy(copyStr, src->context, len);
  1.1557 +            copyStr[len] = 0;
  1.1558 +            dest->context = copyStr;
  1.1559 +            dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1.1560 +        }
  1.1561 +    }
  1.1562 +    return dest;
  1.1563 +}
  1.1564 +
  1.1565 +
  1.1566 +static void U_CALLCONV
  1.1567 +utf8TextClose(UText *ut) {
  1.1568 +    // The generic UText framework close does most of the work.  The only task here
  1.1569 +    //  is to free the UTF-8 string if this UText owns it, as a deep clone does.
  1.1570 +    const UBool ownsText = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) != 0;
  1.1571 +    if (!ownsText) {
  1.1572 +        return;
  1.1573 +    }
  1.1574 +    uprv_free((char *)ut->context);
  1.1575 +    ut->context = NULL;
  1.1576 +}
  1.1577 +
  1.1578 +U_CDECL_END
  1.1579 +
  1.1580 +
  1.1581 +static const struct UTextFuncs utf8Funcs = 
  1.1582 +{                        // Provider function table installed by utext_openUTF8()
  1.1583 +    sizeof(UTextFuncs),  // size of this table
  1.1584 +    0, 0, 0,             // Reserved alignment padding
  1.1585 +    utf8TextClone,       // clone
  1.1586 +    utf8TextLength,      // length
  1.1587 +    utf8TextAccess,      // access
  1.1588 +    utf8TextExtract,     // extract
  1.1589 +    NULL,                /* replace: not provided by this provider */
  1.1590 +    NULL,                /* copy:    not provided by this provider */
  1.1591 +    utf8TextMapOffsetToNative,   // chunk (UTF-16) offset -> native index
  1.1592 +    utf8TextMapIndexToUTF16,     // native index -> chunk (UTF-16) offset
  1.1593 +    utf8TextClose,       // close
  1.1594 +    NULL,                // spare 1
  1.1595 +    NULL,                // spare 2
  1.1596 +    NULL                 // spare 3
  1.1597 +};
  1.1598 +
  1.1599 +
  1.1600 +static const char gEmptyString[] = {0};   // Empty text, used when s is NULL and length is zero.
  1.1601 +
  1.1602 +// Open a UText over UTF-8 text; length is the byte count of s, or -1 if s is NUL terminated.
  1.1603 +U_CAPI UText * U_EXPORT2
  1.1604 +utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
  1.1605 +    if (U_FAILURE(*status)) {
  1.1606 +        return NULL;
  1.1607 +    }
  1.1608 +    // A NULL string is accepted only in combination with a length of zero.
  1.1609 +    const char *text = (s==NULL && length==0) ? gEmptyString : s;
  1.1610 +    if (text==NULL || length<-1 || length>INT32_MAX) {
  1.1611 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.1612 +        return NULL;
  1.1613 +    }
  1.1614 +
  1.1615 +    // The extra storage holds two UTF8Buf structs, reached through ut->p and ut->q.
  1.1616 +    ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
  1.1617 +    if (U_FAILURE(*status)) {
  1.1618 +        return ut;
  1.1619 +    }
  1.1620 +
  1.1621 +    const int32_t length32      = (int32_t)length;
  1.1622 +    const UBool   lengthUnknown = (length32 < 0);
  1.1623 +    ut->pFuncs  = &utf8Funcs;
  1.1624 +    ut->context = text;
  1.1625 +    ut->b       = length32;
  1.1626 +    ut->c       = lengthUnknown ? 0 : length32;
  1.1627 +    if (lengthUnknown) {
  1.1628 +        ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1.1629 +    }
  1.1630 +    ut->p = ut->pExtra;
  1.1631 +    ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
  1.1632 +    return ut;
  1.1633 +}
  1.1634 +
  1.1635 +
  1.1636 +
  1.1637 +
  1.1638 +
  1.1639 +
  1.1640 +
  1.1641 +
  1.1642 +//------------------------------------------------------------------------------
  1.1643 +//
  1.1644 +//     UText implementation wrapper for Replaceable (read/write)
  1.1645 +//
  1.1646 +//         Use of UText data members:
  1.1647 +//            context    pointer to Replaceable.
  1.1648 +//            providerProperties   UTEXT_PROVIDER_OWNS_TEXT is set if the Replaceable is owned by the UText.
  1.1649 +//
  1.1650 +//------------------------------------------------------------------------------
  1.1651 +
  1.1652 +
  1.1653 +
  1.1654 +// Number of UChars in a chunk.  Minimum chunk size for this implementation: 3
  1.1655 +// to allow for possible trimming for code point boundaries
  1.1656 +enum { REP_TEXT_CHUNK_SIZE=10 };
  1.1657 +
  1.1658 +struct ReplExtra {    // extra storage of a Replaceable UText, reached through ut->pExtra
  1.1659 +    /*
  1.1660 +     * Chunk UChars, filled by repTextAccess() from the Replaceable.
  1.1661 +     * +1 to simplify filling with surrogate pair at the end.
  1.1662 +     */
  1.1663 +    UChar s[REP_TEXT_CHUNK_SIZE+1];
  1.1664 +};
  1.1665 +
  1.1666 +
  1.1667 +U_CDECL_BEGIN
  1.1668 +
  1.1669 +static UText * U_CALLCONV
  1.1670 +repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  1.1671 +    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  1.1672 +    dest = shallowTextClone(dest, src, status);
  1.1673 +    // For deep clones, make a copy of the Replaceable.  The copy is owned by the newly
  1.1674 +    //  created UText clone; the OWNS_TEXT provider property tells close() to delete it.
  1.1675 +    if (deep && U_SUCCESS(*status)) {
  1.1676 +        const Replaceable *replSrc = (const Replaceable *)src->context;
  1.1677 +        Replaceable *replCopy = replSrc->clone();
  1.1678 +        if (replCopy == NULL) {
  1.1679 +            // No copy could be made.  Report it instead of claiming ownership of NULL.
  1.1680 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.1681 +            return dest;
  1.1682 +        }
  1.1683 +        dest->context = replCopy;
  1.1684 +        // with deep clone, the copy is writable, even when the source is not.
  1.1685 +        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT) | I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1.1686 +    }
  1.1687 +    return dest;
  1.1688 +}
  1.1689 +
  1.1690 +
  1.1691 +static void U_CALLCONV
  1.1692 +repTextClose(UText *ut) {
  1.1693 +    // The generic UText framework close does most of the work.  The only task here
  1.1694 +    //  is to delete the Replaceable if this UText owns it, as a deep clone does.
  1.1695 +    const UBool ownsText = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) != 0;
  1.1696 +    if (!ownsText) {
  1.1697 +        return;
  1.1698 +    }
  1.1699 +    delete (Replaceable *)ut->context;
  1.1700 +    ut->context = NULL;
  1.1701 +}
  1.1702 +
  1.1703 +
  1.1704 +static int64_t U_CALLCONV
  1.1705 +repTextLength(UText *ut) {
  1.1706 +    // Native length of the text: simply the length reported by the Replaceable
  1.1707 +    //  that this UText wraps, returned as the int64_t of the length function.
  1.1708 +    const Replaceable *rep = (const Replaceable *)ut->context;
  1.1709 +    return rep->length();
  1.1710 +}
  1.1710 +
  1.1711 +// Access function for Replaceable text: position the UText at a native index,
  1.1712 +//   refilling the chunk buffer from the Replaceable when the index is not already covered.
  1.1713 +static UBool U_CALLCONV
  1.1714 +repTextAccess(UText *ut, int64_t index, UBool forward) {
  1.1715 +    const Replaceable *rep=(const Replaceable *)ut->context;
  1.1716 +    int32_t length=rep->length();   // Full length of the input text (bigger than a chunk)
  1.1717 +
  1.1718 +    // clip the requested index to the limits of the text.  All positioning below uses the clipped value.
  1.1719 +    int32_t index32 = pinIndex(index, length);
  1.1720 +    U_ASSERT(index<=INT32_MAX);
  1.1721 +
  1.1722 +    /*
  1.1723 +     * Compute start/limit boundaries around index, for a segment of text
  1.1724 +     * to be extracted.
  1.1725 +     * To allow for the possibility that our user gave an index to the trailing
  1.1726 +     * half of a surrogate pair, we must request one extra preceding UChar when
  1.1727 +     * going in the forward direction.  This will ensure that the buffer has the
  1.1728 +     * entire code point at the specified index.
  1.1729 +     */
  1.1730 +    if(forward) {
  1.1731 +
  1.1732 +        if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
  1.1733 +            // Buffer already contains the requested position.
  1.1734 +            ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
  1.1735 +            return TRUE;
  1.1736 +        }
  1.1737 +        if (index32>=length && ut->chunkNativeLimit==length) {
  1.1738 +            // Request for end of string, and buffer already extends up to it.
  1.1739 +            // Can't get the data, but don't change the buffer.
  1.1740 +            ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
  1.1741 +            return FALSE;
  1.1742 +        }
  1.1743 +
  1.1744 +        ut->chunkNativeLimit = (int64_t)index32 + REP_TEXT_CHUNK_SIZE - 1;
  1.1745 +        // Going forward, so we want to have the buffer with stuff at and beyond
  1.1746 +        //   the requested index.  The -1 gets us one code point before the
  1.1747 +        //   requested index also, to handle the case of the index being on
  1.1748 +        //   a trail surrogate of a surrogate pair.
  1.1749 +        if(ut->chunkNativeLimit > length) {
  1.1750 +            ut->chunkNativeLimit = length;
  1.1751 +        }
  1.1752 +        // unless buffer ran off end, start is index-1.
  1.1753 +        ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
  1.1754 +        if(ut->chunkNativeStart < 0) {
  1.1755 +            ut->chunkNativeStart = 0;
  1.1756 +        }
  1.1757 +    } else {
  1.1758 +        // Reverse iteration.  Fill buffer with data preceding the requested index.
  1.1759 +        if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
  1.1760 +            // Requested position already in buffer.
  1.1761 +            ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
  1.1762 +            return TRUE;
  1.1763 +        }
  1.1764 +        if (index32==0 && ut->chunkNativeStart==0) {
  1.1765 +            // Request for start, buffer already begins at start.
  1.1766 +            //  No data, but keep the buffer as is.
  1.1767 +            ut->chunkOffset = 0;
  1.1768 +            return FALSE;
  1.1769 +        }
  1.1770 +
  1.1771 +        // Figure out the bounds of the chunk to extract for reverse iteration.
  1.1772 +        // Need to worry about chunk not splitting surrogate pairs, and while still
  1.1773 +        // containing the data we need.
  1.1774 +        // Fix by requesting a chunk that includes an extra UChar at the end.
  1.1775 +        // If this turns out to be a lead surrogate, we can lop it off and still have
  1.1776 +        //   the data we wanted.
  1.1777 +        ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
  1.1778 +        if (ut->chunkNativeStart < 0) {
  1.1779 +            ut->chunkNativeStart = 0;
  1.1780 +        }
  1.1781 +
  1.1782 +        ut->chunkNativeLimit = index32 + 1;
  1.1783 +        if (ut->chunkNativeLimit > length) {
  1.1784 +            ut->chunkNativeLimit = length;
  1.1785 +        }
  1.1786 +    }
  1.1787 +
  1.1788 +    // Extract the new chunk of text from the Replaceable source.
  1.1789 +    ReplExtra *ex = (ReplExtra *)ut->pExtra;
  1.1790 +    // UnicodeString with its buffer a writable alias to the chunk buffer
  1.1791 +    UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
  1.1792 +    rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
  1.1793 +
  1.1794 +    ut->chunkContents  = ex->s;
  1.1795 +    ut->chunkLength    = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
  1.1796 +    ut->chunkOffset    = (int32_t)(index32 - ut->chunkNativeStart);
  1.1797 +
  1.1798 +    // Surrogate pairs from the input text must not span chunk boundaries.
  1.1799 +    // If end of chunk could be the start of a surrogate, trim it off.
  1.1800 +    if (ut->chunkNativeLimit < length &&
  1.1801 +        U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
  1.1802 +            ut->chunkLength--;
  1.1803 +            ut->chunkNativeLimit--;
  1.1804 +            if (ut->chunkOffset > ut->chunkLength) {
  1.1805 +                ut->chunkOffset = ut->chunkLength;
  1.1806 +            }
  1.1807 +        }
  1.1808 +
  1.1809 +    // if the first UChar in the chunk could be the trailing half of a surrogate pair,
  1.1810 +    // trim it off.
  1.1811 +    if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
  1.1812 +        ++(ut->chunkContents);
  1.1813 +        ++(ut->chunkNativeStart);
  1.1814 +        --(ut->chunkLength);
  1.1815 +        --(ut->chunkOffset);
  1.1816 +    }
  1.1817 +
  1.1818 +    // adjust the index/chunkOffset to a code point boundary
  1.1819 +    U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
  1.1820 +
  1.1821 +    // Use fast indexing for get/setNativeIndex()
  1.1822 +    ut->nativeIndexingLimit = ut->chunkLength;
  1.1823 +
  1.1824 +    return TRUE;
  1.1825 +}
  1.1826 +
  1.1827 +// Extract the range [start, limit) of the Replaceable text into dest.
  1.1828 +//   The return value comes from u_terminateUChars() for the length of the adjusted range.
  1.1829 +static int32_t U_CALLCONV
  1.1830 +repTextExtract(UText *ut,
  1.1831 +               int64_t start, int64_t limit,
  1.1832 +               UChar *dest, int32_t destCapacity,
  1.1833 +               UErrorCode *status) {
  1.1834 +    const Replaceable *rep=(const Replaceable *)ut->context;
  1.1835 +    int32_t  length=rep->length();
  1.1836 +
  1.1837 +    if(U_FAILURE(*status)) {
  1.1838 +        return 0;
  1.1839 +    }
  1.1840 +    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
  1.1841 +        // Unusable destination.  Stop here; nothing may be extracted into it.
  1.1842 +        *status=U_ILLEGAL_ARGUMENT_ERROR;
  1.1843 +        return 0;
  1.1844 +    }
  1.1845 +    if(start>limit) {
  1.1846 +        *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1.1847 +        return 0;
  1.1848 +    }
  1.1849 +
  1.1850 +    int32_t  start32 = pinIndex(start, length);
  1.1851 +    int32_t  limit32 = pinIndex(limit, length);
  1.1852 +    // adjust start, limit if they point to trail half of surrogates
  1.1853 +    if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
  1.1854 +        U_IS_SUPPLEMENTARY(rep->char32At(start32))){
  1.1855 +            start32--;
  1.1856 +    }
  1.1857 +    if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
  1.1858 +        U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
  1.1859 +            limit32--;
  1.1860 +    }
  1.1861 +
  1.1862 +    length=limit32-start32;
  1.1863 +    if(length>destCapacity) {
  1.1864 +        limit32 = start32 + destCapacity;
  1.1865 +    }
  1.1866 +    UnicodeString buffer(dest, 0, destCapacity); // writable alias
  1.1867 +    rep->extractBetween(start32, limit32, buffer);
  1.1868 +    repTextAccess(ut, limit32, TRUE);
  1.1869 +    return u_terminateUChars(dest, destCapacity, length, status);
  1.1870 +}
  1.1871 +
  1.1872 +// Replace the range [start, limit) of the Replaceable text with src; returns the change in length.
  1.1873 +static int32_t U_CALLCONV
  1.1874 +repTextReplace(UText *ut,
  1.1875 +               int64_t start, int64_t limit,
  1.1876 +               const UChar *src, int32_t length,
  1.1877 +               UErrorCode *status) {
  1.1878 +    Replaceable *rep=(Replaceable *)ut->context;
  1.1879 +    int32_t oldLength;
  1.1880 +
  1.1881 +    if(U_FAILURE(*status)) {
  1.1882 +        return 0;
  1.1883 +    }
  1.1884 +    if(src==NULL && length!=0) {
  1.1885 +        *status=U_ILLEGAL_ARGUMENT_ERROR;
  1.1886 +        return 0;
  1.1887 +    }
  1.1888 +    oldLength=rep->length(); // will subtract from new length
  1.1889 +    if(start>limit ) {
  1.1890 +        *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1.1891 +        return 0;
  1.1892 +    }
  1.1893 +
  1.1894 +    int32_t start32 = pinIndex(start, oldLength);
  1.1895 +    int32_t limit32 = pinIndex(limit, oldLength);
  1.1896 +    // Snap start & limit to code point boundaries.
  1.1897 +    if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
  1.1898 +        start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
  1.1899 +    {
  1.1900 +            start32--;
  1.1901 +    }
  1.1902 +    // The limit32>0 test keeps charAt() from being asked for the offset -1.
  1.1903 +    if (limit32>0 && limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
  1.1904 +        U16_IS_TRAIL(rep->charAt(limit32)))
  1.1905 +    {
  1.1906 +            limit32++;
  1.1907 +    }
  1.1908 +
  1.1909 +    // Do the actual replace operation using methods of the Replaceable class
  1.1910 +    UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
  1.1911 +    rep->handleReplaceBetween(start32, limit32, replStr);
  1.1912 +    int32_t newLength = rep->length();
  1.1913 +    int32_t lengthDelta = newLength - oldLength;
  1.1914 +
  1.1915 +    // Is the UText chunk buffer OK?
  1.1916 +    if (ut->chunkNativeLimit > start32) {
  1.1917 +        // this replace operation may have impacted the current chunk.
  1.1918 +        // invalidate it, which will force a reload on the next access.
  1.1919 +        invalidateChunk(ut);
  1.1920 +    }
  1.1921 +
  1.1922 +    // set the iteration position to the end of the newly inserted replacement text.
  1.1923 +    int32_t newIndexPos = limit32 + lengthDelta;
  1.1924 +    repTextAccess(ut, newIndexPos, TRUE);
  1.1925 +    return lengthDelta;
  1.1926 +}
  1.1927 +
  1.1928 +// Copy or move the text range [start, limit) of the Replaceable to destIndex,
  1.1929 +//   then leave the iteration position following the inserted text.
  1.1930 +static void U_CALLCONV
  1.1931 +repTextCopy(UText *ut,
  1.1932 +                int64_t start, int64_t limit,
  1.1933 +                int64_t destIndex,
  1.1934 +                UBool move,
  1.1935 +                UErrorCode *status)
  1.1936 +{
  1.1937 +    Replaceable  *rep        = (Replaceable *)ut->context;
  1.1938 +    const int32_t textLength = rep->length();
  1.1939 +
  1.1940 +    if (U_FAILURE(*status)) {
  1.1941 +        return;
  1.1942 +    }
  1.1943 +    // The destination may not lie strictly inside of the source range.
  1.1944 +    const UBool destInsideSource = (start<destIndex && destIndex<limit);
  1.1945 +    if (start>limit || destInsideSource) {
  1.1946 +        *status = U_INDEX_OUTOFBOUNDS_ERROR;
  1.1947 +        return;
  1.1948 +    }
  1.1949 +
  1.1950 +    int32_t       srcStart = pinIndex(start, textLength);
  1.1951 +    int32_t       srcLimit = pinIndex(limit, textLength);
  1.1952 +    const int32_t destPos  = pinIndex(destIndex, textLength);
  1.1953 +
  1.1954 +    // TODO:  snap input parameters to code point boundaries.
  1.1955 +
  1.1956 +    // Both operations begin by inserting a copy of the source at the destination.
  1.1957 +    rep->copy(srcStart, srcLimit, destPos);
  1.1958 +    if (move) {
  1.1959 +        // move: the original is then replaced with nothing.  If the copy
  1.1960 +        //  was inserted ahead of the original, the original has shifted
  1.1961 +        //  toward the end of the text by the length of the copy.
  1.1962 +        if (destPos < srcStart) {
  1.1963 +            const int32_t segLength = srcLimit - srcStart;
  1.1964 +            srcStart += segLength;
  1.1965 +            srcLimit += segLength;
  1.1966 +        }
  1.1967 +        rep->handleReplaceBetween(srcStart, srcLimit, UnicodeString());
  1.1968 +    }
  1.1969 +
  1.1970 +    // TRUE when a block of text was moved towards the end of the string.
  1.1971 +    const UBool movedTowardEnd = (move && destPos>srcStart);
  1.1972 +
  1.1973 +    // If the change to the text touched the region in the chunk buffer,
  1.1974 +    //  invalidate the buffer.
  1.1975 +    const int32_t firstAffectedIndex = movedTowardEnd ? srcStart : destPos;
  1.1976 +    if (firstAffectedIndex < ut->chunkNativeLimit) {
  1.1977 +        // changes may have affected range covered by the chunk
  1.1978 +        invalidateChunk(ut);
  1.1979 +    }
  1.1980 +
  1.1981 +    // Put the iteration position at the end of the newly inserted (moved) block:
  1.1982 +    //  the destination plus the block length, or for a move towards the end,
  1.1983 +    //  the destination index itself.
  1.1984 +    const int32_t nativeIterIndex = movedTowardEnd ? destPos : destPos + srcLimit - srcStart;
  1.1985 +
  1.1986 +    // Set position, reload chunk if needed.
  1.1987 +    repTextAccess(ut, nativeIterIndex, TRUE);
  1.1988 +}
  1.1989 +
  1.1990 +static const struct UTextFuncs repFuncs = 
  1.1991 +{                      // Provider function table installed by utext_openReplaceable()
  1.1992 +    sizeof(UTextFuncs),
  1.1993 +    0, 0, 0,           // Reserved alignment padding
  1.1994 +    repTextClone,      // clone
  1.1995 +    repTextLength,     // length
  1.1996 +    repTextAccess,     // access
  1.1997 +    repTextExtract,    // extract
  1.1998 +    repTextReplace,   // replace
  1.1999 +    repTextCopy,   // copy
  1.2000 +    NULL,              // MapOffsetToNative: not provided by this provider
  1.2001 +    NULL,              // MapIndexToUTF16: not provided by this provider
  1.2002 +    repTextClose,      // close
  1.2003 +    NULL,              // spare 1
  1.2004 +    NULL,              // spare 2
  1.2005 +    NULL               // spare 3
  1.2006 +};
  1.2007 +
  1.2008 +// Open a read/write UText over a Replaceable.  The Replaceable is referenced, not copied.
  1.2009 +U_CAPI UText * U_EXPORT2
  1.2010 +utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
  1.2011 +{
  1.2012 +    if(U_FAILURE(*status)) {
  1.2013 +        return NULL;
  1.2014 +    }
  1.2015 +    if(rep==NULL) {
  1.2016 +        *status=U_ILLEGAL_ARGUMENT_ERROR;
  1.2017 +        return NULL;
  1.2018 +    }
  1.2019 +    ut = utext_setup(ut, sizeof(ReplExtra), status);
  1.2020 +    if(U_FAILURE(*status)) {
  1.2021 +        return ut;      // setup failed; ut must not be initialized any further
  1.2022 +    }
  1.2023 +    ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1.2024 +    if(rep->hasMetaData()) {
  1.2025 +        ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
  1.2026 +    }
  1.2027 +    ut->pFuncs  = &repFuncs;
  1.2028 +    ut->context =  rep;
  1.2029 +    return ut;
  1.2030 +}
  1.2031 +U_CDECL_END
  1.2032 +
  1.2033 +
  1.2034 +
  1.2035 +
  1.2036 +
  1.2037 +
  1.2038 +
  1.2039 +
  1.2040 +//------------------------------------------------------------------------------
  1.2041 +//
  1.2042 +//     UText implementation for UnicodeString (read/write)  and
  1.2043 +//                    for const UnicodeString (read only)
  1.2044 +//             (same implementation, only the flags are different)
  1.2045 +//
  1.2046 +//         Use of UText data members:
  1.2047 +//            context    pointer to UnicodeString
  1.2048 +//            providerProperties   UTEXT_PROVIDER_OWNS_TEXT is set IF this UText owns
  1.2049 +//                       the string and it must be deleted on close().
  1.2050 +//
  1.2051 +//------------------------------------------------------------------------------
  1.2052 +
  1.2053 +U_CDECL_BEGIN
  1.2054 +
  1.2055 +
  1.2056 +static UText * U_CALLCONV
  1.2057 +unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  1.2058 +    // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  1.2059 +    dest = shallowTextClone(dest, src, status);
  1.2060 +    // For deep clones, make a copy of the UnicodeString.  The copy is owned by the newly
  1.2061 +    //  created UText clone; the OWNS_TEXT provider property tells close() to delete it.
  1.2062 +    if (deep && U_SUCCESS(*status)) {
  1.2063 +        const UnicodeString *srcString = (const UnicodeString *)src->context;
  1.2064 +        UnicodeString *strCopy = new UnicodeString(*srcString);
  1.2065 +        if (strCopy == NULL) {
  1.2066 +            // Allocation failed.  Report it instead of claiming ownership of NULL.
  1.2067 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.2068 +            return dest;
  1.2069 +        }
  1.2070 +        dest->context = strCopy;
  1.2071 +        // with deep clone, the copy is writable, even when the source is not.
  1.2072 +        dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT) | I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1.2073 +    }
  1.2074 +    return dest;
  1.2075 +}
  1.2076 +
  1.2077 +static void U_CALLCONV
  1.2078 +unistrTextClose(UText *ut) {
  1.2079 +    // The generic UText framework close does most of the work.  The only task here
  1.2080 +    //  is to delete the UnicodeString if this UText owns it, as a deep clone does.
  1.2081 +    const UBool ownsText = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) != 0;
  1.2082 +    if (!ownsText) {
  1.2083 +        return;
  1.2084 +    }
  1.2085 +    delete (UnicodeString *)ut->context;
  1.2086 +    ut->context = NULL;
  1.2087 +}
  1.2088 +
  1.2089 +static int64_t U_CALLCONV
  1.2090 +unistrTextLength(UText *t) {
  1.2091 +    const UnicodeString *str = (const UnicodeString *)t->context;
  1.2092 +    return str->length();    // native length is the length of the wrapped UnicodeString
  1.2093 +}
  1.2094 +
  1.2095 +// Access for UnicodeString text.  Only the chunk offset needs to be moved.
  1.2096 +static UBool U_CALLCONV
  1.2097 +unistrTextAccess(UText *ut, int64_t index, UBool  forward) {
  1.2098 +    const int32_t textLength = ut->chunkLength;
  1.2099 +    ut->chunkOffset = pinIndex(index, textLength);
  1.2100 +
  1.2101 +    // TRUE unless the request is at (or beyond) the end of the text in the
  1.2102 +    //  direction of travel: the end going forward, the start going backward.
  1.2103 +    return forward ? (index<textLength) : (index>0);
  1.2104 +}
  1.2105 +
  1.2106 +// Extract [start, limit) of the UnicodeString into dest; returns the length of the adjusted range.
  1.2107 +static int32_t U_CALLCONV
  1.2108 +unistrTextExtract(UText *t,
  1.2109 +                  int64_t start, int64_t limit,
  1.2110 +                  UChar *dest, int32_t destCapacity,
  1.2111 +                  UErrorCode *pErrorCode) {
  1.2112 +    const UnicodeString *us=(const UnicodeString *)t->context;
  1.2113 +    int32_t length=us->length();
  1.2114 +
  1.2115 +    if(U_FAILURE(*pErrorCode)) {
  1.2116 +        return 0;
  1.2117 +    }
  1.2118 +    if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
  1.2119 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1.2120 +        return 0;    // unusable destination: do not move the iteration position
  1.2121 +    }
  1.2122 +    if(start<0 || start>limit) {
  1.2123 +        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1.2124 +        return 0;
  1.2125 +    }
  1.2126 +
  1.2127 +    int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
  1.2128 +    int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
  1.2129 +
  1.2130 +    length=limit32-start32;
  1.2131 +    if (destCapacity>0 && dest!=NULL) {
  1.2132 +        int32_t trimmedLength = length;
  1.2133 +        if(trimmedLength>destCapacity) {
  1.2134 +            trimmedLength=destCapacity;
  1.2135 +        }
  1.2136 +        us->extract(start32, trimmedLength, dest);
  1.2137 +        t->chunkOffset = start32+trimmedLength;
  1.2138 +    } else {
  1.2139 +        t->chunkOffset = start32;
  1.2140 +    }
  1.2141 +    u_terminateUChars(dest, destCapacity, length, pErrorCode);
  1.2142 +    return length;
  1.2143 +}
  1.2144 +
  1.2145 +// Replace the range [start, limit) of the UnicodeString with src; returns the change in length.
  1.2146 +static int32_t U_CALLCONV
  1.2147 +unistrTextReplace(UText *ut,
  1.2148 +                  int64_t start, int64_t limit,
  1.2149 +                  const UChar *src, int32_t length,
  1.2150 +                  UErrorCode *pErrorCode) {
  1.2151 +    UnicodeString *us=(UnicodeString *)ut->context;
  1.2152 +    int32_t oldLength;
  1.2153 +    if(U_FAILURE(*pErrorCode)) {
  1.2154 +        return 0;
  1.2155 +    }
  1.2156 +    if(src==NULL && length!=0) {
  1.2157 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1.2158 +        return 0;    // no source text: leave the string untouched
  1.2159 +    }
  1.2160 +    if(start>limit) {
  1.2161 +        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1.2162 +        return 0;
  1.2163 +    }
  1.2164 +    oldLength=us->length();
  1.2165 +    int32_t start32 = pinIndex(start, oldLength);
  1.2166 +    int32_t limit32 = pinIndex(limit, oldLength);
  1.2167 +    if (start32 < oldLength) {
  1.2168 +        start32 = us->getChar32Start(start32);
  1.2169 +    }
  1.2170 +    if (limit32 < oldLength) {
  1.2171 +        limit32 = us->getChar32Start(limit32);
  1.2172 +    }
  1.2173 +
  1.2174 +    // replace
  1.2175 +    us->replace(start32, limit32-start32, src, length);
  1.2176 +    int32_t newLength = us->length();
  1.2177 +
  1.2178 +    // Update the chunk description.
  1.2179 +    ut->chunkContents    = us->getBuffer();
  1.2180 +    ut->chunkLength      = newLength;
  1.2181 +    ut->chunkNativeLimit = newLength;
  1.2182 +    ut->nativeIndexingLimit = newLength;
  1.2183 +
  1.2184 +    // Set iteration position to the point just following the newly inserted text.
  1.2185 +    int32_t lengthDelta = newLength - oldLength;
  1.2186 +    ut->chunkOffset = limit32 + lengthDelta;
  1.2187 +    return lengthDelta;
  1.2188 +}
  1.2189 +
  1.2190 +// Copy or move [start, limit) of the UnicodeString to destIndex; the position ends up after the inserted text.
  1.2191 +static void U_CALLCONV
  1.2192 +unistrTextCopy(UText *ut,
  1.2193 +               int64_t start, int64_t limit,
  1.2194 +               int64_t destIndex,
  1.2195 +               UBool move,
  1.2196 +               UErrorCode *pErrorCode) {
  1.2197 +    UnicodeString *us=(UnicodeString *)ut->context;
  1.2198 +    int32_t length=us->length();
  1.2199 +
  1.2200 +    if(U_FAILURE(*pErrorCode)) {
  1.2201 +        return;
  1.2202 +    }
  1.2203 +    int32_t start32 = pinIndex(start, length);
  1.2204 +    int32_t limit32 = pinIndex(limit, length);
  1.2205 +    int32_t destIndex32 = pinIndex(destIndex, length);
  1.2206 +
  1.2207 +    if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
  1.2208 +        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1.2209 +        return;
  1.2210 +    }
  1.2211 +
  1.2212 +    const int32_t segLength=limit32-start32;   // taken here, before start32 is adjusted for a move
  1.2213 +    if(move) {
  1.2214 +        // move: copy to destIndex, then replace original with nothing
  1.2215 +        us->copy(start32, limit32, destIndex32);
  1.2216 +        if(destIndex32<start32) {
  1.2217 +            start32+=segLength;
  1.2218 +        }
  1.2219 +        us->replace(start32, segLength, NULL, 0);
  1.2220 +    } else {
  1.2221 +        // copy
  1.2222 +        us->copy(start32, limit32, destIndex32);
  1.2223 +    }
  1.2224 +
  1.2225 +    // update chunk description, set iteration position.
  1.2226 +    ut->chunkContents = us->getBuffer();
  1.2227 +    if (move==FALSE) {
  1.2228 +        // copy operation, string length grows
  1.2229 +        ut->chunkLength += segLength;
  1.2230 +        ut->chunkNativeLimit = ut->chunkLength;
  1.2231 +        ut->nativeIndexingLimit = ut->chunkLength;
  1.2232 +    }
  1.2233 +
  1.2234 +    // Iteration position to end of the newly inserted text.
  1.2235 +    ut->chunkOffset = destIndex32+segLength;
  1.2236 +    if (move && destIndex32>start32) {
  1.2237 +        ut->chunkOffset = destIndex32;
  1.2238 +    }
  1.2239 +}
  1.2240 +// Function table for UnicodeString based UTexts; installed by utext_openConstUnicodeString().
  1.2241 +static const struct UTextFuncs unistrFuncs = 
  1.2242 +{
  1.2243 +    sizeof(UTextFuncs),
  1.2244 +    0, 0, 0,             // Reserved alignment padding
  1.2245 +    unistrTextClone,     // Clone
  1.2246 +    unistrTextLength,    // Length
  1.2247 +    unistrTextAccess,    // Access
  1.2248 +    unistrTextExtract,   // Extract
  1.2249 +    unistrTextReplace,   // Replace
  1.2250 +    unistrTextCopy,      // Copy
  1.2251 +    NULL,                // MapOffsetToNative,
  1.2252 +    NULL,                // MapIndexToUTF16,
  1.2253 +    unistrTextClose,     // Close
  1.2254 +    NULL,                // spare 1
  1.2255 +    NULL,                // spare 2
  1.2256 +    NULL                 // spare 3
  1.2257 +};
  1.2258 +
  1.2259 +
  1.2260 +
  1.2261 +U_CDECL_END
  1.2262 +// Open a writable UText over a UnicodeString: const-variant setup plus the WRITABLE property.
  1.2263 +U_CAPI UText * U_EXPORT2
  1.2264 +utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
  1.2265 +    UText *opened = utext_openConstUnicodeString(ut, s, status);
  1.2266 +    if (U_FAILURE(*status)) {
  1.2267 +        return opened;
  1.2268 +    }
  1.2269 +    opened->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1.2270 +    return opened;
  1.2271 +}
  1.2272 +// Open a read-only UText over a UnicodeString; the whole string forms a single chunk.
  1.2273 +U_CAPI UText * U_EXPORT2
  1.2274 +utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
  1.2275 +    if (U_SUCCESS(*status) && s->isBogus()) {
  1.2276 +        // The UnicodeString is bogus, but we still need to detach the UText
  1.2277 +        //   from whatever it was hooked to before, if anything.
  1.2278 +        // Nothing to detach when ut==NULL; the UText returned by the call would only be lost.
  1.2279 +        if (ut != NULL) {
  1.2280 +            utext_openUChars(ut, NULL, 0, status);
  1.2281 +        }
  1.2282 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2283 +        return ut;
  1.2284 +    }
  1.2285 +    ut = utext_setup(ut, 0, status);
  1.2286 +    //    note:  the standard (writable) function table is used for UnicodeString;
  1.2287 +    //           the flag settings disable writing, so that is harmless.
  1.2288 +    if (U_SUCCESS(*status)) {
  1.2289 +        ut->pFuncs              = &unistrFuncs;
  1.2290 +        ut->context             = s;
  1.2291 +        ut->providerProperties  = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
  1.2292 +        ut->chunkContents       = s->getBuffer();
  1.2293 +        ut->chunkLength         = s->length();
  1.2294 +        ut->chunkNativeStart    = 0;
  1.2295 +        ut->chunkNativeLimit    = ut->chunkLength;
  1.2296 +        ut->nativeIndexingLimit = ut->chunkLength;
  1.2297 +    }
  1.2298 +    return ut;
  1.2299 +}
  1.2300 +
  1.2301 +//------------------------------------------------------------------------------
  1.2302 +//
  1.2303 +//     UText implementation for const UChar * strings
  1.2304 +//
  1.2305 +//         Use of UText data members:
  1.2306 +//            context    pointer to the UChar string
  1.2307 +//            a          length.  -1 if not yet known.
  1.2308 +//
  1.2309 +//         TODO:  support 64 bit lengths.
  1.2310 +//
  1.2311 +//------------------------------------------------------------------------------
  1.2312 +
  1.2313 +U_CDECL_BEGIN
  1.2314 +
  1.2315 +
  1.2316 +// Clone-function for UText over a UChar string.
  1.2317 +//   A shallow clone shares the source's string.  A deep clone gets its own
  1.2318 +//   NUL-terminated heap copy, marked with UTEXT_PROVIDER_OWNS_TEXT so that
  1.2319 +//   ucstrTextClose() frees it.
  1.2320 +static UText * U_CALLCONV
  1.2321 +ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
  1.2322 +    // First do a generic shallow clone.
  1.2323 +    dest = shallowTextClone(dest, src, status);
  1.2324 +
  1.2325 +    // For deep clones, make a copy of the string that the new clone owns.
  1.2326 +    if (deep && U_SUCCESS(*status)) {
  1.2327 +        U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
  1.2328 +        int32_t  len = (int32_t)utext_nativeLength(dest);
  1.2329 +
  1.2330 +        // The cloned string IS going to be NUL terminated, whether or not the original was.
  1.2331 +        const UChar *srcStr = (const UChar *)src->context;
  1.2332 +        UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
  1.2333 +        if (copyStr == NULL) {
  1.2334 +            *status = U_MEMORY_ALLOCATION_ERROR;
  1.2335 +        } else {
  1.2336 +            u_memcpy(copyStr, srcStr, len);
  1.2337 +            copyStr[len] = 0;
  1.2338 +            dest->context = copyStr;
  1.2339 +            // The shallow clone presumably carried over the source's chunkContents; aim it
  1.2340 +            //   at the copy so the clone does not keep reading the source's storage.
  1.2341 +            dest->chunkContents = copyStr;
  1.2342 +            dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1.2343 +        }
  1.2344 +    }
  1.2345 +    return dest;
  1.2346 +}
  1.2347 +
  1.2348 +
  1.2349 +// Close-function for UText over a UChar string.
  1.2350 +//   The generic framework close handles the UText itself; the only provider duty
  1.2351 +//   is to release the string copy when this UText owns it (set by a deep clone).
  1.2352 +static void U_CALLCONV
  1.2353 +ucstrTextClose(UText *ut) {
  1.2354 +    const int32_t ownsText = ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1.2355 +    if (ownsText == 0) {
  1.2356 +        return;
  1.2357 +    }
  1.2358 +    uprv_free((UChar *)ut->context);
  1.2359 +    ut->context = NULL;
  1.2360 +}
  1.2360 +// Length-function for UText over a UChar string.
  1.2361 +//   ut->a holds the length, or a negative value while the length of a
  1.2362 +//   NUL-terminated string is still unknown.  In that case finish the scan for
  1.2363 +//   the terminator, record the length, and widen the chunk to the full string.
  1.2364 +static int64_t U_CALLCONV
  1.2365 +ucstrTextLength(UText *ut) {
  1.2366 +    if (ut->a >= 0) {
  1.2367 +        return ut->a;
  1.2368 +    }
  1.2369 +    // Resume where earlier scanning stopped (chunkNativeLimit).  Access() is not
  1.2370 +    //    used for this because the current iteration position can't be changed.
  1.2371 +    const UChar *text = (const UChar *)ut->context;
  1.2372 +    int64_t end = ut->chunkNativeLimit;
  1.2373 +    while (text[end] != 0) {
  1.2374 +        end++;
  1.2375 +    }
  1.2376 +    ut->chunkNativeLimit = end;
  1.2377 +    ut->a = end;
  1.2378 +    ut->chunkLength = (int32_t)end;
  1.2379 +    ut->nativeIndexingLimit = ut->chunkLength;
  1.2380 +    ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1.2381 +    return ut->a;
  1.2382 +}
  1.2383 +// Access-function for UText over a UChar string: pins 'index', snaps it to a code point
  1.2384 +//   start, scans further into a string of still-unknown length if needed, sets chunkOffset.
  1.2385 +static UBool U_CALLCONV
  1.2386 +ucstrTextAccess(UText *ut, int64_t index, UBool  forward) {
  1.2387 +    const UChar *str   = (const UChar *)ut->context;
  1.2388 +
  1.2389 +    // pin the requested index to the bounds of the string,
  1.2390 +    //  and set current iteration position.
  1.2391 +    if (index<0) {
  1.2392 +        index = 0;
  1.2393 +    } else if (index < ut->chunkNativeLimit) {
  1.2394 +        // The requested data is within the chunk as it is known so far.
  1.2395 +        // Put index on a code point boundary.
  1.2396 +        U16_SET_CP_START(str, 0, index);
  1.2397 +    } else if (ut->a >= 0) {
  1.2398 +        // We know the length of this string, and the user is requesting something
  1.2399 +        // at or beyond the length.  Pin the requested index to the length.
  1.2400 +        index = ut->a;
  1.2401 +    } else {
  1.2402 +        // Null terminated string, length not yet known, and the requested index
  1.2403 +        //  is beyond where we have scanned so far.
  1.2404 +        //  Scan to 32 UChars beyond the requested index.  The strategy here is
  1.2405 +        //  to avoid fully scanning a long string when the caller only wants to
  1.2406 +        //  see a few characters at its beginning.
  1.2407 +        int32_t scanLimit = (int32_t)index + 32;
  1.2408 +        if ((index + 32)>INT32_MAX || (index + 32)<0 ) {   // note: int64 expression
  1.2409 +            scanLimit = INT32_MAX;
  1.2410 +        }
  1.2411 +
  1.2412 +        int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;   // resume after what was already scanned
  1.2413 +        for (; chunkLimit<scanLimit; chunkLimit++) {
  1.2414 +            if (str[chunkLimit] == 0) {
  1.2415 +                // We found the end of the string.  Remember it, pin the requested index to it,
  1.2416 +                //  and bail out of here.
  1.2417 +                ut->a = chunkLimit;
  1.2418 +                ut->chunkLength = chunkLimit;
  1.2419 +                ut->nativeIndexingLimit = chunkLimit;
  1.2420 +                if (index >= chunkLimit) {
  1.2421 +                    index = chunkLimit;
  1.2422 +                } else {
  1.2423 +                    U16_SET_CP_START(str, 0, index);
  1.2424 +                }
  1.2425 +
  1.2426 +                ut->chunkNativeLimit = chunkLimit;
  1.2427 +                ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1.2428 +                goto breakout;
  1.2429 +            }
  1.2430 +        }
  1.2431 +        // We scanned through the next batch of UChars without finding the end.
  1.2432 +        U16_SET_CP_START(str, 0, index);
  1.2433 +        if (chunkLimit == INT32_MAX) {
  1.2434 +            // Scanned to the limit of a 32 bit length.
  1.2435 +            // Forcibly trim the overlength string back so length fits in int32
  1.2436 +            //  TODO:  add support for 64 bit strings.
  1.2437 +            ut->a = chunkLimit;
  1.2438 +            ut->chunkLength = chunkLimit;
  1.2439 +            ut->nativeIndexingLimit = chunkLimit;
  1.2440 +            if (index > chunkLimit) {
  1.2441 +                index = chunkLimit;
  1.2442 +            }
  1.2443 +            ut->chunkNativeLimit = chunkLimit;
  1.2444 +            ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1.2445 +        } else {
  1.2446 +            // The endpoint of a chunk must not be left in the middle of a surrogate pair.
  1.2447 +            // If the current end is on a lead surrogate, back the end up by one.
  1.2448 +            // It doesn't matter if the end char happens to be an unpaired surrogate,
  1.2449 +            //    and it's simpler not to worry about it.
  1.2450 +            if (U16_IS_LEAD(str[chunkLimit-1])) {
  1.2451 +                --chunkLimit;
  1.2452 +            }
  1.2453 +            // Null-terminated chunk with end still unknown.
  1.2454 +            // Update the chunk length to reflect what has been scanned thus far.
  1.2455 +            // That the full length is still unknown is (still) flagged by
  1.2456 +            //    ut->a being < 0.
  1.2457 +            ut->chunkNativeLimit = chunkLimit;
  1.2458 +            ut->nativeIndexingLimit = chunkLimit;
  1.2459 +            ut->chunkLength = chunkLimit;
  1.2460 +        }
  1.2461 +
  1.2462 +    }
  1.2463 +breakout:
  1.2464 +    U_ASSERT(index<=INT32_MAX);
  1.2465 +    ut->chunkOffset = (int32_t)index;
  1.2466 +
  1.2467 +    // Returns FALSE when the request is at the end (forward) or at the start (backward).
  1.2468 +    UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
  1.2469 +    return retVal;
  1.2470 +}
  1.2471 +// Extract-function for UText over a UChar string.
  1.2472 +//   Copies native range [start, limit) into dest, completing a surrogate pair that
  1.2473 +//   straddles the limit, and returns di, the UChar count of the range (it can exceed
  1.2474 +//   destCapacity).  NUL termination and status are delegated to u_terminateUChars().
  1.2475 +static int32_t U_CALLCONV
  1.2476 +ucstrTextExtract(UText *ut,
  1.2477 +                  int64_t start, int64_t limit,
  1.2478 +                  UChar *dest, int32_t destCapacity,
  1.2479 +                  UErrorCode *pErrorCode)
  1.2480 +{
  1.2481 +    if(U_FAILURE(*pErrorCode)) {
  1.2482 +        return 0;
  1.2483 +    }
  1.2484 +    if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
  1.2485 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1.2486 +        return 0;
  1.2487 +    }
  1.2488 +    int32_t si, di;
  1.2489 +    int32_t start32;
  1.2490 +    int32_t limit32;
  1.2491 +    // Access the start.  Does two things we need:
  1.2492 +    //   Pins 'start' to the length of the string, if it came in out-of-bounds.
  1.2493 +    //   Snaps 'start' to the beginning of a code point.
  1.2494 +    ucstrTextAccess(ut, start, TRUE);
  1.2495 +    const UChar *s=ut->chunkContents;
  1.2496 +    start32 = ut->chunkOffset;
  1.2497 +    int32_t strLength=(int32_t)ut->a;
  1.2498 +    if (strLength >= 0) {
  1.2499 +        limit32 = pinIndex(limit, strLength);
  1.2500 +    } else {
  1.2501 +        limit32 = pinIndex(limit, INT32_MAX);
  1.2502 +    }
  1.2503 +    di = 0;
  1.2504 +    for (si=start32; si<limit32; si++) {
  1.2505 +        if (strLength<0 && s[si]==0) {
  1.2506 +            // Just hit the end of a null-terminated string.
  1.2507 +            ut->a = si;               // set string length for this UText
  1.2508 +            ut->chunkNativeLimit    = si;
  1.2509 +            ut->chunkLength         = si;
  1.2510 +            ut->nativeIndexingLimit = si;
  1.2511 +            strLength               = si;
  1.2512 +            break;
  1.2513 +        }
  1.2514 +        U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
  1.2515 +        if (di<destCapacity) {
  1.2516 +            // only store if there is space.
  1.2517 +            dest[di] = s[si];
  1.2518 +        } else {
  1.2519 +            if (strLength>=0) {
  1.2520 +                // We have filled the destination buffer, and the string length is known.
  1.2521 +                //  Cut the loop short.  There is no need to scan string termination.
  1.2522 +                di = limit32 - start32;
  1.2523 +                si = limit32;
  1.2524 +                break;
  1.2525 +            }
  1.2526 +        }
  1.2527 +        di++;
  1.2528 +    }
  1.2529 +    // If the limit index points to a lead surrogate of a pair,
  1.2530 +    //   add the corresponding trail surrogate to the destination.
  1.2531 +    if (si>0 && U16_IS_LEAD(s[si-1]) &&
  1.2532 +        ((si<strLength || strLength<0)  && U16_IS_TRAIL(s[si])))
  1.2533 +    {
  1.2534 +        if (di<destCapacity) {
  1.2535 +            // store only if there is space in the output buffer.
  1.2536 +            dest[di++] = s[si++];
  1.2537 +        }
  1.2538 +    }
  1.2539 +    // Put iteration position at the point just following the extracted text.
  1.2540 +    //   Use the source index actually reached: strLength is still negative when no
  1.2541 +    //   terminator has been seen, and start32+destCapacity can exceed the int32_t range.
  1.2542 +    if (si <= ut->chunkNativeLimit) {
  1.2543 +        ut->chunkOffset = si;
  1.2544 +    } else {
  1.2545 +        // Past the part of the string scanned so far; Access() extends the chunk.
  1.2546 +        ucstrTextAccess(ut, si, TRUE);
  1.2547 +    }
  1.2548 +    // Add a terminating NUL if space in the buffer permits,
  1.2549 +    // and set the error status as required.
  1.2550 +    u_terminateUChars(dest, destCapacity, di, pErrorCode);
  1.2551 +    return di;
  1.2552 +}
  1.2553 +// Function table for const UChar * based UTexts; installed by utext_openUChars().
  1.2554 +static const struct UTextFuncs ucstrFuncs = 
  1.2555 +{
  1.2556 +    sizeof(UTextFuncs),
  1.2557 +    0, 0, 0,           // Reserved alignment padding
  1.2558 +    ucstrTextClone,    // Clone
  1.2559 +    ucstrTextLength,   // Length
  1.2560 +    ucstrTextAccess,   // Access
  1.2561 +    ucstrTextExtract,  // Extract
  1.2562 +    NULL,              // Replace
  1.2563 +    NULL,              // Copy
  1.2564 +    NULL,              // MapOffsetToNative,
  1.2565 +    NULL,              // MapIndexToUTF16,
  1.2566 +    ucstrTextClose,    // Close
  1.2567 +    NULL,              // spare 1
  1.2568 +    NULL,              // spare 2
  1.2569 +    NULL,              // spare 3
  1.2570 +};
  1.2571 +
  1.2572 +U_CDECL_END
  1.2573 +static const UChar gEmptyUString[] = {0};   // substituted for a (NULL, 0) input
  1.2574 +// Open a read-only UText over a UChar string: 'length' code units, or NUL-terminated
  1.2575 +//   when length is -1, in which case the length is only determined when needed.
  1.2576 +U_CAPI UText * U_EXPORT2
  1.2577 +utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
  1.2578 +    if (U_FAILURE(*status)) {
  1.2579 +        return NULL;
  1.2580 +    }
  1.2581 +    if (s==NULL && length==0) {
  1.2582 +        s = gEmptyUString;
  1.2583 +    }
  1.2584 +    if (s==NULL || length < -1 || length>INT32_MAX) {
  1.2585 +        *status = U_ILLEGAL_ARGUMENT_ERROR;
  1.2586 +        return NULL;
  1.2587 +    }
  1.2588 +    ut = utext_setup(ut, 0, status);
  1.2589 +    if (U_FAILURE(*status)) {
  1.2590 +        return ut;
  1.2591 +    }
  1.2592 +    const UBool lengthUnknown = (length == -1);
  1.2593 +    ut->pFuncs              = &ucstrFuncs;
  1.2594 +    ut->context             = s;
  1.2595 +    ut->providerProperties  = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS) |
  1.2596 +                              (lengthUnknown ? I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE) : 0);
  1.2597 +    ut->a                   = length;
  1.2598 +    ut->chunkContents       = s;
  1.2599 +    ut->chunkNativeStart    = 0;
  1.2600 +    ut->chunkNativeLimit    = lengthUnknown ? 0 : length;
  1.2601 +    ut->chunkLength         = (int32_t)ut->chunkNativeLimit;
  1.2602 +    ut->chunkOffset         = 0;
  1.2603 +    ut->nativeIndexingLimit = ut->chunkLength;
  1.2604 +    return ut;
  1.2605 +}
  1.2606 +
  1.2607 +
  1.2608 +//------------------------------------------------------------------------------
  1.2609 +//
  1.2610 +//     UText implementation for text from ICU CharacterIterators
  1.2611 +//
  1.2612 +//         Use of UText data members:
  1.2613 +//            context    pointer to the CharacterIterator
  1.2614 +//            a          length of the full text.
  1.2615 +//            p          pointer to  buffer 1
  1.2616 +//            b          start index of local buffer 1 contents
  1.2617 +//            q          pointer to buffer 2
  1.2618 +//            c          start index of local buffer 2 contents
  1.2619 +//            r          pointer to the character iterator if the UText owns it.
  1.2620 +//                       Null otherwise.
  1.2621 +//
  1.2622 +//------------------------------------------------------------------------------
  1.2623 +#define CIBufSize 16
  1.2624 +
  1.2625 +U_CDECL_BEGIN
  1.2626 +// Close-function for CharacterIterator based UText.
  1.2627 +//   ut->r holds the iterator only when this UText owns it (it was created by
  1.2628 +//   cloning) and is NULL otherwise; everything else is done by the framework close.
  1.2629 +static void U_CALLCONV
  1.2630 +charIterTextClose(UText *ut) {
  1.2631 +    CharacterIterator *owned = (CharacterIterator *)ut->r;
  1.2632 +    ut->r = NULL;
  1.2633 +    delete owned;
  1.2634 +}
  1.2635 +// Length-function for CharacterIterator based UText: the text length kept in ut->a.
  1.2636 +static int64_t U_CALLCONV
  1.2637 +charIterTextLength(UText *ut) {
  1.2638 +    return static_cast<int32_t>(ut->a);
  1.2639 +}
  1.2640 +// Access-function for CharacterIterator based UText.  Text is served in CIBufSize-aligned
  1.2641 +//   chunks from two buffers (ut->p holds text from native index ut->b, ut->q from ut->c).
  1.2642 +//   Pins 'index', makes the needed chunk current, sets chunkOffset; TRUE if text is available.
  1.2643 +static UBool U_CALLCONV
  1.2644 +charIterTextAccess(UText *ut, int64_t index, UBool  forward) {
  1.2645 +    CharacterIterator *ci   = (CharacterIterator *)ut->context;
  1.2646 +    // Pin in 64 bits before narrowing; narrowing first would wrap a large index into the text.
  1.2647 +    int32_t clippedIndex;
  1.2648 +    if (index<0) {
  1.2649 +        clippedIndex=0;
  1.2650 +    } else if (index>=ut->a) {
  1.2651 +        clippedIndex=(int32_t)ut->a;
  1.2652 +    } else {
  1.2653 +        clippedIndex=(int32_t)index;
  1.2654 +    }
  1.2655 +    int32_t neededIndex = clippedIndex;
  1.2656 +    if (!forward && neededIndex>0) {
  1.2657 +        // reverse iteration, want the position just before what was asked for.
  1.2658 +        neededIndex--;
  1.2659 +    } else if (forward && neededIndex==ut->a && neededIndex>0) {
  1.2660 +        // Forward iteration, don't ask for something past the end of the text.
  1.2661 +        neededIndex--;
  1.2662 +    }
  1.2663 +    // Find the native index of the start of the buffer containing what we want.
  1.2664 +    neededIndex -= neededIndex % CIBufSize;
  1.2665 +    UChar *buf = NULL;
  1.2666 +    UBool  needChunkSetup = TRUE;
  1.2667 +    int    i;
  1.2668 +    if (ut->chunkNativeStart == neededIndex) {
  1.2669 +        needChunkSetup = FALSE;     // The buffer we want is already the current chunk.
  1.2670 +    } else if (ut->b == neededIndex) {
  1.2671 +        buf = (UChar *)ut->p;       // The first buffer (buffer p) has what we need.
  1.2672 +    } else if (ut->c == neededIndex) {
  1.2673 +        buf = (UChar *)ut->q;       // The second buffer (buffer q) has what we need.
  1.2674 +    } else {
  1.2675 +        // Neither buffer already has what we need.  Load new data from the character
  1.2676 +        //   iterator into the buffer that is not the current chunk, and record which
  1.2677 +        //   text it now holds so that the two tests above can find it again.
  1.2678 +        if (ut->p == ut->chunkContents) {
  1.2679 +            buf = (UChar *)ut->q;
  1.2680 +            ut->c = neededIndex;
  1.2681 +        } else {
  1.2682 +            buf = (UChar *)ut->p;
  1.2683 +            ut->b = neededIndex;
  1.2684 +        }
  1.2685 +        ci->setIndex(neededIndex);
  1.2686 +        for (i=0; i<CIBufSize; i++) {
  1.2687 +            buf[i] = ci->nextPostInc();
  1.2688 +            if (i+neededIndex > ut->a) {
  1.2689 +                break;
  1.2690 +            }
  1.2691 +        }
  1.2692 +    }
  1.2693 +    // We have a buffer with the data we need; make it the current chunk if it wasn't already.
  1.2694 +    if (needChunkSetup) {
  1.2695 +        ut->chunkContents = buf;
  1.2696 +        ut->chunkLength   = CIBufSize;
  1.2697 +        ut->chunkNativeStart = neededIndex;
  1.2698 +        ut->chunkNativeLimit = neededIndex + CIBufSize;
  1.2699 +        if (ut->chunkNativeLimit > ut->a) {
  1.2700 +            ut->chunkNativeLimit = ut->a;
  1.2701 +            ut->chunkLength  = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
  1.2702 +        }
  1.2703 +        ut->nativeIndexingLimit = ut->chunkLength;
  1.2704 +    }
  1.2705 +    ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
  1.2706 +    U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);   // checked once it is set
  1.2707 +    UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
  1.2708 +    return success;
  1.2709 +}
  1.2710 +// Clone-function for CharacterIterator based UText.  Deep clones are unsupported;
  1.2711 +//   a shallow clone wraps, and owns via ut->r, a clone of the source's iterator.
  1.2712 +static UText * U_CALLCONV
  1.2713 +charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
  1.2714 +    if (U_FAILURE(*status)) {
  1.2715 +        return NULL;
  1.2716 +    }
  1.2717 +    if (deep) {
  1.2718 +        // There is no CharacterIterator API for cloning the underlying text storage.
  1.2719 +        *status = U_UNSUPPORTED_ERROR;
  1.2720 +        return NULL;
  1.2721 +    }
  1.2722 +    CharacterIterator *ciCopy = ((CharacterIterator *)src->context)->clone();
  1.2723 +    dest = utext_openCharacterIterator(dest, ciCopy, status);
  1.2724 +    if (U_FAILURE(*status)) {
  1.2725 +        delete ciCopy;      // the open failed, so no UText took ownership of the copy
  1.2726 +        return dest;        // may be NULL; must not be dereferenced below
  1.2727 +    }
  1.2728 +    // Const is cast off for getNativeIndex; for CharacterIterator UTexts the operation is const.
  1.2729 +    utext_setNativeIndex(dest, utext_getNativeIndex((UText *)src));
  1.2730 +    dest->r = ciCopy;    // flags that this UText owns the CharacterIterator
  1.2731 +    return dest;
  1.2732 +}
  1.2733 +// Extract-function for CharacterIterator based UText.
  1.2734 +//   Copies whole code points of native range [start, limit) into dest while they fit;
  1.2735 +//   after that only the needed length is counted.  The iteration position ends up
  1.2736 +//   after the last code point that was copied.
  1.2737 +static int32_t U_CALLCONV
  1.2738 +charIterTextExtract(UText *ut,
  1.2739 +                  int64_t start, int64_t limit,
  1.2740 +                  UChar *dest, int32_t destCapacity,
  1.2741 +                  UErrorCode *status)
  1.2742 +{
  1.2743 +    if(U_FAILURE(*status)) {
  1.2744 +        return 0;
  1.2745 +    }
  1.2746 +    const UBool badArgs = destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit;
  1.2747 +    if(badArgs) {
  1.2748 +        *status=U_ILLEGAL_ARGUMENT_ERROR;
  1.2749 +        return 0;
  1.2750 +    }
  1.2751 +    const int32_t textLength = (int32_t)ut->a;
  1.2752 +    const int32_t from = pinIndex(start, textLength);
  1.2753 +    const int32_t to   = pinIndex(limit, textLength);
  1.2754 +
  1.2755 +    CharacterIterator *iter = (CharacterIterator *)ut->context;
  1.2756 +    iter->setIndex32(from);   // Moves ix to lead of surrogate pair, if needed.
  1.2757 +    int32_t written   = 0;
  1.2758 +    int32_t copiedEnd = iter->getIndex();
  1.2759 +    for (int32_t pos = copiedEnd; pos < to; ) {
  1.2760 +        const UChar32 cp    = iter->next32PostInc();
  1.2761 +        const int32_t units = U16_LENGTH(cp);
  1.2762 +        U_ASSERT(written+units>0); /* to ensure written+units never exceeds MAX_INT32, which must not happen logically */
  1.2763 +        if (written+units > destCapacity) {
  1.2764 +            written += units;
  1.2765 +            *status = U_BUFFER_OVERFLOW_ERROR;
  1.2766 +        } else {
  1.2767 +            U16_APPEND_UNSAFE(dest, written, cp);
  1.2768 +            copiedEnd = pos+units;
  1.2769 +        }
  1.2770 +        pos += units;
  1.2771 +    }
  1.2772 +
  1.2773 +    charIterTextAccess(ut, copiedEnd, TRUE);
  1.2774 +    u_terminateUChars(dest, destCapacity, written, status);
  1.2775 +    return written;
  1.2776 +}
  1.2777 +// Function table for CharacterIterator based UTexts; installed by utext_openCharacterIterator().
  1.2778 +static const struct UTextFuncs charIterFuncs = 
  1.2779 +{
  1.2780 +    sizeof(UTextFuncs),
  1.2781 +    0, 0, 0,             // Reserved alignment padding
  1.2782 +    charIterTextClone,   // Clone
  1.2783 +    charIterTextLength,  // Length
  1.2784 +    charIterTextAccess,  // Access
  1.2785 +    charIterTextExtract, // Extract
  1.2786 +    NULL,                // Replace
  1.2787 +    NULL,                // Copy
  1.2788 +    NULL,                // MapOffsetToNative,
  1.2789 +    NULL,                // MapIndexToUTF16,
  1.2790 +    charIterTextClose,   // Close
  1.2791 +    NULL,                // spare 1
  1.2792 +    NULL,                // spare 2
  1.2793 +    NULL                 // spare 3
  1.2794 +};
  1.2795 +U_CDECL_END
  1.2796 +// Open a read-only UText over a CharacterIterator whose indexing starts at zero.
  1.2797 +//   Chunk contents live in two CIBufSize-UChar buffers carved out of the UText's
  1.2798 +//   extra storage.
  1.2799 +U_CAPI UText * U_EXPORT2
  1.2800 +utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
  1.2801 +    if (U_FAILURE(*status)) {
  1.2802 +        return NULL;
  1.2803 +    }
  1.2804 +    if (ci->startIndex() > 0) {
  1.2805 +        // No support for CharacterIterators that do not start indexing from zero.
  1.2806 +        *status = U_UNSUPPORTED_ERROR;
  1.2807 +        return NULL;
  1.2808 +    }
  1.2809 +    // Extra space in UText for 2 buffers of CIBufSize UChars each.
  1.2810 +    const int32_t bufferBytes = 2 * CIBufSize * sizeof(UChar);
  1.2811 +    ut = utext_setup(ut, bufferBytes, status);
  1.2812 +    if (U_FAILURE(*status)) {
  1.2813 +        return ut;
  1.2814 +    }
  1.2815 +    UChar *firstBuf  = (UChar *)ut->pExtra;
  1.2816 +    UChar *secondBuf = firstBuf + CIBufSize;
  1.2817 +    ut->pFuncs             = &charIterFuncs;
  1.2818 +    ut->context            = ci;
  1.2819 +    ut->providerProperties = 0;
  1.2820 +    ut->a                  = ci->endIndex();   // Length of text
  1.2821 +    ut->p                  = firstBuf;         // First buffer
  1.2822 +    ut->b                  = -1;               // Native index of first buffer contents
  1.2823 +    ut->q                  = secondBuf;        // Second buffer
  1.2824 +    ut->c                  = -1;               // Native index of second buffer contents
  1.2825 +
  1.2826 +    // Begin with an empty current chunk; the first access will fault something in.
  1.2827 +    //   chunkNativeStart and chunkOffset must sum to zero so that getNativeIndex()
  1.2828 +    //   computes zero before any Access() call, but they cannot both be zero
  1.2829 +    //   without Access() thinking that the chunk is valid.
  1.2830 +    ut->chunkContents       = firstBuf;
  1.2831 +    ut->chunkNativeStart    = -1;
  1.2832 +    ut->chunkOffset         = 1;
  1.2833 +    ut->chunkNativeLimit    = 0;
  1.2834 +    ut->chunkLength         = 0;
  1.2835 +    ut->nativeIndexingLimit = ut->chunkOffset;  // enables native indexing
  1.2836 +    return ut;
  1.2837 +}
  1.2838 +
  1.2839 +
  1.2840 +
The Tor Browser / file diff

diff: intl/icu/source/common/utext.cpp

intl/icu/source/common/utext.cpp