intl/icu/source/common/utext.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2005-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  utext.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2005apr12
    14 *   created by: Markus W. Scherer
    15 */
    17 #include "unicode/utypes.h"
    18 #include "unicode/ustring.h"
    19 #include "unicode/unistr.h"
    20 #include "unicode/chariter.h"
    21 #include "unicode/utext.h"
    22 #include "unicode/utf.h"
    23 #include "unicode/utf8.h"
    24 #include "unicode/utf16.h"
    25 #include "ustr_imp.h"
    26 #include "cmemory.h"
    27 #include "cstring.h"
    28 #include "uassert.h"
    29 #include "putilimp.h"
    31 U_NAMESPACE_USE
    // Build an int32_t mask that has only bit number `bitIndex` set.
    // The shift is carried out on an unsigned operand so that bit 31 can be
    // selected without shifting a 1 into the sign bit of a signed value;
    // the result is then converted back to int32_t for use with providerProperties.
    #define I32_FLAG(bitIndex) ((int32_t)((uint32_t)1<<(bitIndex)))
    36 static UBool
    37 utext_access(UText *ut, int64_t index, UBool forward) {
    38     return ut->pFuncs->access(ut, index, forward);
    39 }
    // Move the iteration position by `delta` code points: forward when delta is
    // positive, backward when it is negative; a delta of zero changes nothing.
    // Returns FALSE as soon as a required chunk cannot be loaded or
    // utext_next32()/utext_previous32() reports U_SENTINEL; returns TRUE otherwise.
    43 U_CAPI UBool U_EXPORT2
    44 utext_moveIndex32(UText *ut, int32_t delta) {
    45     UChar32  c;
    46     if (delta > 0) {
    47         do {
                   // Current chunk is used up: request the chunk that begins at chunkNativeLimit.
    48             if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) {
    49                 return FALSE;
    50             }
    51             c = ut->chunkContents[ut->chunkOffset];
    52             if (U16_IS_SURROGATE(c)) {
                       // Surrogate code unit: let utext_next32() decide how far to advance.
    53                 c = utext_next32(ut);
    54                 if (c == U_SENTINEL) {
    55                     return FALSE;
    56                 }
    57             } else {
                       // Non-surrogate code unit: stepping one UChar is one code point.
    58                 ut->chunkOffset++;
    59             }
    60         } while(--delta>0);
    62     } else if (delta<0) {
    63         do {
                   // At the front of the chunk: request the chunk that ends at chunkNativeStart.
    64             if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) {
    65                 return FALSE;
    66             }
    67             c = ut->chunkContents[ut->chunkOffset-1];
    68             if (U16_IS_SURROGATE(c)) {
                       // Surrogate code unit: let utext_previous32() decide how far to back up.
    69                 c = utext_previous32(ut);
    70                 if (c == U_SENTINEL) {
    71                     return FALSE;
    72                 }
    73             } else {
    74                 ut->chunkOffset--;
    75             }
    76         } while(++delta<0);
    77     }
    79     return TRUE;
    80 }
    83 U_CAPI int64_t U_EXPORT2
    84 utext_nativeLength(UText *ut) {
    85     return ut->pFuncs->nativeLength(ut);
    86 }
    89 U_CAPI UBool U_EXPORT2
    90 utext_isLengthExpensive(const UText *ut) {
    91     UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0;
    92     return r;
    93 }
    96 U_CAPI int64_t U_EXPORT2
    97 utext_getNativeIndex(const UText *ut) {
    98     if(ut->chunkOffset <= ut->nativeIndexingLimit) {
    99         return ut->chunkNativeStart+ut->chunkOffset;
   100     } else {
   101         return ut->pFuncs->mapOffsetToNative(ut);
   102     }
   103 }
   // Set the iteration position to the native index `index`.
   // A different chunk is requested from the provider when the index lies outside
   // the current chunk.  If the resulting position is on the trail half of a
   // surrogate pair, it is moved back onto the lead surrogate.
   106 U_CAPI void U_EXPORT2
   107 utext_setNativeIndex(UText *ut, int64_t index) {
   108     if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
   109         // The desired position is outside of the current chunk.
   110         // Access the new position.  Assume a forward iteration from here,
   111         // which will also be optimum for a single random access.
   112         // Reverse iterations may suffer slightly.
   113         ut->pFuncs->access(ut, index, TRUE);
   114     } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) {
   115         // utf-16 indexing.
   116         ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart);
   117     } else {
               // Inside the chunk but past the directly indexed part: the provider
               // translates the native index into a chunk offset.
   118          ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   119     }
   120     // The convention is that the index must always be on a code point boundary.
   121     // Adjust the index position if it is in the middle of a surrogate pair.
   122     if (ut->chunkOffset<ut->chunkLength) {
   123         UChar c= ut->chunkContents[ut->chunkOffset];
   124         if (U16_IS_TRAIL(c)) {
   125             if (ut->chunkOffset==0) {
                       // The trail is the first unit of the chunk: request the chunk for
                       // backward iteration so the unit before it can be examined.
   126                 ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE);
   127             }
   128             if (ut->chunkOffset>0) {
   129                 UChar lead = ut->chunkContents[ut->chunkOffset-1];
   130                 if (U16_IS_LEAD(lead)) {
                           // Genuine pair: back up onto the lead surrogate.
   131                     ut->chunkOffset--;
   132                 }
   133             }
   134         }
   135     }
   136 }
   // Return the native index of the code point that precedes the current
   // iteration position.  Returns 0 when the position is at the very start
   // (chunk offset 0 in a chunk whose native start is 0).
   140 U_CAPI int64_t U_EXPORT2
   141 utext_getPreviousNativeIndex(UText *ut) {
   142     //
   143     //  Fast-path the common case.
   144     //     Common means current position is not at the beginning of a chunk
   145     //     and the preceding character is not supplementary.
   146     //
   147     int32_t i = ut->chunkOffset - 1;
   148     int64_t result;
   149     if (i >= 0) {
   150         UChar c = ut->chunkContents[i];
   151         if (U16_IS_TRAIL(c) == FALSE) {
   152             if (i <= ut->nativeIndexingLimit) {
   153                 result = ut->chunkNativeStart + i;
   154             } else {
                       // mapOffsetToNative() works from ut->chunkOffset, so step the
                       // offset back for the call and put it back afterwards.
   155                 ut->chunkOffset = i;
   156                 result = ut->pFuncs->mapOffsetToNative(ut);
   157                 ut->chunkOffset++;
   158             }
   159             return result;
   160         }
   161     }
   163     // If at the start of text, simply return 0.
   164     if (ut->chunkOffset==0 && ut->chunkNativeStart==0) {
   165         return 0;
   166     }
   168     // Harder, less common cases.  We are at a chunk boundary, or on a surrogate.
   169     //    Keep it simple, use other functions to handle the edges.
   170     //
           // Step back one code point, read the index there, then step forward again.
   171     utext_previous32(ut);
   172     result = UTEXT_GETNATIVEINDEX(ut);
   173     utext_next32(ut);
   174     return result;
   175 }
   178 //
   179 //  utext_current32.  Get the UChar32 at the current position.
   180 //                    UText iteration position is always on a code point boundary,
   181 //                    never on the trail half of a surrogate pair.
   182 //
   //                    Returns U_SENTINEL when no chunk can be loaded at the current
   //                    position.  A lead surrogate that is not followed by a trail
   //                    surrogate is returned as is.
   183 U_CAPI UChar32 U_EXPORT2
   184 utext_current32(UText *ut) {
   185     UChar32  c;
   186     if (ut->chunkOffset==ut->chunkLength) {
   187         // Current position is just off the end of the chunk.
   188         if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   189             // Off the end of the text.
   190             return U_SENTINEL;
   191         }
   192     }
   194     c = ut->chunkContents[ut->chunkOffset];
   195     if (U16_IS_LEAD(c) == FALSE) {
   196         // Normal, non-supplementary case.
   197         return c;
   198     }
   200     //
   201     //  Possible supplementary char.
   202     //
   203     UChar32   trail = 0;
   204     UChar32   supplementaryC = c;
   205     if ((ut->chunkOffset+1) < ut->chunkLength) {
   206         // The trail surrogate is in the same chunk.
   207         trail = ut->chunkContents[ut->chunkOffset+1];
   208     } else {
   209         //  The trail surrogate is in a different chunk.
   210         //     Because we must maintain the iteration position, we need to switch forward
   211         //     into the new chunk, get the trail surrogate, then revert the chunk back to the
   212         //     original one.
   213         //     An edge case to be careful of:  the entire text may end with an unpaired
   214         //        leading surrogate.  The attempt to access the trail will fail, but
   215         //        the original position before the unpaired lead still needs to be restored.
   216         int64_t  nativePosition = ut->chunkNativeLimit;
   217         int32_t  originalOffset = ut->chunkOffset;
   218         if (ut->pFuncs->access(ut, nativePosition, TRUE)) {
   219             trail = ut->chunkContents[ut->chunkOffset];
   220         }
   221         UBool r = ut->pFuncs->access(ut, nativePosition, FALSE);  // reverse iteration flag loads preceding chunk
   222         U_ASSERT(r==TRUE);
   223         ut->chunkOffset = originalOffset;
               // When the assertion is compiled out, a failed restore is reported as U_SENTINEL.
   224         if(!r) {
   225             return U_SENTINEL;
   226         }
   227     }
   229     if (U16_IS_TRAIL(trail)) {
   230         supplementaryC = U16_GET_SUPPLEMENTARY(c, trail);
   231     }
   232     return supplementaryC;
   234 }
   // Return the code point at `nativeIndex`, setting the iteration position there.
   // Returns U_SENTINEL when, after repositioning, the index is before the chunk
   // start or the chunk offset is not inside the chunk.
   237 U_CAPI UChar32 U_EXPORT2
   238 utext_char32At(UText *ut, int64_t nativeIndex) {
   239     UChar32 c = U_SENTINEL;
   241     // Fast path the common case.
           //   The index is inside the directly indexed part of the current chunk
           //   and the code unit found there is not a surrogate.
   242     if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) {
   243         ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart);
   244         c = ut->chunkContents[ut->chunkOffset];
   245         if (U16_IS_SURROGATE(c) == FALSE) {
   246             return c;
   247         }
   248     }
           // Slow path: reposition through utext_setNativeIndex(), which may switch chunks.
   251     utext_setNativeIndex(ut, nativeIndex);
   252     if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) {
   253         c = ut->chunkContents[ut->chunkOffset];
   254         if (U16_IS_SURROGATE(c)) {
   255             // For surrogates, let current32() deal with the complications
   256             //    of supplementaries that may span chunk boundaries.
   257             c = utext_current32(ut);
   258         }
   259     }
   260     return c;
   261 }
   // Return the code point at the current position and advance past it.
   // Returns U_SENTINEL when the position is at the end of the chunk and the
   // provider cannot supply a following chunk.  Unpaired surrogates are
   // returned as individual surrogate values.
   264 U_CAPI UChar32 U_EXPORT2
   265 utext_next32(UText *ut) {
   266     UChar32       c;
   268     if (ut->chunkOffset >= ut->chunkLength) {
   269         if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   270             return U_SENTINEL;
   271         }
   272     }
   274     c = ut->chunkContents[ut->chunkOffset++];
   275     if (U16_IS_LEAD(c) == FALSE) {
   276         // Normal case, not supplementary.
   277         //   (A trail surrogate seen here is just returned as is, as a surrogate value.
   278         //    It cannot be part of a pair.)
   279         return c;
   280     }
           // c is a lead surrogate; the unit that follows may live in the next chunk.
   282     if (ut->chunkOffset >= ut->chunkLength) {
   283         if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) {
   284             // c is an unpaired lead surrogate at the end of the text.
   285             // return it as it is.
   286             return c;
   287         }
   288     }
   289     UChar32 trail = ut->chunkContents[ut->chunkOffset];
   290     if (U16_IS_TRAIL(trail) == FALSE) {
   291         // c was an unpaired lead surrogate, not at the end of the text.
   292         // return it as it is (unpaired).  Iteration position is on the
   293         // following character, possibly in the next chunk, where the
   294         //  trail surrogate would have been if it had existed.
   295         return c;
   296     }
   298     UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail);
   299     ut->chunkOffset++;   // move iteration position over the trail surrogate.
   300     return supplementary;
   301     }
   // Move the iteration position back by one code point and return that code point.
   // Returns U_SENTINEL when the position is at the front of the chunk and the
   // provider cannot supply a preceding chunk.  Unpaired surrogates are
   // returned as individual surrogate values.
   304 U_CAPI UChar32 U_EXPORT2
   305 utext_previous32(UText *ut) {
   306     UChar32       c;
   308     if (ut->chunkOffset <= 0) {
   309         if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
   310             return U_SENTINEL;
   311         }
   312     }
   313     ut->chunkOffset--;
   314     c = ut->chunkContents[ut->chunkOffset];
   315     if (U16_IS_TRAIL(c) == FALSE) {
   316         // Normal case, not supplementary.
   317         //   (A lead surrogate seen here is just returned as is, as a surrogate value.
   318         //    It cannot be part of a pair.)
   319         return c;
   320     }
           // c is a trail surrogate; the unit before it may live in the preceding chunk.
   322     if (ut->chunkOffset <= 0) {
   323         if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) {
   324             // c is an unpaired trail surrogate at the start of the text.
   325             // return it as it is.
   326             return c;
   327         }
   328     }
   330     UChar32 lead = ut->chunkContents[ut->chunkOffset-1];
   331     if (U16_IS_LEAD(lead) == FALSE) {
   332         // c was an unpaired trail surrogate, not at the start of the text.
   333         // return it as it is (unpaired).  Iteration position is at c
   334         return c;
   335     }
   337     UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c);
   338     ut->chunkOffset--;   // move iteration position over the lead surrogate.
   339     return supplementary;
   340 }
   // Position the iterator at native index `index`, return the code point found
   // there and advance past it.  Returns U_SENTINEL when the index is outside
   // the current chunk and the provider has no chunk for it.
   344 U_CAPI UChar32 U_EXPORT2
   345 utext_next32From(UText *ut, int64_t index) {
   346     UChar32       c      = U_SENTINEL;
   348     if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) {
   349         // Desired position is outside of the current chunk.
   350         if(!ut->pFuncs->access(ut, index, TRUE)) {
   351             // no chunk available here
   352             return U_SENTINEL;
   353         }
   354     } else if (index - ut->chunkNativeStart  <= (int64_t)ut->nativeIndexingLimit) {
   355         // Desired position is in chunk, with direct 1:1 native to UTF16 indexing
   356         ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
   357     } else {
   358         // Desired position is in chunk, with non-UTF16 indexing.
   359         ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   360     }
           // Optimistically consume one code unit; redone below if it is a surrogate.
   362     c = ut->chunkContents[ut->chunkOffset++];
   363     if (U16_IS_SURROGATE(c)) {
   364         // Surrogates.  Many edge cases.  Use other functions that already
   365         //              deal with the problems.
   366         utext_setNativeIndex(ut, index);
   367         c = utext_next32(ut);
   368     }
   369     return c;
   370 }
   373 U_CAPI UChar32 U_EXPORT2
   374 utext_previous32From(UText *ut, int64_t index) {
   375     //
   376     //  Return the character preceding the specified index.
   377     //  Leave the iteration position at the start of the character that was returned.
   378     //  Returns U_SENTINEL when the provider has no chunk for the requested position.
   379     UChar32     cPrev;    // The character preceding the requested index, which is what we will return.
   381     // Address the chunk containing the position preceding the incoming index
   382     // A tricky edge case:
   383     //   We try to test the requested native index against the chunkNativeStart to determine
   384     //    whether the character preceding the one at the index is in the current chunk.
   385     //    BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the
   386     //    requested index is on something other than the first position of the first char.
   387     //
   388     if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) {
   389         // Requested native index is outside of the current chunk.
   390         if(!ut->pFuncs->access(ut, index, FALSE)) {
   391             // no chunk available here
   392             return U_SENTINEL;
   393         }
   394     } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) {
   395         // Direct UTF-16 indexing.
   396         ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
   397     } else {
               // Provider-mapped indexing.  If the index maps to offset 0 (the edge case
               // described above), the preceding character is not in this chunk, so the
               // chunk before it has to be requested.
   398         ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index);
   399         if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) {
   400             // no chunk available here
   401             return U_SENTINEL;
   402         }
   403     }
   405     //
   406     // Simple case with no surrogates.
   407     //
   408     ut->chunkOffset--;
   409     cPrev = ut->chunkContents[ut->chunkOffset];
   411     if (U16_IS_SURROGATE(cPrev)) {
   412         // Possible supplementary.  Many edge cases.
   413         // Let other functions do the heavy lifting.
   414         utext_setNativeIndex(ut, index);
   415         cPrev = utext_previous32(ut);
   416     }
   417     return cPrev;
   418 }
   421 U_CAPI int32_t U_EXPORT2
   422 utext_extract(UText *ut,
   423              int64_t start, int64_t limit,
   424              UChar *dest, int32_t destCapacity,
   425              UErrorCode *status) {
   426                  return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status);
   427              }
   431 U_CAPI UBool U_EXPORT2
   432 utext_equals(const UText *a, const UText *b) {
   433     if (a==NULL || b==NULL ||
   434         a->magic != UTEXT_MAGIC ||
   435         b->magic != UTEXT_MAGIC) {
   436             // Null or invalid arguments don't compare equal to anything.
   437             return FALSE;
   438     }
   440     if (a->pFuncs != b->pFuncs) {
   441         // Different types of text providers.
   442         return FALSE;
   443     }
   445     if (a->context != b->context) {
   446         // Different sources (different strings)
   447         return FALSE;
   448     }
   449     if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) {
   450         // Different current position in the string.
   451         return FALSE;
   452     }
   454     return TRUE;
   455 }
   457 U_CAPI UBool U_EXPORT2
   458 utext_isWritable(const UText *ut)
   459 {
   460     UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0;
   461     return b;
   462 }
   465 U_CAPI void U_EXPORT2
   466 utext_freeze(UText *ut) {
   467     // Zero out the WRITABLE flag.
   468     ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE));
   469 }
   472 U_CAPI UBool U_EXPORT2
   473 utext_hasMetaData(const UText *ut)
   474 {
   475     UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0;
   476     return b;
   477 }
   481 U_CAPI int32_t U_EXPORT2
   482 utext_replace(UText *ut,
   483              int64_t nativeStart, int64_t nativeLimit,
   484              const UChar *replacementText, int32_t replacementLength,
   485              UErrorCode *status)
   486 {
   487     if (U_FAILURE(*status)) {
   488         return 0;
   489     }
   490     if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
   491         *status = U_NO_WRITE_PERMISSION;
   492         return 0;
   493     }
   494     int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status);
   495     return i;
   496 }
   498 U_CAPI void U_EXPORT2
   499 utext_copy(UText *ut,
   500           int64_t nativeStart, int64_t nativeLimit,
   501           int64_t destIndex,
   502           UBool move,
   503           UErrorCode *status)
   504 {
   505     if (U_FAILURE(*status)) {
   506         return;
   507     }
   508     if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) {
   509         *status = U_NO_WRITE_PERMISSION;
   510         return;
   511     }
   512     ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status);
   513 }
   517 U_CAPI UText * U_EXPORT2
   518 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) {
   519     UText *result;
   520     result = src->pFuncs->clone(dest, src, deep, status);
   521     if (readOnly) {
   522         utext_freeze(result);
   523     }
   524     return result;
   525 }
   529 //------------------------------------------------------------------------------
   530 //
   531 //   UText common functions implementation
   532 //
   533 //------------------------------------------------------------------------------
   535 //
   536 //  UText.flags bit definitions
   537 //     Each enumerator is a single-bit mask that utext_setup() and utext_close()
   //     set in, test against, or clear from UText.flags.
   538 enum {
   539     UTEXT_HEAP_ALLOCATED  = 1,      //  1 if ICU has allocated this UText struct on the heap.
   540                                     //  0 if caller provided storage for the UText.
   542     UTEXT_EXTRA_HEAP_ALLOCATED = 2, //  1 if ICU has allocated extra storage as a separate
   543                                     //     heap block.
   544                                     //  0 if there is no separate allocation.  Either no extra
   545                                     //     storage was requested, or it is appended to the end
   546                                     //     of the main UText storage.
   548     UTEXT_OPEN = 4                  //  1 if this UText is currently open
   549                                     //  0 if this UText is not open.
   550 };
   553 //
   554 //  Extended form of a UText.  The purpose is to aid in computing the total size required
   555 //    when a provider asks for a UText to be allocated with extra storage.
   //    utext_setup() sizes its heap allocation from sizeof(ExtendedUText) and points
   //    UText.pExtra at the `extension` member.
   //
   557 struct ExtendedUText {
   558     UText          ut;          // The UText proper; must come first.
   559     UAlignedMemory extension;   // Marks where the appended extra storage begins.
   560 };
   // Freshly initialized UText value; utext_setup() copies it into every UText
   // struct that it allocates on the heap.
   562 static const UText emptyText = UTEXT_INITIALIZER;
   564 U_CAPI UText * U_EXPORT2
   565 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) {
   566     if (U_FAILURE(*status)) {
   567         return ut;
   568     }
   570     if (ut == NULL) {
   571         // We need to heap-allocate storage for the new UText
   572         int32_t spaceRequired = sizeof(UText);
   573         if (extraSpace > 0) {
   574             spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory);
   575         }
   576         ut = (UText *)uprv_malloc(spaceRequired);
   577         if (ut == NULL) {
   578             *status = U_MEMORY_ALLOCATION_ERROR;
   579             return NULL;
   580         } else {
   581             *ut = emptyText;
   582             ut->flags |= UTEXT_HEAP_ALLOCATED;
   583             if (spaceRequired>0) {
   584                 ut->extraSize = extraSpace;
   585                 ut->pExtra    = &((ExtendedUText *)ut)->extension;
   586             }
   587         }
   588     } else {
   589         // We have been supplied with an already existing UText.
   590         // Verify that it really appears to be a UText.
   591         if (ut->magic != UTEXT_MAGIC) {
   592             *status = U_ILLEGAL_ARGUMENT_ERROR;
   593             return ut;
   594         }
   595         // If the ut is already open and there's a provider supplied close
   596         //   function, call it.
   597         if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL)  {
   598             ut->pFuncs->close(ut);
   599         }
   600         ut->flags &= ~UTEXT_OPEN;
   602         // If extra space was requested by our caller, check whether
   603         //   sufficient already exists, and allocate new if needed.
   604         if (extraSpace > ut->extraSize) {
   605             // Need more space.  If there is existing separately allocated space,
   606             //   delete it first, then allocate new space.
   607             if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
   608                 uprv_free(ut->pExtra);
   609                 ut->extraSize = 0;
   610             }
   611             ut->pExtra = uprv_malloc(extraSpace);
   612             if (ut->pExtra == NULL) {
   613                 *status = U_MEMORY_ALLOCATION_ERROR;
   614             } else {
   615                 ut->extraSize = extraSpace;
   616                 ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED;
   617             }
   618         }
   619     }
   620     if (U_SUCCESS(*status)) {
   621         ut->flags |= UTEXT_OPEN;
   623         // Initialize all remaining fields of the UText.
   624         //
   625         ut->context             = NULL;
   626         ut->chunkContents       = NULL;
   627         ut->p                   = NULL;
   628         ut->q                   = NULL;
   629         ut->r                   = NULL;
   630         ut->a                   = 0;
   631         ut->b                   = 0;
   632         ut->c                   = 0;
   633         ut->chunkOffset         = 0;
   634         ut->chunkLength         = 0;
   635         ut->chunkNativeStart    = 0;
   636         ut->chunkNativeLimit    = 0;
   637         ut->nativeIndexingLimit = 0;
   638         ut->providerProperties  = 0;
   639         ut->privA               = 0;
   640         ut->privB               = 0;
   641         ut->privC               = 0;
   642         ut->privP               = NULL;
   643         if (ut->pExtra!=NULL && ut->extraSize>0)
   644             uprv_memset(ut->pExtra, 0, ut->extraSize);
   646     }
   647     return ut;
   648 }
   651 U_CAPI UText * U_EXPORT2
   652 utext_close(UText *ut) {
   653     if (ut==NULL ||
   654         ut->magic != UTEXT_MAGIC ||
   655         (ut->flags & UTEXT_OPEN) == 0)
   656     {
   657         // The supplied ut is not an open UText.
   658         // Do nothing.
   659         return ut;
   660     }
   662     // If the provider gave us a close function, call it now.
   663     // This will clean up anything allocated specifically by the provider.
   664     if (ut->pFuncs->close != NULL) {
   665         ut->pFuncs->close(ut);
   666     }
   667     ut->flags &= ~UTEXT_OPEN;
   669     // If we (the framework) allocated the UText or subsidiary storage,
   670     //   delete it.
   671     if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) {
   672         uprv_free(ut->pExtra);
   673         ut->pExtra = NULL;
   674         ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED;
   675         ut->extraSize = 0;
   676     }
   678     // Zero out function table of the closed UText.  This is a defensive move,
   679     //   inteded to cause applications that inadvertantly use a closed
   680     //   utext to crash with null pointer errors.
   681     ut->pFuncs        = NULL;
   683     if (ut->flags & UTEXT_HEAP_ALLOCATED) {
   684         // This UText was allocated by UText setup.  We need to free it.
   685         // Clear magic, so we can detect if the user messes up and immediately
   686         //  tries to reopen another UText using the deleted storage.
   687         ut->magic = 0;
   688         uprv_free(ut);
   689         ut = NULL;
   690     }
   691     return ut;
   692 }
   697 //
   698 // invalidateChunk   Reset a chunk to have no contents, so that the next call
   699 //                   to access will cause new data to load.
   700 //                   This is needed when copy/move/replace operate directly on the
   701 //                   backing text, potentially putting it out of sync with the
   702 //                   contents in the chunk.
   703 //
   704 static void
   705 invalidateChunk(UText *ut) {
   706     ut->chunkLength = 0;
   707     ut->chunkNativeLimit = 0;
   708     ut->chunkNativeStart = 0;
   709     ut->chunkOffset = 0;
   710     ut->nativeIndexingLimit = 0;
   711 }
   //
   // pinIndex        Range-pin a native index parameter.
   //                 Negative values become 0; other values above `limit`
   //                 become `limit`.  The 64 bit value is updated in place.
   //                 The result truncated to 32 bits is returned as a
   //                 convenience for providers that don't need 64 bits.
   static int32_t
   pinIndex(int64_t &index, int64_t limit) {
       const int64_t pinned = (index < 0) ? 0
                            : (index > limit) ? limit
                            : index;
       index = pinned;
       return (int32_t)pinned;
   }
   729 U_CDECL_BEGIN
   731 //
   732 // Pointer relocation function,
   733 //   a utility used by shallow clone.
   734 //   Adjust a pointer that refers to something within one UText (the source)
   735 //   to refer to the same relative offset within a another UText (the target)
   736 //
   737 static void adjustPointer(UText *dest, const void **destPtr, const UText *src) {
   738     // convert all pointers to (char *) so that byte address arithmetic will work.
   739     char  *dptr = (char *)*destPtr;
   740     char  *dUText = (char *)dest;
   741     char  *sUText = (char *)src;
   743     if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) {
   744         // target ptr was to something within the src UText's pExtra storage.
   745         //   relocate it into the target UText's pExtra region.
   746         *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra);
   747     } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) {
   748         // target ptr was pointing to somewhere within the source UText itself.
   749         //   Move it to the same offset within the target UText.
   750         *destPtr = dUText + (dptr-sUText);
   751     }
   752 }
   755 //
   756 //  Clone.  This is a generic copy-the-utext-by-value clone function that can be
   757 //          used as-is with some utext types, and as a helper by other clones.
   //          Returns NULL if *status indicates failure on entry; otherwise returns
   //          the UText produced by utext_setup(), which is left uncopied if that
   //          setup fails.
   758 //
   759 static UText * U_CALLCONV
   760 shallowTextClone(UText * dest, const UText * src, UErrorCode * status) {
   761     if (U_FAILURE(*status)) {
   762         return NULL;
   763     }
   764     int32_t  srcExtraSize = src->extraSize;
   766     //
   767     // Use the generic text_setup to allocate storage if required.
   768     //
   769     dest = utext_setup(dest, srcExtraSize, status);
   770     if (U_FAILURE(*status)) {
   771         return dest;
   772     }
   774     //
   775     //  flags (how the UText was allocated) and the pointer to the
   776     //   extra storage must retain the values in the cloned utext that
   777     //   were set up by utext_setup.  Save them separately before
   778     //   copying the whole struct.
   779     //
   780     void *destExtra = dest->pExtra;
   781     int32_t flags   = dest->flags;
   784     //
   785     //  Copy the whole UText struct by value.
   786     //  Any "Extra" storage is copied also.
   787     //
           //  Copy no more than the smaller of the two struct sizes.
   788     int sizeToCopy = src->sizeOfStruct;
   789     if (sizeToCopy > dest->sizeOfStruct) {
   790         sizeToCopy = dest->sizeOfStruct;
   791     }
   792     uprv_memcpy(dest, src, sizeToCopy);
           //  Put back the two fields that the struct copy just overwrote.
   793     dest->pExtra = destExtra;
   794     dest->flags  = flags;
   795     if (srcExtraSize > 0) {
   796         uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize);
   797     }
   799     //
   800     // Relocate any pointers in the target that refer to the UText itself
   801     //   to point to the cloned copy rather than the original source.
   802     //
   803     adjustPointer(dest, &dest->context, src);
   804     adjustPointer(dest, &dest->p, src);
   805     adjustPointer(dest, &dest->q, src);
   806     adjustPointer(dest, &dest->r, src);
   807     adjustPointer(dest, (const void **)&dest->chunkContents, src);
   809     return dest;
   810 }
   813 U_CDECL_END
   817 //------------------------------------------------------------------------------
   818 //
   819 //     UText implementation for UTF-8 char * strings (read-only)
   820 //     Limitation:  string length must be <= 0x7fffffff in length.
   821 //                  (length must fit in an int32_t variable)
   822 //
   823 //         Use of UText data members:
   824 //              context    pointer to UTF-8 string
   825 //              utext.b    is the input string length (bytes).
   826 //              utext.c    Length scanned so far in string
   827 //                           (for optimizing finding length of zero terminated strings.)
   828 //              utext.p    pointer to the current buffer
   829 //              utext.q    pointer to the other buffer.
   830 //
   831 //------------------------------------------------------------------------------
   833 // Chunk size.
   834 //     Must be less than 85, because of byte mapping from UChar indexes to native indexes.
   835 //     Worst case is three native bytes to one UChar.  (Supplementaries are 4 native bytes
   836 //     to two UChars.)
   837 //
   838 enum { UTF8_TEXT_CHUNK_SIZE=32 };
   840 //
   841 // UTF8Buf  Two of these structs will be set up in the UText's extra allocated space.
   842 //          Each contains the UChar chunk buffer, the to and from native maps, and
   843 //          header info.
   844 //
   845 //     Because backwards iteration fills the buffers starting at the end and
   846 //     working towards the front, the filled part of the buffers may not begin
   847 //     at the start of the available storage for the buffers.
   848 //
   849 //     Buffer size is bigger than the specified UTF8_TEXT_CHUNK_SIZE to allow for
   850 //     the last character added being a supplementary, and thus requiring a surrogate
   851 //     pair.  Doing this is simpler than checking for the edge case.
       //     NOTE(review): the comments here speak of "one" or "two" extra slots while
       //     the arrays are declared with +4; the reverse fill starts writing at index
       //     UTF8_TEXT_CHUNK_SIZE+2, so at least +3 is needed.
   852 //
   854 struct UTF8Buf {
   855     int32_t   bufNativeStart;                        // Native index of first char in UChar buf
   856     int32_t   bufNativeLimit;                        // Native index following last char in buf.
   857     int32_t   bufStartIdx;                           // First filled position in buf.
   858     int32_t   bufLimitIdx;                           // Limit of filled range in buf.
   859     int32_t   bufNILimit;                            // Limit of native indexing part of buf
   860     int32_t   toUCharsMapStart;                      // Native index corresponding to
   861                                                      //   mapToUChars[0].
   862                                                      //   Set to bufNativeStart when filling forwards.
   863                                                      //   Set to computed value when filling backwards.
   865     UChar     buf[UTF8_TEXT_CHUNK_SIZE+4];           // The UChar buffer.  Requires one extra position beyond the
   866                                                      //   chunk size, to allow for surrogate at the end.
   867                                                      //   Length must be identical to mapToNative array, below,
   868                                                      //   because of the way indexing works when the array is
   869                                                      //   filled backwards during a reverse iteration.  Thus,
   870                                                      //   the additional extra size.
   871     uint8_t   mapToNative[UTF8_TEXT_CHUNK_SIZE+4];   // map UChar index in buf to
   872                                                      //  native offset from toUCharsMapStart
   873                                                      //  (equal to bufNativeStart after a forward fill).
                                                            //  Requires two extra slots,
   874                                                      //    one for a supplementary starting in the last normal position,
   875                                                      //    and one for an entry for the buffer limit position.
   876     uint8_t   mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from toUCharsMapStart to
   877                                                      //   corresponding index in buf.
   878     int32_t   align;                                 // NOTE(review): not referenced in the code shown here;
                                                            //   presumably alignment padding - confirm.
   879 };
   881 U_CDECL_BEGIN
   883 //
   884 //   utf8TextLength
   885 //
   886 //        Get the length of the string.  If we don't already know it,
   887 //              we'll need to scan for the trailing  nul.
   888 //
   889 static int64_t U_CALLCONV
   890 utf8TextLength(UText *ut) {
   891     if (ut->b < 0) {
   892         // Zero terminated string, and we haven't scanned to the end yet.
   893         // Scan it now.
   894         const char *r = (const char *)ut->context + ut->c;
   895         while (*r != 0) {
   896             r++;
   897         }
   898         if ((r - (const char *)ut->context) < 0x7fffffff) {
   899             ut->b = (int32_t)(r - (const char *)ut->context);
   900         } else {
   901             // Actual string was bigger (more than 2 gig) than we
   902             //   can handle.  Clip it to 2 GB.
   903             ut->b = 0x7fffffff;
   904         }
   905         ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
   906     }
   907     return ut->b;
   908 }
   915 static UBool U_CALLCONV
   916 utf8TextAccess(UText *ut, int64_t index, UBool forward) {
           //
           //  Access function of the UTF-8 provider.
           //    Makes a chunk (UTF-16 buffer) covering native index `index` the
           //    current chunk and sets ut->chunkOffset within it.
           //
           //    Two UTF8Buf buffers are kept: ut->p is the current one, ut->q the
           //    alternate.  A request is served by swapping in the alternate buffer
           //    (swapBuffers), from the current buffer, or by refilling the alternate
           //    buffer (fillForward / fillReverse).
           //
           //    Returns FALSE for a forward request at the end of the string, for a
           //    backward request at index 0, and for a backward request that maps to
           //    chunk offset 0 of the current buffer.  The other paths return TRUE.
   917     //
   918     //  Apologies to those who are allergic to goto statements.
   919     //    Consider each goto to a labelled block to be the equivalent of
   920     //         call the named block as if it were a function();
   921     //         return;
   922     //
   923     const uint8_t *s8=(const uint8_t *)ut->context;
   924     UTF8Buf *u8b = NULL;
   925     int32_t  length = ut->b;         // Length of original utf-8; negative while still unknown
   926     int32_t  ix= (int32_t)index;     // Requested index, trimmed to 32 bits.
   927     int32_t  mapIndex = 0;
   928     if (index<0) {
   929         ix=0;
   930     } else if (index > 0x7fffffff) {
   931         // Strings with 64 bit lengths not supported by this UTF-8 provider.
   932         ix = 0x7fffffff;
   933     }
   935     // Pin requested index to the string length.
   936     if (ix>length) {
   937         if (length>=0) {
   938             ix=length;
   939         } else if (ix>=ut->c) {
   940             // Zero terminated string, and requested index is beyond
   941             //   the region that has already been scanned.
   942             //   Scan up to either the end of the string or to the
   943             //   requested position, whichever comes first.
   944             while (ut->c<ix && s8[ut->c]!=0) {
   945                 ut->c++;
   946             }
   947             //  TODO:  support for null terminated string length > 32 bits.
   948             if (s8[ut->c] == 0) {
   949                 // We just found the actual length of the string.
   950                 //  Trim the requested index back to that.
   951                 ix     = ut->c;
   952                 ut->b  = ut->c;
   953                 length = ut->c;
   954                 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
   955             }
   956         }
   957     }
   959     //
   960     // Dispatch to the appropriate action for a forward iteration request.
   961     //
   962     if (forward) {
   963         if (ix==ut->chunkNativeLimit) {
   964             // Check for normal sequential iteration cases first.
   965             if (ix==length) {
   966                 // Just reached end of string
   967                 // Don't swap buffers, but do set the
   968                 //   current buffer position.
   969                 ut->chunkOffset = ut->chunkLength;
   970                 return FALSE;
   971             } else {
   972                 // End of current buffer.
   973                 //   check whether other buffer already has what we need.
   974                 UTF8Buf *altB = (UTF8Buf *)ut->q;
   975                 if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) {
   976                     goto swapBuffers;
   977                 }
   978             }
   979         }
   981         // A random access.  Desired index could be in either or neither buf.
   982         // For optimizing the order of testing, first check for the index
   983         //    being in the other buffer.  This will be the case for uses that
   984         //    move back and forth over a fairly limited range
   985         {
   986             u8b = (UTF8Buf *)ut->q;   // the alternate buffer
   987             if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) {
   988                 // Requested index is in the other buffer.
   989                 goto swapBuffers;
   990             }
   991             if (ix == length) {
   992                 // Requested index is end-of-string.
   993                 //   (this is the case of randomly seeking to the end.
   994                 //    The case of iterating off the end is handled earlier.)
   995                 if (ix == ut->chunkNativeLimit) {
   996                     // Current buffer extends up to the end of the string.
   997                     //   Leave it as the current buffer.
   998                     ut->chunkOffset = ut->chunkLength;
   999                     return FALSE;
  1001                 if (ix == u8b->bufNativeLimit) {
  1002                     // Alternate buffer extends to the end of string.
  1003                     //   Swap it in as the current buffer.
  1004                     goto swapBuffersAndFail;
  1007                 // Neither existing buffer extends to the end of the string.
  1008                 goto makeStubBuffer;
  1011             if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) {
  1012                 // Requested index is in neither buffer.
  1013                 goto fillForward;
  1016             // Requested index is in this buffer.
  1017             u8b = (UTF8Buf *)ut->p;   // the current buffer
  1018             mapIndex = ix - u8b->toUCharsMapStart;
  1019             ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1020             return TRUE;
  1026     //
  1027     // Dispatch to the appropriate action for a
  1028     //   Backwards Direction iteration request.
  1029     //
  1030     if (ix==ut->chunkNativeStart) {
  1031         // Check for normal sequential iteration cases first.
  1032         if (ix==0) {
  1033             // Just reached the start of string
  1034             // Don't swap buffers, but do set the
  1035             //   current buffer position.
  1036             ut->chunkOffset = 0;
  1037             return FALSE;
  1038         } else {
  1039             // Start of current buffer.
  1040             //   check whether other buffer already has what we need.
  1041             UTF8Buf *altB = (UTF8Buf *)ut->q;
  1042             if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) {
  1043                 goto swapBuffers;
  1048     // A random access.  Desired index could be in either or neither buf.
  1049     // For optimizing the order of testing,
  1050     //    Most likely case:  in the other buffer.
  1051     //    Second most likely: in neither buffer.
  1052     //    Unlikely, but must work:  in the current buffer.
  1053     u8b = (UTF8Buf *)ut->q;   // the alternate buffer
  1054     if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) {
  1055         // Requested index is in the other buffer.
  1056         goto swapBuffers;
  1058     // Requested index is start-of-string.
  1059     //   (this is the case of randomly seeking to the start.
  1060     //    The case of iterating off the start is handled earlier.)
  1061     if (ix==0) {
  1062         if (u8b->bufNativeStart==0) {
  1063             // Alternate buffer contains the data for the start string.
  1064             // Make it be the current buffer.
  1065             goto swapBuffersAndFail;
  1066         } else {
  1067             // Request for data before the start of string,
  1068             //   neither buffer is usable.
  1069             //   set up a zero-length buffer.
  1070             goto makeStubBuffer;
  1074     if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) {
  1075         // Requested index is in neither buffer.
  1076         goto fillReverse;
  1079     // Requested index is in this buffer.
  1080     //   Set the utf16 buffer index.
  1081     u8b = (UTF8Buf *)ut->p;
  1082     mapIndex = ix - u8b->toUCharsMapStart;
  1083     ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1084     if (ut->chunkOffset==0) {
  1085         // This occurs when the first character in the text is
  1086         //   a multi-byte UTF-8 char, and the requested index is to
  1087         //   one of the trailing bytes.  Because there is no preceding
  1088         //   character, this access fails.  We can't pick up on the
  1089         //   situation sooner because the requested index is not zero.
  1090         return FALSE;
  1091     } else {
  1092         return TRUE;
  1097 swapBuffers:
  1098     //  The alternate buffer (ut->q) has the string data that was requested.
  1099     //  Swap the primary and alternate buffers, and set the
  1100     //   chunk index into the new primary buffer.
  1102         u8b   = (UTF8Buf *)ut->q;
  1103         ut->q = ut->p;
  1104         ut->p = u8b;
  1105         ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
  1106         ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1107         ut->chunkNativeStart    = u8b->bufNativeStart;
  1108         ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1109         ut->nativeIndexingLimit = u8b->bufNILimit;
  1111         // Index into the (now current) chunk
  1112         // Use the map to set the chunk index.  It's more trouble than it's worth
  1113         //    to check whether native indexing can be used.
  1114         U_ASSERT(ix>=u8b->bufNativeStart);
  1115         U_ASSERT(ix<=u8b->bufNativeLimit);
  1116         mapIndex = ix - u8b->toUCharsMapStart;
  1117         U_ASSERT(mapIndex>=0);
  1118         U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars));
  1119         ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1121         return TRUE;
  1125  swapBuffersAndFail:
  1126     // We got a request for either the start or end of the string,
  1127     //  with iteration continuing in the out-of-bounds direction.
  1128     // The alternate buffer already contains the data up to the
  1129     //  start/end.
  1130     // Swap the buffers, then return failure, indicating that we couldn't
  1131     //  make things correct for continuing the iteration in the requested
  1132     //  direction.  The position & buffer are correct should the
  1133     //  user decide to iterate in the opposite direction.
  1134     u8b   = (UTF8Buf *)ut->q;
  1135     ut->q = ut->p;
  1136     ut->p = u8b;
  1137     ut->chunkContents       = &u8b->buf[u8b->bufStartIdx];
  1138     ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1139     ut->chunkNativeStart    = u8b->bufNativeStart;
  1140     ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1141     ut->nativeIndexingLimit = u8b->bufNILimit;
  1143     // Index into the (now current) chunk
  1144     //  For this function  (swapBuffersAndFail), the requested index
  1145     //    will always be at either the start or end of the chunk.
  1146     if (ix==u8b->bufNativeLimit) {
  1147         ut->chunkOffset = ut->chunkLength;
  1148     } else  {
  1149         ut->chunkOffset = 0;
  1150         U_ASSERT(ix == u8b->bufNativeStart);
  1152     return FALSE;
  1154 makeStubBuffer:
  1155     //   The user has done a seek/access past the start or end
  1156     //   of the string.  Rather than loading data that is likely
  1157     //   to never be used, just set up a zero-length buffer at
  1158     //   the position.
           //   The stub is built in the alternate buffer (ut->q), which
           //   swapBuffersAndFail then makes the current one.
  1159     u8b = (UTF8Buf *)ut->q;
  1160     u8b->bufNativeStart   = ix;
  1161     u8b->bufNativeLimit   = ix;
  1162     u8b->bufStartIdx      = 0;
  1163     u8b->bufLimitIdx      = 0;
  1164     u8b->bufNILimit       = 0;
  1165     u8b->toUCharsMapStart = ix;
  1166     u8b->mapToNative[0]   = 0;
  1167     u8b->mapToUChars[0]   = 0;
  1168     goto swapBuffersAndFail;
  1172 fillForward:
  1174         // Move the incoming index to a code point boundary.
  1175         U8_SET_CP_START(s8, 0, ix);
  1177         // Swap the UText buffers.
  1178         //  We want to fill what was previously the alternate buffer,
  1179         //  and make what was the current buffer be the new alternate.
  1180         UTF8Buf *u8b = (UTF8Buf *)ut->q;
  1181         ut->q = ut->p;
  1182         ut->p = u8b;
  1184         int32_t strLen = ut->b;
  1185         UBool   nulTerminated = FALSE;
  1186         if (strLen < 0) {
                   // Length not known yet: read up to the int32 maximum and
                   //   rely on meeting the terminating NUL to stop.
  1187             strLen = 0x7fffffff;
  1188             nulTerminated = TRUE;
  1191         UChar   *buf = u8b->buf;
  1192         uint8_t *mapToNative  = u8b->mapToNative;
  1193         uint8_t *mapToUChars  = u8b->mapToUChars;
  1194         int32_t  destIx       = 0;
  1195         int32_t  srcIx        = ix;
  1196         UBool    seenNonAscii = FALSE;
  1197         UChar32  c = 0;
  1199         // Fill the chunk buffer and mapping arrays.
  1200         while (destIx<UTF8_TEXT_CHUNK_SIZE) {
  1201             c = s8[srcIx];
  1202             if (c>0 && c<0x80) {
  1203                 // Special case ASCII range for speed.
  1204                 //   zero is excluded to simplify bounds checking.
  1205                 buf[destIx] = (UChar)c;
  1206                 mapToNative[destIx]    = (uint8_t)(srcIx - ix);
  1207                 mapToUChars[srcIx-ix]  = (uint8_t)destIx;
  1208                 srcIx++;
  1209                 destIx++;
  1210             } else {
  1211                 // General case, handle everything.
  1212                 if (seenNonAscii == FALSE) {
                           // First non-ASCII char: native indexing stops here.
  1213                     seenNonAscii = TRUE;
  1214                     u8b->bufNILimit = destIx;
  1217                 int32_t  cIx      = srcIx;
  1218                 int32_t  dIx      = destIx;
  1219                 int32_t  dIxSaved = destIx;
  1220                 U8_NEXT_OR_FFFD(s8, srcIx, strLen, c);
  1221                 if (c==0 && nulTerminated) {
                           // Hit the terminating NUL; back up so srcIx is the string length.
  1222                     srcIx--;
  1223                     break;
  1226                 U16_APPEND_UNSAFE(buf, destIx, c);
                       // Every UChar written for this char maps to its first native byte.
  1227                 do {
  1228                     mapToNative[dIx++] = (uint8_t)(cIx - ix);
  1229                 } while (dIx < destIx);
                       // Every native byte of this char maps to its first UChar.
  1231                 do {
  1232                     mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved;
  1233                 } while (cIx < srcIx);
  1235             if (srcIx>=strLen) {
  1236                 break;
  1241         //  store Native <--> Chunk Map entries for the end of the buffer.
  1242         //    There is no actual character here, but the index position is valid.
  1243         mapToNative[destIx]     = (uint8_t)(srcIx - ix);
  1244         mapToUChars[srcIx - ix] = (uint8_t)destIx;
  1246         //  fill in Buffer descriptor
  1247         u8b->bufNativeStart     = ix;
  1248         u8b->bufNativeLimit     = srcIx;
  1249         u8b->bufStartIdx        = 0;
  1250         u8b->bufLimitIdx        = destIx;
  1251         if (seenNonAscii == FALSE) {
  1252             u8b->bufNILimit     = destIx;
  1254         u8b->toUCharsMapStart   = u8b->bufNativeStart;
  1256         // Set UText chunk to refer to this buffer.
  1257         ut->chunkContents       = buf;
  1258         ut->chunkOffset         = 0;
  1259         ut->chunkLength         = u8b->bufLimitIdx;
  1260         ut->chunkNativeStart    = u8b->bufNativeStart;
  1261         ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1262         ut->nativeIndexingLimit = u8b->bufNILimit;
  1264         // For zero terminated strings, keep track of the maximum point
  1265         //   scanned so far.
  1266         if (nulTerminated && srcIx>ut->c) {
  1267             ut->c = srcIx;
  1268             if (c==0) {
  1269                 // We scanned to the end.
  1270                 //   Remember the actual length.
  1271                 ut->b = srcIx;
  1272                 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1275         return TRUE;
  1279 fillReverse:
  1281         // Move the incoming index to a code point boundary.
  1282         // Can only do this if the incoming index is somewhere in the interior of the string.
  1283         //   If index is at the end, there is no character there to look at.
  1284         if (ix != ut->b) {
  1285             U8_SET_CP_START(s8, 0, ix);
  1288         // Swap the UText buffers.
  1289         //  We want to fill what was previously the alternate buffer,
  1290         //  and make what was the current buffer be the new alternate.
  1291         UTF8Buf *u8b = (UTF8Buf *)ut->q;
  1292         ut->q = ut->p;
  1293         ut->p = u8b;
  1295         UChar   *buf = u8b->buf;
  1296         uint8_t *mapToNative = u8b->mapToNative;
  1297         uint8_t *mapToUChars = u8b->mapToUChars;
               // Native index that corresponds to mapToUChars[0] for this fill.
  1298         int32_t  toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1);
  1299         int32_t  destIx = UTF8_TEXT_CHUNK_SIZE+2;   // Start in the overflow region
  1300                                                     //   at end of buffer to leave room
  1301                                                     //   for a surrogate pair at the
  1302                                                     //   buffer start.
  1303         int32_t  srcIx  = ix;
  1304         int32_t  bufNILimit = destIx;
  1305         UChar32   c;
  1307         // Map to/from Native Indexes, fill in for the position at the end of
  1308         //   the buffer.
  1309         //
  1310         mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1311         mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1313         // Fill the chunk buffer
  1314         // Work backwards, filling from the end of the buffer towards the front.
  1315         //
  1316         while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) {
  1317             srcIx--;
  1318             destIx--;
  1320             // Get last byte of the UTF-8 character
  1321             c = s8[srcIx];
  1322             if (c<0x80) {
  1323                 // Special case ASCII range for speed.
  1324                 buf[destIx] = (UChar)c;
  1325                 mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx;
  1326                 mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1327             } else {
  1328                 // General case, handle everything non-ASCII.
  1330                 int32_t  sIx      = srcIx;  // ix of last byte of multi-byte u8 char
  1332                 // Get the full character from the UTF8 string.
  1333                 //   use code derived from the macros in utf8.h
  1334                 //   Leaves srcIx pointing at the first byte of the UTF-8 char.
  1335                 //
  1336                 c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3);
  1337                 // leaves srcIx at first byte of the multi-byte char.
  1339                 // Store the character in UTF-16 buffer.
  1340                 if (c<0x10000) {
  1341                     buf[destIx] = (UChar)c;
  1342                     mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1343                 } else {
  1344                     buf[destIx]         = U16_TRAIL(c);
  1345                     mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1346                     buf[--destIx]       = U16_LEAD(c);
  1347                     mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart);
  1350                 // Fill in the map from native indexes to UChars buf index.
  1351                 do {
  1352                     mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx;
  1353                 } while (sIx >= srcIx);
  1355                 // Set native indexing limit to be the current position.
  1356                 //   We are processing a non-ascii, non-native-indexing char now;
  1357                 //     the limit will be here if the rest of the chars to be
  1358                 //     added to this buffer are ascii.
  1359                 bufNILimit = destIx;
               // Publish the descriptor of the freshly filled buffer.
  1362         u8b->bufNativeStart     = srcIx;
  1363         u8b->bufNativeLimit     = ix;
  1364         u8b->bufStartIdx        = destIx;
  1365         u8b->bufLimitIdx        = UTF8_TEXT_CHUNK_SIZE+2;
  1366         u8b->bufNILimit         = bufNILimit - u8b->bufStartIdx;
  1367         u8b->toUCharsMapStart   = toUCharsMapStart;
               // Point the UText chunk at it, positioned at the chunk's end.
  1369         ut->chunkContents       = &buf[u8b->bufStartIdx];
  1370         ut->chunkLength         = u8b->bufLimitIdx - u8b->bufStartIdx;
  1371         ut->chunkOffset         = ut->chunkLength;
  1372         ut->chunkNativeStart    = u8b->bufNativeStart;
  1373         ut->chunkNativeLimit    = u8b->bufNativeLimit;
  1374         ut->nativeIndexingLimit = u8b->bufNILimit;
  1375         return TRUE;
  1382 //
  1383 //  This is a slightly modified copy of u_strFromUTF8,
  1384 //     Inserts a Replacement Char rather than failing on invalid UTF-8
  1385 //     Removes unnecessary features.
  1386 //
  1387 static UChar*
  1388 utext_strFromUTF8(UChar *dest,
  1389               int32_t destCapacity,
  1390               int32_t *pDestLength,
  1391               const char* src,
  1392               int32_t srcLength,        // required.  NUL terminated not supported.
  1393               UErrorCode *pErrorCode
  1397     UChar *pDest = dest;
  1398     UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
  1399     UChar32 ch=0;
  1400     int32_t index = 0;
  1401     int32_t reqLength = 0;
  1402     uint8_t* pSrc = (uint8_t*) src;
  1405     while((index < srcLength)&&(pDest<pDestLimit)){
  1406         ch = pSrc[index++];
  1407         if(ch <=0x7f){
  1408             *pDest++=(UChar)ch;
  1409         }else{
  1410             ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
  1411             if(U_IS_BMP(ch)){
  1412                 *(pDest++)=(UChar)ch;
  1413             }else{
  1414                 *(pDest++)=U16_LEAD(ch);
  1415                 if(pDest<pDestLimit){
  1416                     *(pDest++)=U16_TRAIL(ch);
  1417                 }else{
  1418                     reqLength++;
  1419                     break;
  1424     /* donot fill the dest buffer just count the UChars needed */
  1425     while(index < srcLength){
  1426         ch = pSrc[index++];
  1427         if(ch <= 0x7f){
  1428             reqLength++;
  1429         }else{
  1430             ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3);
  1431             reqLength+=U16_LENGTH(ch);
  1435     reqLength+=(int32_t)(pDest - dest);
  1437     if(pDestLength){
  1438         *pDestLength = reqLength;
  1441     /* Terminate the buffer */
  1442     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
  1444     return dest;
  1449 static int32_t U_CALLCONV
  1450 utf8TextExtract(UText *ut,
  1451                 int64_t start, int64_t limit,
  1452                 UChar *dest, int32_t destCapacity,
  1453                 UErrorCode *pErrorCode) {
  1454     if(U_FAILURE(*pErrorCode)) {
  1455         return 0;
  1457     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
  1458         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1459         return 0;
  1461     int32_t  length  = ut->b;
  1462     int32_t  start32 = pinIndex(start, length);
  1463     int32_t  limit32 = pinIndex(limit, length);
  1465     if(start32>limit32) {
  1466         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  1467         return 0;
  1471     // adjust the incoming indexes to land on code point boundaries if needed.
  1472     //    adjust by no more than three, because that is the largest number of trail bytes
  1473     //    in a well formed UTF8 character.
  1474     const uint8_t *buf = (const uint8_t *)ut->context;
  1475     int i;
  1476     if (start32 < ut->chunkNativeLimit) {
  1477         for (i=0; i<3; i++) {
  1478             if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) {
  1479                 break;
  1481             start32--;
  1485     if (limit32 < ut->chunkNativeLimit) {
  1486         for (i=0; i<3; i++) {
  1487             if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) {
  1488                 break;
  1490             limit32--;
  1494     // Do the actual extract.
  1495     int32_t destLength=0;
  1496     utext_strFromUTF8(dest, destCapacity, &destLength,
  1497                     (const char *)ut->context+start32, limit32-start32,
  1498                     pErrorCode);
  1499     utf8TextAccess(ut, limit32, TRUE);
  1500     return destLength;
  1503 //
  1504 // utf8TextMapOffsetToNative
  1505 //
  1506 // Map a chunk (UTF-16) offset to a native index.
  1507 static int64_t U_CALLCONV
  1508 utf8TextMapOffsetToNative(const UText *ut) {
  1509     //
  1510     UTF8Buf *u8b = (UTF8Buf *)ut->p;
  1511     U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength);
  1512     int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart;
  1513     U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit);
  1514     return nativeOffset;
  1517 //
  1518 // Map a native index to the corrsponding chunk offset
  1519 //
  1520 static int32_t U_CALLCONV
  1521 utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) {
  1522     U_ASSERT(index64 <= 0x7fffffff);
  1523     int32_t index = (int32_t)index64;
  1524     UTF8Buf *u8b = (UTF8Buf *)ut->p;
  1525     U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit);
  1526     U_ASSERT(index<=ut->chunkNativeLimit);
  1527     int32_t mapIndex = index - u8b->toUCharsMapStart;
  1528     int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx;
  1529     U_ASSERT(offset>=0 && offset<=ut->chunkLength);
  1530     return offset;
  1533 static UText * U_CALLCONV
  1534 utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status)
  1536     // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  1537     dest = shallowTextClone(dest, src, status);
  1539     // For deep clones, make a copy of the string.
  1540     //  The copied storage is owned by the newly created clone.
  1541     //
  1542     // TODO:  There is an isssue with using utext_nativeLength().
  1543     //        That function is non-const in cases where the input was NUL terminated
  1544     //          and the length has not yet been determined.
  1545     //        This function (clone()) is const.
  1546     //        There potentially a thread safety issue lurking here.
  1547     //
  1548     if (deep && U_SUCCESS(*status)) {
  1549         int32_t  len = (int32_t)utext_nativeLength((UText *)src);
  1550         char *copyStr = (char *)uprv_malloc(len+1);
  1551         if (copyStr == NULL) {
  1552             *status = U_MEMORY_ALLOCATION_ERROR;
  1553         } else {
  1554             uprv_memcpy(copyStr, src->context, len+1);
  1555             dest->context = copyStr;
  1556             dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1559     return dest;
  1563 static void U_CALLCONV
  1564 utf8TextClose(UText *ut) {
  1565     // Most of the work of close is done by the generic UText framework close.
  1566     // All that needs to be done here is to delete the UTF8 string if the UText
  1567     //  owns it.  This occurs if the UText was created by cloning.
  1568     if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  1569         char *s = (char *)ut->context;
  1570         uprv_free(s);
  1571         ut->context = NULL;
  1575 U_CDECL_END
  1578 static const struct UTextFuncs utf8Funcs = 
       // Function table installed by utext_openUTF8() as ut->pFuncs.
       //   The UTF-8 provider is read-only: the replace and copy slots are NULL.
  1580     sizeof(UTextFuncs),
  1581     0, 0, 0,             // Reserved alignment padding
  1582     utf8TextClone,
  1583     utf8TextLength,
  1584     utf8TextAccess,
  1585     utf8TextExtract,
  1586     NULL,                /* replace*/
  1587     NULL,                /* copy   */
  1588     utf8TextMapOffsetToNative,
  1589     utf8TextMapIndexToUTF16,
  1590     utf8TextClose,
  1591     NULL,                // spare 1
  1592     NULL,                // spare 2
  1593     NULL                 // spare 3
  1594 };
  1597 static const char gEmptyString[] = {0};
  1599 U_CAPI UText * U_EXPORT2
  1600 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) {
  1601     if(U_FAILURE(*status)) {
  1602         return NULL;
  1604     if(s==NULL && length==0) {
  1605         s = gEmptyString;
  1608     if(s==NULL || length<-1 || length>INT32_MAX) {
  1609         *status=U_ILLEGAL_ARGUMENT_ERROR;
  1610         return NULL;
  1613     ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status);
  1614     if (U_FAILURE(*status)) {
  1615         return ut;
  1618     ut->pFuncs  = &utf8Funcs;
  1619     ut->context = s;
  1620     ut->b       = (int32_t)length;
  1621     ut->c       = (int32_t)length;
  1622     if (ut->c < 0) {
  1623         ut->c = 0;
  1624         ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  1626     ut->p = ut->pExtra;
  1627     ut->q = (char *)ut->pExtra + sizeof(UTF8Buf);
  1628     return ut;
  1639 //------------------------------------------------------------------------------
  1640 //
  1641 //     UText implementation wrapper for Replaceable (read/write)
  1642 //
  1643 //         Use of UText data members:
  1644 //            context    pointer to Replaceable.  Owned by the UText, and deleted on
  1645 //                       close, when UTEXT_PROVIDER_OWNS_TEXT is set.
  1646 //
  1647 //------------------------------------------------------------------------------
  1651 // minimum chunk size for this implementation: 3
  1652 // to allow for possible trimming for code point boundaries
  1653 enum { REP_TEXT_CHUNK_SIZE=10 };
  1655 struct ReplExtra {
  1656     /*
  1657      * Chunk UChars.
  1658      * +1 to simplify filling with surrogate pair at the end.
  1659      */
  1660     UChar s[REP_TEXT_CHUNK_SIZE+1];
  1661 };
  1664 U_CDECL_BEGIN
  1666 static UText * U_CALLCONV
  1667 repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  1668     // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  1669     dest = shallowTextClone(dest, src, status);
  1671     // For deep clones, make a copy of the Replaceable.
  1672     //  The copied Replaceable storage is owned by the newly created UText clone.
  1673     //  A non-NULL pointer in UText.p is the signal to the close() function to delete
  1674     //    it.
  1675     //
  1676     if (deep && U_SUCCESS(*status)) {
  1677         const Replaceable *replSrc = (const Replaceable *)src->context;
  1678         dest->context = replSrc->clone();
  1679         dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  1681         // with deep clone, the copy is writable, even when the source is not.
  1682         dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  1684     return dest;
  1688 static void U_CALLCONV
  1689 repTextClose(UText *ut) {
  1690     // Most of the work of close is done by the generic UText framework close.
  1691     // All that needs to be done here is delete the Replaceable if the UText
  1692     //  owns it.  This occurs if the UText was created by cloning.
  1693     if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  1694         Replaceable *rep = (Replaceable *)ut->context;
  1695         delete rep;
  1696         ut->context = NULL;
  1701 static int64_t U_CALLCONV
  1702 repTextLength(UText *ut) {
  1703     const Replaceable *replSrc = (const Replaceable *)ut->context;
  1704     int32_t  len = replSrc->length();
  1705     return len;
  1709 static UBool U_CALLCONV
  1710 repTextAccess(UText *ut, int64_t index, UBool forward) {
  1711     const Replaceable *rep=(const Replaceable *)ut->context;
  1712     int32_t length=rep->length();   // Full length of the input text (bigger than a chunk)
  1714     // clip the requested index to the limits of the text.
  1715     int32_t index32 = pinIndex(index, length);
  1716     U_ASSERT(index<=INT32_MAX);
  1719     /*
  1720      * Compute start/limit boundaries around index, for a segment of text
  1721      * to be extracted.
  1722      * To allow for the possibility that our user gave an index to the trailing
  1723      * half of a surrogate pair, we must request one extra preceding UChar when
  1724      * going in the forward direction.  This will ensure that the buffer has the
  1725      * entire code point at the specified index.
  1726      */
  1727     if(forward) {
  1729         if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) {
  1730             // Buffer already contains the requested position.
  1731             ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart);
  1732             return TRUE;
  1734         if (index32>=length && ut->chunkNativeLimit==length) {
  1735             // Request for end of string, and buffer already extends up to it.
  1736             // Can't get the data, but don't change the buffer.
  1737             ut->chunkOffset = length - (int32_t)ut->chunkNativeStart;
  1738             return FALSE;
  1741         ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1;
  1742         // Going forward, so we want to have the buffer with stuff at and beyond
  1743         //   the requested index.  The -1 gets us one code point before the
  1744         //   requested index also, to handle the case of the index being on
  1745         //   a trail surrogate of a surrogate pair.
  1746         if(ut->chunkNativeLimit > length) {
  1747             ut->chunkNativeLimit = length;
  1749         // unless buffer ran off end, start is index-1.
  1750         ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE;
  1751         if(ut->chunkNativeStart < 0) {
  1752             ut->chunkNativeStart = 0;
  1754     } else {
  1755         // Reverse iteration.  Fill buffer with data preceding the requested index.
  1756         if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) {
  1757             // Requested position already in buffer.
  1758             ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart;
  1759             return TRUE;
  1761         if (index32==0 && ut->chunkNativeStart==0) {
  1762             // Request for start, buffer already begins at start.
  1763             //  No data, but keep the buffer as is.
  1764             ut->chunkOffset = 0;
  1765             return FALSE;
  1768         // Figure out the bounds of the chunk to extract for reverse iteration.
  1769         // Need to worry about chunk not splitting surrogate pairs, and while still
  1770         // containing the data we need.
  1771         // Fix by requesting a chunk that includes an extra UChar at the end.
  1772         // If this turns out to be a lead surrogate, we can lop it off and still have
  1773         //   the data we wanted.
  1774         ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE;
  1775         if (ut->chunkNativeStart < 0) {
  1776             ut->chunkNativeStart = 0;
  1779         ut->chunkNativeLimit = index32 + 1;
  1780         if (ut->chunkNativeLimit > length) {
  1781             ut->chunkNativeLimit = length;
  1785     // Extract the new chunk of text from the Replaceable source.
  1786     ReplExtra *ex = (ReplExtra *)ut->pExtra;
  1787     // UnicodeString with its buffer a writable alias to the chunk buffer
  1788     UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/);
  1789     rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer);
  1791     ut->chunkContents  = ex->s;
  1792     ut->chunkLength    = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart);
  1793     ut->chunkOffset    = (int32_t)(index32 - ut->chunkNativeStart);
  1795     // Surrogate pairs from the input text must not span chunk boundaries.
  1796     // If end of chunk could be the start of a surrogate, trim it off.
  1797     if (ut->chunkNativeLimit < length &&
  1798         U16_IS_LEAD(ex->s[ut->chunkLength-1])) {
  1799             ut->chunkLength--;
  1800             ut->chunkNativeLimit--;
  1801             if (ut->chunkOffset > ut->chunkLength) {
  1802                 ut->chunkOffset = ut->chunkLength;
  1806     // if the first UChar in the chunk could be the trailing half of a surrogate pair,
  1807     // trim it off.
  1808     if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) {
  1809         ++(ut->chunkContents);
  1810         ++(ut->chunkNativeStart);
  1811         --(ut->chunkLength);
  1812         --(ut->chunkOffset);
  1815     // adjust the index/chunkOffset to a code point boundary
  1816     U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset);
  1818     // Use fast indexing for get/setNativeIndex()
  1819     ut->nativeIndexingLimit = ut->chunkLength;
  1821     return TRUE;
  1826 static int32_t U_CALLCONV
  1827 repTextExtract(UText *ut,
  1828                int64_t start, int64_t limit,
  1829                UChar *dest, int32_t destCapacity,
  1830                UErrorCode *status) {
  1831     const Replaceable *rep=(const Replaceable *)ut->context;
  1832     int32_t  length=rep->length();
  1834     if(U_FAILURE(*status)) {
  1835         return 0;
  1837     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
  1838         *status=U_ILLEGAL_ARGUMENT_ERROR;
  1840     if(start>limit) {
  1841         *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1842         return 0;
  1845     int32_t  start32 = pinIndex(start, length);
  1846     int32_t  limit32 = pinIndex(limit, length);
  1848     // adjust start, limit if they point to trail half of surrogates
  1849     if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) &&
  1850         U_IS_SUPPLEMENTARY(rep->char32At(start32))){
  1851             start32--;
  1853     if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) &&
  1854         U_IS_SUPPLEMENTARY(rep->char32At(limit32))){
  1855             limit32--;
  1858     length=limit32-start32;
  1859     if(length>destCapacity) {
  1860         limit32 = start32 + destCapacity;
  1862     UnicodeString buffer(dest, 0, destCapacity); // writable alias
  1863     rep->extractBetween(start32, limit32, buffer);
  1864     repTextAccess(ut, limit32, TRUE);
  1866     return u_terminateUChars(dest, destCapacity, length, status);
  1869 static int32_t U_CALLCONV
  1870 repTextReplace(UText *ut,
  1871                int64_t start, int64_t limit,
  1872                const UChar *src, int32_t length,
  1873                UErrorCode *status) {
  1874     Replaceable *rep=(Replaceable *)ut->context;
  1875     int32_t oldLength;
  1877     if(U_FAILURE(*status)) {
  1878         return 0;
  1880     if(src==NULL && length!=0) {
  1881         *status=U_ILLEGAL_ARGUMENT_ERROR;
  1882         return 0;
  1884     oldLength=rep->length(); // will subtract from new length
  1885     if(start>limit ) {
  1886         *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1887         return 0;
  1890     int32_t start32 = pinIndex(start, oldLength);
  1891     int32_t limit32 = pinIndex(limit, oldLength);
  1893     // Snap start & limit to code point boundaries.
  1894     if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) &&
  1895         start32>0 && U16_IS_LEAD(rep->charAt(start32-1)))
  1897             start32--;
  1899     if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) &&
  1900         U16_IS_TRAIL(rep->charAt(limit32)))
  1902             limit32++;
  1905     // Do the actual replace operation using methods of the Replaceable class
  1906     UnicodeString replStr((UBool)(length<0), src, length); // read-only alias
  1907     rep->handleReplaceBetween(start32, limit32, replStr);
  1908     int32_t newLength = rep->length();
  1909     int32_t lengthDelta = newLength - oldLength;
  1911     // Is the UText chunk buffer OK?
  1912     if (ut->chunkNativeLimit > start32) {
  1913         // this replace operation may have impacted the current chunk.
  1914         // invalidate it, which will force a reload on the next access.
  1915         invalidateChunk(ut);
  1918     // set the iteration position to the end of the newly inserted replacement text.
  1919     int32_t newIndexPos = limit32 + lengthDelta;
  1920     repTextAccess(ut, newIndexPos, TRUE);
  1922     return lengthDelta;
  1926 static void U_CALLCONV
  1927 repTextCopy(UText *ut,
  1928                 int64_t start, int64_t limit,
  1929                 int64_t destIndex,
  1930                 UBool move,
  1931                 UErrorCode *status)
  1933     Replaceable *rep=(Replaceable *)ut->context;
  1934     int32_t length=rep->length();
  1936     if(U_FAILURE(*status)) {
  1937         return;
  1939     if (start>limit || (start<destIndex && destIndex<limit))
  1941         *status=U_INDEX_OUTOFBOUNDS_ERROR;
  1942         return;
  1945     int32_t start32     = pinIndex(start, length);
  1946     int32_t limit32     = pinIndex(limit, length);
  1947     int32_t destIndex32 = pinIndex(destIndex, length);
  1949     // TODO:  snap input parameters to code point boundaries.
  1951     if(move) {
  1952         // move: copy to destIndex, then replace original with nothing
  1953         int32_t segLength=limit32-start32;
  1954         rep->copy(start32, limit32, destIndex32);
  1955         if(destIndex32<start32) {
  1956             start32+=segLength;
  1957             limit32+=segLength;
  1959         rep->handleReplaceBetween(start32, limit32, UnicodeString());
  1960     } else {
  1961         // copy
  1962         rep->copy(start32, limit32, destIndex32);
  1965     // If the change to the text touched the region in the chunk buffer,
  1966     //  invalidate the buffer.
  1967     int32_t firstAffectedIndex = destIndex32;
  1968     if (move && start32<firstAffectedIndex) {
  1969         firstAffectedIndex = start32;
  1971     if (firstAffectedIndex < ut->chunkNativeLimit) {
  1972         // changes may have affected range covered by the chunk
  1973         invalidateChunk(ut);
  1976     // Put iteration position at the newly inserted (moved) block,
  1977     int32_t  nativeIterIndex = destIndex32 + limit32 - start32;
  1978     if (move && destIndex32>start32) {
  1979         // moved a block of text towards the end of the string.
  1980         nativeIterIndex = destIndex32;
  1983     // Set position, reload chunk if needed.
  1984     repTextAccess(ut, nativeIterIndex, TRUE);
  1987 static const struct UTextFuncs repFuncs = 
  1989     sizeof(UTextFuncs),
  1990     0, 0, 0,           // Reserved alignment padding
  1991     repTextClone,
  1992     repTextLength,
  1993     repTextAccess,
  1994     repTextExtract,
  1995     repTextReplace,   
  1996     repTextCopy,   
  1997     NULL,              // MapOffsetToNative,
  1998     NULL,              // MapIndexToUTF16,
  1999     repTextClose,
  2000     NULL,              // spare 1
  2001     NULL,              // spare 2
  2002     NULL               // spare 3
  2003 };
  2006 U_CAPI UText * U_EXPORT2
  2007 utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status)
  2009     if(U_FAILURE(*status)) {
  2010         return NULL;
  2012     if(rep==NULL) {
  2013         *status=U_ILLEGAL_ARGUMENT_ERROR;
  2014         return NULL;
  2016     ut = utext_setup(ut, sizeof(ReplExtra), status);
  2018     ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  2019     if(rep->hasMetaData()) {
  2020         ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA);
  2023     ut->pFuncs  = &repFuncs;
  2024     ut->context =  rep;
  2025     return ut;
  2028 U_CDECL_END
  2037 //------------------------------------------------------------------------------
  2038 //
  2039 //     UText implementation for UnicodeString (read/write)  and
  2040 //                    for const UnicodeString (read only)
  2041 //             (same implementation, only the flags are different)
  2042 //
  2043 //         Use of UText data members:
  2044 //            context    pointer to UnicodeString.  When the provider property
  2045 //                       UTEXT_PROVIDER_OWNS_TEXT is set, this UText owns the
  2046 //                       string and it is deleted on close().
  2047 //
  2048 //------------------------------------------------------------------------------
  2050 U_CDECL_BEGIN
  2053 static UText * U_CALLCONV
  2054 unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) {
  2055     // First do a generic shallow clone.  Does everything needed for the UText struct itself.
  2056     dest = shallowTextClone(dest, src, status);
  2058     // For deep clones, make a copy of the UnicodeSring.
  2059     //  The copied UnicodeString storage is owned by the newly created UText clone.
  2060     //  A non-NULL pointer in UText.p is the signal to the close() function to delete
  2061     //    the UText.
  2062     //
  2063     if (deep && U_SUCCESS(*status)) {
  2064         const UnicodeString *srcString = (const UnicodeString *)src->context;
  2065         dest->context = new UnicodeString(*srcString);
  2066         dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  2068         // with deep clone, the copy is writable, even when the source is not.
  2069         dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  2071     return dest;
  2074 static void U_CALLCONV
  2075 unistrTextClose(UText *ut) {
  2076     // Most of the work of close is done by the generic UText framework close.
  2077     // All that needs to be done here is delete the UnicodeString if the UText
  2078     //  owns it.  This occurs if the UText was created by cloning.
  2079     if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  2080         UnicodeString *str = (UnicodeString *)ut->context;
  2081         delete str;
  2082         ut->context = NULL;
  2087 static int64_t U_CALLCONV
  2088 unistrTextLength(UText *t) {
  2089     return ((const UnicodeString *)t->context)->length();
  2093 static UBool U_CALLCONV
  2094 unistrTextAccess(UText *ut, int64_t index, UBool  forward) {
  2095     int32_t length  = ut->chunkLength;
  2096     ut->chunkOffset = pinIndex(index, length);
  2098     // Check whether request is at the start or end
  2099     UBool retVal = (forward && index<length) || (!forward && index>0);
  2100     return retVal;
  2105 static int32_t U_CALLCONV
  2106 unistrTextExtract(UText *t,
  2107                   int64_t start, int64_t limit,
  2108                   UChar *dest, int32_t destCapacity,
  2109                   UErrorCode *pErrorCode) {
  2110     const UnicodeString *us=(const UnicodeString *)t->context;
  2111     int32_t length=us->length();
  2113     if(U_FAILURE(*pErrorCode)) {
  2114         return 0;
  2116     if(destCapacity<0 || (dest==NULL && destCapacity>0)) {
  2117         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  2119     if(start<0 || start>limit) {
  2120         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  2121         return 0;
  2124     int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length;
  2125     int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length;
  2127     length=limit32-start32;
  2128     if (destCapacity>0 && dest!=NULL) {
  2129         int32_t trimmedLength = length;
  2130         if(trimmedLength>destCapacity) {
  2131             trimmedLength=destCapacity;
  2133         us->extract(start32, trimmedLength, dest);
  2134         t->chunkOffset = start32+trimmedLength;
  2135     } else {
  2136         t->chunkOffset = start32;
  2138     u_terminateUChars(dest, destCapacity, length, pErrorCode);
  2139     return length;
  2142 static int32_t U_CALLCONV
  2143 unistrTextReplace(UText *ut,
  2144                   int64_t start, int64_t limit,
  2145                   const UChar *src, int32_t length,
  2146                   UErrorCode *pErrorCode) {
  2147     UnicodeString *us=(UnicodeString *)ut->context;
  2148     int32_t oldLength;
  2150     if(U_FAILURE(*pErrorCode)) {
  2151         return 0;
  2153     if(src==NULL && length!=0) {
  2154         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  2156     if(start>limit) {
  2157         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  2158         return 0;
  2160     oldLength=us->length();
  2161     int32_t start32 = pinIndex(start, oldLength);
  2162     int32_t limit32 = pinIndex(limit, oldLength);
  2163     if (start32 < oldLength) {
  2164         start32 = us->getChar32Start(start32);
  2166     if (limit32 < oldLength) {
  2167         limit32 = us->getChar32Start(limit32);
  2170     // replace
  2171     us->replace(start32, limit32-start32, src, length);
  2172     int32_t newLength = us->length();
  2174     // Update the chunk description.
  2175     ut->chunkContents    = us->getBuffer();
  2176     ut->chunkLength      = newLength;
  2177     ut->chunkNativeLimit = newLength;
  2178     ut->nativeIndexingLimit = newLength;
  2180     // Set iteration position to the point just following the newly inserted text.
  2181     int32_t lengthDelta = newLength - oldLength;
  2182     ut->chunkOffset = limit32 + lengthDelta;
  2184     return lengthDelta;
  2187 static void U_CALLCONV
  2188 unistrTextCopy(UText *ut,
  2189                int64_t start, int64_t limit,
  2190                int64_t destIndex,
  2191                UBool move,
  2192                UErrorCode *pErrorCode) {
  2193     UnicodeString *us=(UnicodeString *)ut->context;
  2194     int32_t length=us->length();
  2196     if(U_FAILURE(*pErrorCode)) {
  2197         return;
  2199     int32_t start32 = pinIndex(start, length);
  2200     int32_t limit32 = pinIndex(limit, length);
  2201     int32_t destIndex32 = pinIndex(destIndex, length);
  2203     if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) {
  2204         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
  2205         return;
  2208     if(move) {
  2209         // move: copy to destIndex, then replace original with nothing
  2210         int32_t segLength=limit32-start32;
  2211         us->copy(start32, limit32, destIndex32);
  2212         if(destIndex32<start32) {
  2213             start32+=segLength;
  2215         us->replace(start32, segLength, NULL, 0);
  2216     } else {
  2217         // copy
  2218         us->copy(start32, limit32, destIndex32);
  2221     // update chunk description, set iteration position.
  2222     ut->chunkContents = us->getBuffer();
  2223     if (move==FALSE) {
  2224         // copy operation, string length grows
  2225         ut->chunkLength += limit32-start32;
  2226         ut->chunkNativeLimit = ut->chunkLength;
  2227         ut->nativeIndexingLimit = ut->chunkLength;
  2230     // Iteration position to end of the newly inserted text.
  2231     ut->chunkOffset = destIndex32+limit32-start32;
  2232     if (move && destIndex32>start32) {
  2233         ut->chunkOffset = destIndex32;
  2238 static const struct UTextFuncs unistrFuncs = 
  2240     sizeof(UTextFuncs),
  2241     0, 0, 0,             // Reserved alignment padding
  2242     unistrTextClone,
  2243     unistrTextLength,
  2244     unistrTextAccess,
  2245     unistrTextExtract,
  2246     unistrTextReplace,   
  2247     unistrTextCopy,   
  2248     NULL,                // MapOffsetToNative,
  2249     NULL,                // MapIndexToUTF16,
  2250     unistrTextClose,
  2251     NULL,                // spare 1
  2252     NULL,                // spare 2
  2253     NULL                 // spare 3
  2254 };
  2258 U_CDECL_END
  2261 U_CAPI UText * U_EXPORT2
  2262 utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) {
  2263     ut = utext_openConstUnicodeString(ut, s, status);
  2264     if (U_SUCCESS(*status)) {
  2265         ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE);
  2267     return ut;
  2272 U_CAPI UText * U_EXPORT2
  2273 utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) {
  2274     if (U_SUCCESS(*status) && s->isBogus()) {
  2275         // The UnicodeString is bogus, but we still need to detach the UText
  2276         //   from whatever it was hooked to before, if anything.
  2277         utext_openUChars(ut, NULL, 0, status);
  2278         *status = U_ILLEGAL_ARGUMENT_ERROR;
  2279         return ut;
  2281     ut = utext_setup(ut, 0, status);
  2282     //    note:  use the standard (writable) function table for UnicodeString.
  2283     //           The flag settings disable writing, so having the functions in
  2284     //           the table is harmless.
  2285     if (U_SUCCESS(*status)) {
  2286         ut->pFuncs              = &unistrFuncs;
  2287         ut->context             = s;
  2288         ut->providerProperties  = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
  2289         ut->chunkContents       = s->getBuffer();
  2290         ut->chunkLength         = s->length();
  2291         ut->chunkNativeStart    = 0;
  2292         ut->chunkNativeLimit    = ut->chunkLength;
  2293         ut->nativeIndexingLimit = ut->chunkLength;
  2295     return ut;
  2298 //------------------------------------------------------------------------------
  2299 //
  2300 //     UText implementation for const UChar * strings
  2301 //
  2302 //         Use of UText data members:
  2303 //            context    pointer to the UChar string
  2304 //            a          length.  -1 if not yet known.
  2305 //
  2306 //         TODO:  support 64 bit lengths.
  2307 //
  2308 //------------------------------------------------------------------------------
  2310 U_CDECL_BEGIN
  2313 static UText * U_CALLCONV
  2314 ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) {
  2315     // First do a generic shallow clone.
  2316     dest = shallowTextClone(dest, src, status);
  2318     // For deep clones, make a copy of the string.
  2319     //  The copied storage is owned by the newly created clone.
  2320     //  A non-NULL pointer in UText.p is the signal to the close() function to delete
  2321     //    it.
  2322     //
  2323     if (deep && U_SUCCESS(*status)) {
  2324         U_ASSERT(utext_nativeLength(dest) < INT32_MAX);
  2325         int32_t  len = (int32_t)utext_nativeLength(dest);
  2327         // The cloned string IS going to be NUL terminated, whether or not the original was.
  2328         const UChar *srcStr = (const UChar *)src->context;
  2329         UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar));
  2330         if (copyStr == NULL) {
  2331             *status = U_MEMORY_ALLOCATION_ERROR;
  2332         } else {
  2333             int64_t i;
  2334             for (i=0; i<len; i++) {
  2335                 copyStr[i] = srcStr[i];
  2337             copyStr[len] = 0;
  2338             dest->context = copyStr;
  2339             dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT);
  2342     return dest;
  2346 static void U_CALLCONV
  2347 ucstrTextClose(UText *ut) {
  2348     // Most of the work of close is done by the generic UText framework close.
  2349     // All that needs to be done here is delete the string if the UText
  2350     //  owns it.  This occurs if the UText was created by cloning.
  2351     if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) {
  2352         UChar *s = (UChar *)ut->context;
  2353         uprv_free(s);
  2354         ut->context = NULL;
  2360 static int64_t U_CALLCONV
  2361 ucstrTextLength(UText *ut) {
  2362     if (ut->a < 0) {
  2363         // null terminated, we don't yet know the length.  Scan for it.
  2364         //    Access is not convenient for doing this
  2365         //    because the current interation postion can't be changed.
  2366         const UChar  *str = (const UChar *)ut->context;
  2367         for (;;) {
  2368             if (str[ut->chunkNativeLimit] == 0) {
  2369                 break;
  2371             ut->chunkNativeLimit++;
  2373         ut->a = ut->chunkNativeLimit;
  2374         ut->chunkLength = (int32_t)ut->chunkNativeLimit;
  2375         ut->nativeIndexingLimit = ut->chunkLength;
  2376         ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2378     return ut->a;
  2382 static UBool U_CALLCONV
  2383 ucstrTextAccess(UText *ut, int64_t index, UBool  forward) {
  2384     const UChar *str   = (const UChar *)ut->context;
  2386     // pin the requested index to the bounds of the string,
  2387     //  and set current iteration position.
  2388     if (index<0) {
  2389         index = 0;
  2390     } else if (index < ut->chunkNativeLimit) {
  2391         // The request data is within the chunk as it is known so far.
  2392         // Put index on a code point boundary.
  2393         U16_SET_CP_START(str, 0, index);
  2394     } else if (ut->a >= 0) {
  2395         // We know the length of this string, and the user is requesting something
  2396         // at or beyond the length.  Pin the requested index to the length.
  2397         index = ut->a;
  2398     } else {
  2399         // Null terminated string, length not yet known, and the requested index
  2400         //  is beyond where we have scanned so far.
  2401         //  Scan to 32 UChars beyond the requested index.  The strategy here is
  2402         //  to avoid fully scanning a long string when the caller only wants to
  2403         //  see a few characters at its beginning.
  2404         int32_t scanLimit = (int32_t)index + 32;
  2405         if ((index + 32)>INT32_MAX || (index + 32)<0 ) {   // note: int64 expression
  2406             scanLimit = INT32_MAX;
  2409         int32_t chunkLimit = (int32_t)ut->chunkNativeLimit;
  2410         for (; chunkLimit<scanLimit; chunkLimit++) {
  2411             if (str[chunkLimit] == 0) {
  2412                 // We found the end of the string.  Remember it, pin the requested index to it,
  2413                 //  and bail out of here.
  2414                 ut->a = chunkLimit;
  2415                 ut->chunkLength = chunkLimit;
  2416                 ut->nativeIndexingLimit = chunkLimit;
  2417                 if (index >= chunkLimit) {
  2418                     index = chunkLimit;
  2419                 } else {
  2420                     U16_SET_CP_START(str, 0, index);
  2423                 ut->chunkNativeLimit = chunkLimit;
  2424                 ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2425                 goto breakout;
  2428         // We scanned through the next batch of UChars without finding the end.
  2429         U16_SET_CP_START(str, 0, index);
  2430         if (chunkLimit == INT32_MAX) {
  2431             // Scanned to the limit of a 32 bit length.
  2432             // Forceably trim the overlength string back so length fits in int32
  2433             //  TODO:  add support for 64 bit strings.
  2434             ut->a = chunkLimit;
  2435             ut->chunkLength = chunkLimit;
  2436             ut->nativeIndexingLimit = chunkLimit;
  2437             if (index > chunkLimit) {
  2438                 index = chunkLimit;
  2440             ut->chunkNativeLimit = chunkLimit;
  2441             ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2442         } else {
  2443             // The endpoint of a chunk must not be left in the middle of a surrogate pair.
  2444             // If the current end is on a lead surrogate, back the end up by one.
  2445             // It doesn't matter if the end char happens to be an unpaired surrogate,
  2446             //    and it's simpler not to worry about it.
  2447             if (U16_IS_LEAD(str[chunkLimit-1])) {
  2448                 --chunkLimit;
  2450             // Null-terminated chunk with end still unknown.
  2451             // Update the chunk length to reflect what has been scanned thus far.
  2452             // That the full length is still unknown is (still) flagged by
  2453             //    ut->a being < 0.
  2454             ut->chunkNativeLimit = chunkLimit;
  2455             ut->nativeIndexingLimit = chunkLimit;
  2456             ut->chunkLength = chunkLimit;
  2460 breakout:
  2461     U_ASSERT(index<=INT32_MAX);
  2462     ut->chunkOffset = (int32_t)index;
  2464     // Check whether request is at the start or end
  2465     UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0);
  2466     return retVal;
  2471 static int32_t U_CALLCONV
  2472 ucstrTextExtract(UText *ut,
  2473                   int64_t start, int64_t limit,
  2474                   UChar *dest, int32_t destCapacity,
  2475                   UErrorCode *pErrorCode)
  2477     if(U_FAILURE(*pErrorCode)) {
  2478         return 0;
  2480     if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
  2481         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  2482         return 0;
  2485     //const UChar *s=(const UChar *)ut->context;
  2486     int32_t si, di;
  2488     int32_t start32;
  2489     int32_t limit32;
  2491     // Access the start.  Does two things we need:
  2492     //   Pins 'start' to the length of the string, if it came in out-of-bounds.
  2493     //   Snaps 'start' to the beginning of a code point.
  2494     ucstrTextAccess(ut, start, TRUE);
  2495     const UChar *s=ut->chunkContents;
  2496     start32 = ut->chunkOffset;
  2498     int32_t strLength=(int32_t)ut->a;
  2499     if (strLength >= 0) {
  2500         limit32 = pinIndex(limit, strLength);
  2501     } else {
  2502         limit32 = pinIndex(limit, INT32_MAX);
  2504     di = 0;
  2505     for (si=start32; si<limit32; si++) {
  2506         if (strLength<0 && s[si]==0) {
  2507             // Just hit the end of a null-terminated string.
  2508             ut->a = si;               // set string length for this UText
  2509             ut->chunkNativeLimit    = si;
  2510             ut->chunkLength         = si;
  2511             ut->nativeIndexingLimit = si;
  2512             strLength               = si;
  2513             break;
  2515         U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */
  2516         if (di<destCapacity) {
  2517             // only store if there is space.
  2518             dest[di] = s[si];
  2519         } else {
  2520             if (strLength>=0) {
  2521                 // We have filled the destination buffer, and the string length is known.
  2522                 //  Cut the loop short.  There is no need to scan string termination.
  2523                 di = limit32 - start32;
  2524                 si = limit32;
  2525                 break;
  2528         di++;
  2531     // If the limit index points to a lead surrogate of a pair,
  2532     //   add the corresponding trail surrogate to the destination.
  2533     if (si>0 && U16_IS_LEAD(s[si-1]) &&
  2534         ((si<strLength || strLength<0)  && U16_IS_TRAIL(s[si])))
  2536         if (di<destCapacity) {
  2537             // store only if there is space in the output buffer.
  2538             dest[di++] = s[si++];
  2542     // Put iteration position at the point just following the extracted text
  2543     ut->chunkOffset = uprv_min(strLength, start32 + destCapacity);
  2545     // Add a terminating NUL if space in the buffer permits,
  2546     // and set the error status as required.
  2547     u_terminateUChars(dest, destCapacity, di, pErrorCode);
  2548     return di;
  2551 static const struct UTextFuncs ucstrFuncs = 
  2553     sizeof(UTextFuncs),
  2554     0, 0, 0,           // Reserved alignment padding
  2555     ucstrTextClone,
  2556     ucstrTextLength,
  2557     ucstrTextAccess,
  2558     ucstrTextExtract,
  2559     NULL,              // Replace
  2560     NULL,              // Copy
  2561     NULL,              // MapOffsetToNative,
  2562     NULL,              // MapIndexToUTF16,
  2563     ucstrTextClose,
  2564     NULL,              // spare 1
  2565     NULL,              // spare 2
  2566     NULL,              // spare 3
  2567 };
  2569 U_CDECL_END
  2571 static const UChar gEmptyUString[] = {0};
  2573 U_CAPI UText * U_EXPORT2
  2574 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) {
  2575     if (U_FAILURE(*status)) {
  2576         return NULL;
  2578     if(s==NULL && length==0) {
  2579         s = gEmptyUString;
  2581     if (s==NULL || length < -1 || length>INT32_MAX) {
  2582         *status = U_ILLEGAL_ARGUMENT_ERROR;
  2583         return NULL;
  2585     ut = utext_setup(ut, 0, status);
  2586     if (U_SUCCESS(*status)) {
  2587         ut->pFuncs               = &ucstrFuncs;
  2588         ut->context              = s;
  2589         ut->providerProperties   = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS);
  2590         if (length==-1) {
  2591             ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE);
  2593         ut->a                    = length;
  2594         ut->chunkContents        = s;
  2595         ut->chunkNativeStart     = 0;
  2596         ut->chunkNativeLimit     = length>=0? length : 0;
  2597         ut->chunkLength          = (int32_t)ut->chunkNativeLimit;
  2598         ut->chunkOffset          = 0;
  2599         ut->nativeIndexingLimit  = ut->chunkLength;
  2601     return ut;
  2605 //------------------------------------------------------------------------------
  2606 //
  2607 //     UText implementation for text from ICU CharacterIterators
  2608 //
  2609 //         Use of UText data members:
  2610 //            context    pointer to the CharacterIterator
  2611 //            a          length of the full text.
  2612 //            p          pointer to  buffer 1
  2613 //            b          start index of local buffer 1 contents
  2614 //            q          pointer to buffer 2
  2615 //            c          start index of local buffer 2 contents
  2616 //            r          pointer to the character iterator if the UText owns it.
  2617 //                       Null otherwise.
  2618 //
  2619 //------------------------------------------------------------------------------
  2620 #define CIBufSize 16
  2622 U_CDECL_BEGIN
  2623 static void U_CALLCONV
  2624 charIterTextClose(UText *ut) {
  2625     // Most of the work of close is done by the generic UText framework close.
  2626     // All that needs to be done here is delete the CharacterIterator if the UText
  2627     //  owns it.  This occurs if the UText was created by cloning.
  2628     CharacterIterator *ci = (CharacterIterator *)ut->r;
  2629     delete ci;
  2630     ut->r = NULL;
  2633 static int64_t U_CALLCONV
  2634 charIterTextLength(UText *ut) {
  2635     return (int32_t)ut->a;
  2638 static UBool U_CALLCONV
  2639 charIterTextAccess(UText *ut, int64_t index, UBool  forward) {
  2640     CharacterIterator *ci   = (CharacterIterator *)ut->context;
  2642     int32_t clippedIndex = (int32_t)index;
  2643     if (clippedIndex<0) {
  2644         clippedIndex=0;
  2645     } else if (clippedIndex>=ut->a) {
  2646         clippedIndex=(int32_t)ut->a;
  2648     int32_t neededIndex = clippedIndex;
  2649     if (!forward && neededIndex>0) {
  2650         // reverse iteration, want the position just before what was asked for.
  2651         neededIndex--;
  2652     } else if (forward && neededIndex==ut->a && neededIndex>0) {
  2653         // Forward iteration, don't ask for something past the end of the text.
  2654         neededIndex--;
  2657     // Find the native index of the start of the buffer containing what we want.
  2658     neededIndex -= neededIndex % CIBufSize;
  2660     UChar *buf = NULL;
  2661     UBool  needChunkSetup = TRUE;
  2662     int    i;
  2663     if (ut->chunkNativeStart == neededIndex) {
  2664         // The buffer we want is already the current chunk.
  2665         needChunkSetup = FALSE;
  2666     } else if (ut->b == neededIndex) {
  2667         // The first buffer (buffer p) has what we need.
  2668         buf = (UChar *)ut->p;
  2669     } else if (ut->c == neededIndex) {
  2670         // The second buffer (buffer q) has what we need.
  2671         buf = (UChar *)ut->q;
  2672     } else {
  2673         // Neither buffer already has what we need.
  2674         // Load new data from the character iterator.
  2675         // Use the buf that is not the current buffer.
  2676         buf = (UChar *)ut->p;
  2677         if (ut->p == ut->chunkContents) {
  2678             buf = (UChar *)ut->q;
  2680         ci->setIndex(neededIndex);
  2681         for (i=0; i<CIBufSize; i++) {
  2682             buf[i] = ci->nextPostInc();
  2683             if (i+neededIndex > ut->a) {
  2684                 break;
  2689     // We have a buffer with the data we need.
  2690     // Set it up as the current chunk, if it wasn't already.
  2691     if (needChunkSetup) {
  2692         ut->chunkContents = buf;
  2693         ut->chunkLength   = CIBufSize;
  2694         ut->chunkNativeStart = neededIndex;
  2695         ut->chunkNativeLimit = neededIndex + CIBufSize;
  2696         if (ut->chunkNativeLimit > ut->a) {
  2697             ut->chunkNativeLimit = ut->a;
  2698             ut->chunkLength  = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart);
  2700         ut->nativeIndexingLimit = ut->chunkLength;
  2701         U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize);
  2703     ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart;
  2704     UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0);
  2705     return success;
  2708 static UText * U_CALLCONV
  2709 charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) {
  2710     if (U_FAILURE(*status)) {
  2711         return NULL;
  2714     if (deep) {
  2715         // There is no CharacterIterator API for cloning the underlying text storage.
  2716         *status = U_UNSUPPORTED_ERROR;
  2717         return NULL;
  2718     } else {
  2719         CharacterIterator *srcCI =(CharacterIterator *)src->context;
  2720         srcCI = srcCI->clone();
  2721         dest = utext_openCharacterIterator(dest, srcCI, status);
  2722         // cast off const on getNativeIndex.
  2723         //   For CharacterIterator based UTexts, this is safe, the operation is const.
  2724         int64_t  ix = utext_getNativeIndex((UText *)src);
  2725         utext_setNativeIndex(dest, ix);
  2726         dest->r = srcCI;    // flags that this UText owns the CharacterIterator
  2728     return dest;
  2731 static int32_t U_CALLCONV
  2732 charIterTextExtract(UText *ut,
  2733                   int64_t start, int64_t limit,
  2734                   UChar *dest, int32_t destCapacity,
  2735                   UErrorCode *status)
  2737     if(U_FAILURE(*status)) {
  2738         return 0;
  2740     if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) {
  2741         *status=U_ILLEGAL_ARGUMENT_ERROR;
  2742         return 0;
  2744     int32_t  length  = (int32_t)ut->a;
  2745     int32_t  start32 = pinIndex(start, length);
  2746     int32_t  limit32 = pinIndex(limit, length);
  2747     int32_t  desti   = 0;
  2748     int32_t  srci;
  2749     int32_t  copyLimit;
  2751     CharacterIterator *ci = (CharacterIterator *)ut->context;
  2752     ci->setIndex32(start32);   // Moves ix to lead of surrogate pair, if needed.
  2753     srci = ci->getIndex();
  2754     copyLimit = srci;
  2755     while (srci<limit32) {
  2756         UChar32 c = ci->next32PostInc();
  2757         int32_t  len = U16_LENGTH(c);
  2758         U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */
  2759         if (desti+len <= destCapacity) {
  2760             U16_APPEND_UNSAFE(dest, desti, c);
  2761             copyLimit = srci+len;
  2762         } else {
  2763             desti += len;
  2764             *status = U_BUFFER_OVERFLOW_ERROR;
  2766         srci += len;
  2769     charIterTextAccess(ut, copyLimit, TRUE);
  2771     u_terminateUChars(dest, destCapacity, desti, status);
  2772     return desti;
  2775 static const struct UTextFuncs charIterFuncs = 
  2777     sizeof(UTextFuncs),
  2778     0, 0, 0,             // Reserved alignment padding
  2779     charIterTextClone,
  2780     charIterTextLength,
  2781     charIterTextAccess,
  2782     charIterTextExtract,
  2783     NULL,                // Replace
  2784     NULL,                // Copy
  2785     NULL,                // MapOffsetToNative,
  2786     NULL,                // MapIndexToUTF16,
  2787     charIterTextClose,
  2788     NULL,                // spare 1
  2789     NULL,                // spare 2
  2790     NULL                 // spare 3
  2791 };
  2792 U_CDECL_END
  2795 U_CAPI UText * U_EXPORT2
  2796 utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) {
  2797     if (U_FAILURE(*status)) {
  2798         return NULL;
  2801     if (ci->startIndex() > 0) {
  2802         // No support for CharacterIterators that do not start indexing from zero.
  2803         *status = U_UNSUPPORTED_ERROR;
  2804         return NULL;
  2807     // Extra space in UText for 2 buffers of CIBufSize UChars each.
  2808     int32_t  extraSpace = 2 * CIBufSize * sizeof(UChar);
  2809     ut = utext_setup(ut, extraSpace, status);
  2810     if (U_SUCCESS(*status)) {
  2811         ut->pFuncs                = &charIterFuncs;
  2812         ut->context              = ci;
  2813         ut->providerProperties   = 0;
  2814         ut->a                    = ci->endIndex();        // Length of text
  2815         ut->p                    = ut->pExtra;            // First buffer
  2816         ut->b                    = -1;                    // Native index of first buffer contents
  2817         ut->q                    = (UChar*)ut->pExtra+CIBufSize;  // Second buffer
  2818         ut->c                    = -1;                    // Native index of second buffer contents
  2820         // Initialize current chunk contents to be empty.
  2821         //   First access will fault something in.
  2822         //   Note:  The initial nativeStart and chunkOffset must sum to zero
  2823         //          so that getNativeIndex() will correctly compute to zero
  2824         //          if no call to Access() has ever been made.  They can't be both
  2825         //          zero without Access() thinking that the chunk is valid.
  2826         ut->chunkContents        = (UChar *)ut->p;
  2827         ut->chunkNativeStart     = -1;
  2828         ut->chunkOffset          = 1;
  2829         ut->chunkNativeLimit     = 0;
  2830         ut->chunkLength          = 0;
  2831         ut->nativeIndexingLimit  = ut->chunkOffset;  // enables native indexing
  2833     return ut;

mercurial