Wed, 31 Dec 2014 07:22:50 +0100
Correct previous dual key logic pending first delivery installment.
michael@0 | 1 | /* |
michael@0 | 2 | ******************************************************************************* |
michael@0 | 3 | * |
michael@0 | 4 | * Copyright (C) 2005-2012, International Business Machines |
michael@0 | 5 | * Corporation and others. All Rights Reserved. |
michael@0 | 6 | * |
michael@0 | 7 | ******************************************************************************* |
michael@0 | 8 | * file name: utext.cpp |
michael@0 | 9 | * encoding: US-ASCII |
michael@0 | 10 | * tab size: 8 (not used) |
michael@0 | 11 | * indentation:4 |
michael@0 | 12 | * |
michael@0 | 13 | * created on: 2005apr12 |
michael@0 | 14 | * created by: Markus W. Scherer |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | #include "unicode/ustring.h" |
michael@0 | 19 | #include "unicode/unistr.h" |
michael@0 | 20 | #include "unicode/chariter.h" |
michael@0 | 21 | #include "unicode/utext.h" |
michael@0 | 22 | #include "unicode/utf.h" |
michael@0 | 23 | #include "unicode/utf8.h" |
michael@0 | 24 | #include "unicode/utf16.h" |
michael@0 | 25 | #include "ustr_imp.h" |
michael@0 | 26 | #include "cmemory.h" |
michael@0 | 27 | #include "cstring.h" |
michael@0 | 28 | #include "uassert.h" |
michael@0 | 29 | #include "putilimp.h" |
michael@0 | 30 | |
michael@0 | 31 | U_NAMESPACE_USE |
michael@0 | 32 | |
// Produce an int32_t mask in which only bit number `bitIndex` is set.
#define I32_FLAG(bitIndex) (((int32_t)1) << (bitIndex))
michael@0 | 34 | |
michael@0 | 35 | |
michael@0 | 36 | static UBool |
michael@0 | 37 | utext_access(UText *ut, int64_t index, UBool forward) { |
michael@0 | 38 | return ut->pFuncs->access(ut, index, forward); |
michael@0 | 39 | } |
michael@0 | 40 | |
michael@0 | 41 | |
michael@0 | 42 | |
michael@0 | 43 | U_CAPI UBool U_EXPORT2 |
michael@0 | 44 | utext_moveIndex32(UText *ut, int32_t delta) { |
michael@0 | 45 | UChar32 c; |
michael@0 | 46 | if (delta > 0) { |
michael@0 | 47 | do { |
michael@0 | 48 | if(ut->chunkOffset>=ut->chunkLength && !utext_access(ut, ut->chunkNativeLimit, TRUE)) { |
michael@0 | 49 | return FALSE; |
michael@0 | 50 | } |
michael@0 | 51 | c = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 52 | if (U16_IS_SURROGATE(c)) { |
michael@0 | 53 | c = utext_next32(ut); |
michael@0 | 54 | if (c == U_SENTINEL) { |
michael@0 | 55 | return FALSE; |
michael@0 | 56 | } |
michael@0 | 57 | } else { |
michael@0 | 58 | ut->chunkOffset++; |
michael@0 | 59 | } |
michael@0 | 60 | } while(--delta>0); |
michael@0 | 61 | |
michael@0 | 62 | } else if (delta<0) { |
michael@0 | 63 | do { |
michael@0 | 64 | if(ut->chunkOffset<=0 && !utext_access(ut, ut->chunkNativeStart, FALSE)) { |
michael@0 | 65 | return FALSE; |
michael@0 | 66 | } |
michael@0 | 67 | c = ut->chunkContents[ut->chunkOffset-1]; |
michael@0 | 68 | if (U16_IS_SURROGATE(c)) { |
michael@0 | 69 | c = utext_previous32(ut); |
michael@0 | 70 | if (c == U_SENTINEL) { |
michael@0 | 71 | return FALSE; |
michael@0 | 72 | } |
michael@0 | 73 | } else { |
michael@0 | 74 | ut->chunkOffset--; |
michael@0 | 75 | } |
michael@0 | 76 | } while(++delta<0); |
michael@0 | 77 | } |
michael@0 | 78 | |
michael@0 | 79 | return TRUE; |
michael@0 | 80 | } |
michael@0 | 81 | |
michael@0 | 82 | |
michael@0 | 83 | U_CAPI int64_t U_EXPORT2 |
michael@0 | 84 | utext_nativeLength(UText *ut) { |
michael@0 | 85 | return ut->pFuncs->nativeLength(ut); |
michael@0 | 86 | } |
michael@0 | 87 | |
michael@0 | 88 | |
michael@0 | 89 | U_CAPI UBool U_EXPORT2 |
michael@0 | 90 | utext_isLengthExpensive(const UText *ut) { |
michael@0 | 91 | UBool r = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE)) != 0; |
michael@0 | 92 | return r; |
michael@0 | 93 | } |
michael@0 | 94 | |
michael@0 | 95 | |
michael@0 | 96 | U_CAPI int64_t U_EXPORT2 |
michael@0 | 97 | utext_getNativeIndex(const UText *ut) { |
michael@0 | 98 | if(ut->chunkOffset <= ut->nativeIndexingLimit) { |
michael@0 | 99 | return ut->chunkNativeStart+ut->chunkOffset; |
michael@0 | 100 | } else { |
michael@0 | 101 | return ut->pFuncs->mapOffsetToNative(ut); |
michael@0 | 102 | } |
michael@0 | 103 | } |
michael@0 | 104 | |
michael@0 | 105 | |
michael@0 | 106 | U_CAPI void U_EXPORT2 |
michael@0 | 107 | utext_setNativeIndex(UText *ut, int64_t index) { |
michael@0 | 108 | if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { |
michael@0 | 109 | // The desired position is outside of the current chunk. |
michael@0 | 110 | // Access the new position. Assume a forward iteration from here, |
michael@0 | 111 | // which will also be optimimum for a single random access. |
michael@0 | 112 | // Reverse iterations may suffer slightly. |
michael@0 | 113 | ut->pFuncs->access(ut, index, TRUE); |
michael@0 | 114 | } else if((int32_t)(index - ut->chunkNativeStart) <= ut->nativeIndexingLimit) { |
michael@0 | 115 | // utf-16 indexing. |
michael@0 | 116 | ut->chunkOffset=(int32_t)(index-ut->chunkNativeStart); |
michael@0 | 117 | } else { |
michael@0 | 118 | ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); |
michael@0 | 119 | } |
michael@0 | 120 | // The convention is that the index must always be on a code point boundary. |
michael@0 | 121 | // Adjust the index position if it is in the middle of a surrogate pair. |
michael@0 | 122 | if (ut->chunkOffset<ut->chunkLength) { |
michael@0 | 123 | UChar c= ut->chunkContents[ut->chunkOffset]; |
michael@0 | 124 | if (U16_IS_TRAIL(c)) { |
michael@0 | 125 | if (ut->chunkOffset==0) { |
michael@0 | 126 | ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE); |
michael@0 | 127 | } |
michael@0 | 128 | if (ut->chunkOffset>0) { |
michael@0 | 129 | UChar lead = ut->chunkContents[ut->chunkOffset-1]; |
michael@0 | 130 | if (U16_IS_LEAD(lead)) { |
michael@0 | 131 | ut->chunkOffset--; |
michael@0 | 132 | } |
michael@0 | 133 | } |
michael@0 | 134 | } |
michael@0 | 135 | } |
michael@0 | 136 | } |
michael@0 | 137 | |
michael@0 | 138 | |
michael@0 | 139 | |
michael@0 | 140 | U_CAPI int64_t U_EXPORT2 |
michael@0 | 141 | utext_getPreviousNativeIndex(UText *ut) { |
michael@0 | 142 | // |
michael@0 | 143 | // Fast-path the common case. |
michael@0 | 144 | // Common means current position is not at the beginning of a chunk |
michael@0 | 145 | // and the preceding character is not supplementary. |
michael@0 | 146 | // |
michael@0 | 147 | int32_t i = ut->chunkOffset - 1; |
michael@0 | 148 | int64_t result; |
michael@0 | 149 | if (i >= 0) { |
michael@0 | 150 | UChar c = ut->chunkContents[i]; |
michael@0 | 151 | if (U16_IS_TRAIL(c) == FALSE) { |
michael@0 | 152 | if (i <= ut->nativeIndexingLimit) { |
michael@0 | 153 | result = ut->chunkNativeStart + i; |
michael@0 | 154 | } else { |
michael@0 | 155 | ut->chunkOffset = i; |
michael@0 | 156 | result = ut->pFuncs->mapOffsetToNative(ut); |
michael@0 | 157 | ut->chunkOffset++; |
michael@0 | 158 | } |
michael@0 | 159 | return result; |
michael@0 | 160 | } |
michael@0 | 161 | } |
michael@0 | 162 | |
michael@0 | 163 | // If at the start of text, simply return 0. |
michael@0 | 164 | if (ut->chunkOffset==0 && ut->chunkNativeStart==0) { |
michael@0 | 165 | return 0; |
michael@0 | 166 | } |
michael@0 | 167 | |
michael@0 | 168 | // Harder, less common cases. We are at a chunk boundary, or on a surrogate. |
michael@0 | 169 | // Keep it simple, use other functions to handle the edges. |
michael@0 | 170 | // |
michael@0 | 171 | utext_previous32(ut); |
michael@0 | 172 | result = UTEXT_GETNATIVEINDEX(ut); |
michael@0 | 173 | utext_next32(ut); |
michael@0 | 174 | return result; |
michael@0 | 175 | } |
michael@0 | 176 | |
michael@0 | 177 | |
michael@0 | 178 | // |
michael@0 | 179 | // utext_current32. Get the UChar32 at the current position. |
michael@0 | 180 | // UText iteration position is always on a code point boundary, |
michael@0 | 181 | // never on the trail half of a surrogate pair. |
michael@0 | 182 | // |
michael@0 | 183 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 184 | utext_current32(UText *ut) { |
michael@0 | 185 | UChar32 c; |
michael@0 | 186 | if (ut->chunkOffset==ut->chunkLength) { |
michael@0 | 187 | // Current position is just off the end of the chunk. |
michael@0 | 188 | if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { |
michael@0 | 189 | // Off the end of the text. |
michael@0 | 190 | return U_SENTINEL; |
michael@0 | 191 | } |
michael@0 | 192 | } |
michael@0 | 193 | |
michael@0 | 194 | c = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 195 | if (U16_IS_LEAD(c) == FALSE) { |
michael@0 | 196 | // Normal, non-supplementary case. |
michael@0 | 197 | return c; |
michael@0 | 198 | } |
michael@0 | 199 | |
michael@0 | 200 | // |
michael@0 | 201 | // Possible supplementary char. |
michael@0 | 202 | // |
michael@0 | 203 | UChar32 trail = 0; |
michael@0 | 204 | UChar32 supplementaryC = c; |
michael@0 | 205 | if ((ut->chunkOffset+1) < ut->chunkLength) { |
michael@0 | 206 | // The trail surrogate is in the same chunk. |
michael@0 | 207 | trail = ut->chunkContents[ut->chunkOffset+1]; |
michael@0 | 208 | } else { |
michael@0 | 209 | // The trail surrogate is in a different chunk. |
michael@0 | 210 | // Because we must maintain the iteration position, we need to switch forward |
michael@0 | 211 | // into the new chunk, get the trail surrogate, then revert the chunk back to the |
michael@0 | 212 | // original one. |
michael@0 | 213 | // An edge case to be careful of: the entire text may end with an unpaired |
michael@0 | 214 | // leading surrogate. The attempt to access the trail will fail, but |
michael@0 | 215 | // the original position before the unpaired lead still needs to be restored. |
michael@0 | 216 | int64_t nativePosition = ut->chunkNativeLimit; |
michael@0 | 217 | int32_t originalOffset = ut->chunkOffset; |
michael@0 | 218 | if (ut->pFuncs->access(ut, nativePosition, TRUE)) { |
michael@0 | 219 | trail = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 220 | } |
michael@0 | 221 | UBool r = ut->pFuncs->access(ut, nativePosition, FALSE); // reverse iteration flag loads preceding chunk |
michael@0 | 222 | U_ASSERT(r==TRUE); |
michael@0 | 223 | ut->chunkOffset = originalOffset; |
michael@0 | 224 | if(!r) { |
michael@0 | 225 | return U_SENTINEL; |
michael@0 | 226 | } |
michael@0 | 227 | } |
michael@0 | 228 | |
michael@0 | 229 | if (U16_IS_TRAIL(trail)) { |
michael@0 | 230 | supplementaryC = U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 231 | } |
michael@0 | 232 | return supplementaryC; |
michael@0 | 233 | |
michael@0 | 234 | } |
michael@0 | 235 | |
michael@0 | 236 | |
michael@0 | 237 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 238 | utext_char32At(UText *ut, int64_t nativeIndex) { |
michael@0 | 239 | UChar32 c = U_SENTINEL; |
michael@0 | 240 | |
michael@0 | 241 | // Fast path the common case. |
michael@0 | 242 | if (nativeIndex>=ut->chunkNativeStart && nativeIndex < ut->chunkNativeStart + ut->nativeIndexingLimit) { |
michael@0 | 243 | ut->chunkOffset = (int32_t)(nativeIndex - ut->chunkNativeStart); |
michael@0 | 244 | c = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 245 | if (U16_IS_SURROGATE(c) == FALSE) { |
michael@0 | 246 | return c; |
michael@0 | 247 | } |
michael@0 | 248 | } |
michael@0 | 249 | |
michael@0 | 250 | |
michael@0 | 251 | utext_setNativeIndex(ut, nativeIndex); |
michael@0 | 252 | if (nativeIndex>=ut->chunkNativeStart && ut->chunkOffset<ut->chunkLength) { |
michael@0 | 253 | c = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 254 | if (U16_IS_SURROGATE(c)) { |
michael@0 | 255 | // For surrogates, let current32() deal with the complications |
michael@0 | 256 | // of supplementaries that may span chunk boundaries. |
michael@0 | 257 | c = utext_current32(ut); |
michael@0 | 258 | } |
michael@0 | 259 | } |
michael@0 | 260 | return c; |
michael@0 | 261 | } |
michael@0 | 262 | |
michael@0 | 263 | |
michael@0 | 264 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 265 | utext_next32(UText *ut) { |
michael@0 | 266 | UChar32 c; |
michael@0 | 267 | |
michael@0 | 268 | if (ut->chunkOffset >= ut->chunkLength) { |
michael@0 | 269 | if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { |
michael@0 | 270 | return U_SENTINEL; |
michael@0 | 271 | } |
michael@0 | 272 | } |
michael@0 | 273 | |
michael@0 | 274 | c = ut->chunkContents[ut->chunkOffset++]; |
michael@0 | 275 | if (U16_IS_LEAD(c) == FALSE) { |
michael@0 | 276 | // Normal case, not supplementary. |
michael@0 | 277 | // (A trail surrogate seen here is just returned as is, as a surrogate value. |
michael@0 | 278 | // It cannot be part of a pair.) |
michael@0 | 279 | return c; |
michael@0 | 280 | } |
michael@0 | 281 | |
michael@0 | 282 | if (ut->chunkOffset >= ut->chunkLength) { |
michael@0 | 283 | if (ut->pFuncs->access(ut, ut->chunkNativeLimit, TRUE) == FALSE) { |
michael@0 | 284 | // c is an unpaired lead surrogate at the end of the text. |
michael@0 | 285 | // return it as it is. |
michael@0 | 286 | return c; |
michael@0 | 287 | } |
michael@0 | 288 | } |
michael@0 | 289 | UChar32 trail = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 290 | if (U16_IS_TRAIL(trail) == FALSE) { |
michael@0 | 291 | // c was an unpaired lead surrogate, not at the end of the text. |
michael@0 | 292 | // return it as it is (unpaired). Iteration position is on the |
michael@0 | 293 | // following character, possibly in the next chunk, where the |
michael@0 | 294 | // trail surrogate would have been if it had existed. |
michael@0 | 295 | return c; |
michael@0 | 296 | } |
michael@0 | 297 | |
michael@0 | 298 | UChar32 supplementary = U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 299 | ut->chunkOffset++; // move iteration position over the trail surrogate. |
michael@0 | 300 | return supplementary; |
michael@0 | 301 | } |
michael@0 | 302 | |
michael@0 | 303 | |
michael@0 | 304 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 305 | utext_previous32(UText *ut) { |
michael@0 | 306 | UChar32 c; |
michael@0 | 307 | |
michael@0 | 308 | if (ut->chunkOffset <= 0) { |
michael@0 | 309 | if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { |
michael@0 | 310 | return U_SENTINEL; |
michael@0 | 311 | } |
michael@0 | 312 | } |
michael@0 | 313 | ut->chunkOffset--; |
michael@0 | 314 | c = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 315 | if (U16_IS_TRAIL(c) == FALSE) { |
michael@0 | 316 | // Normal case, not supplementary. |
michael@0 | 317 | // (A lead surrogate seen here is just returned as is, as a surrogate value. |
michael@0 | 318 | // It cannot be part of a pair.) |
michael@0 | 319 | return c; |
michael@0 | 320 | } |
michael@0 | 321 | |
michael@0 | 322 | if (ut->chunkOffset <= 0) { |
michael@0 | 323 | if (ut->pFuncs->access(ut, ut->chunkNativeStart, FALSE) == FALSE) { |
michael@0 | 324 | // c is an unpaired trail surrogate at the start of the text. |
michael@0 | 325 | // return it as it is. |
michael@0 | 326 | return c; |
michael@0 | 327 | } |
michael@0 | 328 | } |
michael@0 | 329 | |
michael@0 | 330 | UChar32 lead = ut->chunkContents[ut->chunkOffset-1]; |
michael@0 | 331 | if (U16_IS_LEAD(lead) == FALSE) { |
michael@0 | 332 | // c was an unpaired trail surrogate, not at the end of the text. |
michael@0 | 333 | // return it as it is (unpaired). Iteration position is at c |
michael@0 | 334 | return c; |
michael@0 | 335 | } |
michael@0 | 336 | |
michael@0 | 337 | UChar32 supplementary = U16_GET_SUPPLEMENTARY(lead, c); |
michael@0 | 338 | ut->chunkOffset--; // move iteration position over the lead surrogate. |
michael@0 | 339 | return supplementary; |
michael@0 | 340 | } |
michael@0 | 341 | |
michael@0 | 342 | |
michael@0 | 343 | |
michael@0 | 344 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 345 | utext_next32From(UText *ut, int64_t index) { |
michael@0 | 346 | UChar32 c = U_SENTINEL; |
michael@0 | 347 | |
michael@0 | 348 | if(index<ut->chunkNativeStart || index>=ut->chunkNativeLimit) { |
michael@0 | 349 | // Desired position is outside of the current chunk. |
michael@0 | 350 | if(!ut->pFuncs->access(ut, index, TRUE)) { |
michael@0 | 351 | // no chunk available here |
michael@0 | 352 | return U_SENTINEL; |
michael@0 | 353 | } |
michael@0 | 354 | } else if (index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { |
michael@0 | 355 | // Desired position is in chunk, with direct 1:1 native to UTF16 indexing |
michael@0 | 356 | ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); |
michael@0 | 357 | } else { |
michael@0 | 358 | // Desired position is in chunk, with non-UTF16 indexing. |
michael@0 | 359 | ut->chunkOffset = ut->pFuncs->mapNativeIndexToUTF16(ut, index); |
michael@0 | 360 | } |
michael@0 | 361 | |
michael@0 | 362 | c = ut->chunkContents[ut->chunkOffset++]; |
michael@0 | 363 | if (U16_IS_SURROGATE(c)) { |
michael@0 | 364 | // Surrogates. Many edge cases. Use other functions that already |
michael@0 | 365 | // deal with the problems. |
michael@0 | 366 | utext_setNativeIndex(ut, index); |
michael@0 | 367 | c = utext_next32(ut); |
michael@0 | 368 | } |
michael@0 | 369 | return c; |
michael@0 | 370 | } |
michael@0 | 371 | |
michael@0 | 372 | |
michael@0 | 373 | U_CAPI UChar32 U_EXPORT2 |
michael@0 | 374 | utext_previous32From(UText *ut, int64_t index) { |
michael@0 | 375 | // |
michael@0 | 376 | // Return the character preceding the specified index. |
michael@0 | 377 | // Leave the iteration position at the start of the character that was returned. |
michael@0 | 378 | // |
michael@0 | 379 | UChar32 cPrev; // The character preceding cCurr, which is what we will return. |
michael@0 | 380 | |
michael@0 | 381 | // Address the chunk containg the position preceding the incoming index |
michael@0 | 382 | // A tricky edge case: |
michael@0 | 383 | // We try to test the requested native index against the chunkNativeStart to determine |
michael@0 | 384 | // whether the character preceding the one at the index is in the current chunk. |
michael@0 | 385 | // BUT, this test can fail with UTF-8 (or any other multibyte encoding), when the |
michael@0 | 386 | // requested index is on something other than the first position of the first char. |
michael@0 | 387 | // |
michael@0 | 388 | if(index<=ut->chunkNativeStart || index>ut->chunkNativeLimit) { |
michael@0 | 389 | // Requested native index is outside of the current chunk. |
michael@0 | 390 | if(!ut->pFuncs->access(ut, index, FALSE)) { |
michael@0 | 391 | // no chunk available here |
michael@0 | 392 | return U_SENTINEL; |
michael@0 | 393 | } |
michael@0 | 394 | } else if(index - ut->chunkNativeStart <= (int64_t)ut->nativeIndexingLimit) { |
michael@0 | 395 | // Direct UTF-16 indexing. |
michael@0 | 396 | ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); |
michael@0 | 397 | } else { |
michael@0 | 398 | ut->chunkOffset=ut->pFuncs->mapNativeIndexToUTF16(ut, index); |
michael@0 | 399 | if (ut->chunkOffset==0 && !ut->pFuncs->access(ut, index, FALSE)) { |
michael@0 | 400 | // no chunk available here |
michael@0 | 401 | return U_SENTINEL; |
michael@0 | 402 | } |
michael@0 | 403 | } |
michael@0 | 404 | |
michael@0 | 405 | // |
michael@0 | 406 | // Simple case with no surrogates. |
michael@0 | 407 | // |
michael@0 | 408 | ut->chunkOffset--; |
michael@0 | 409 | cPrev = ut->chunkContents[ut->chunkOffset]; |
michael@0 | 410 | |
michael@0 | 411 | if (U16_IS_SURROGATE(cPrev)) { |
michael@0 | 412 | // Possible supplementary. Many edge cases. |
michael@0 | 413 | // Let other functions do the heavy lifting. |
michael@0 | 414 | utext_setNativeIndex(ut, index); |
michael@0 | 415 | cPrev = utext_previous32(ut); |
michael@0 | 416 | } |
michael@0 | 417 | return cPrev; |
michael@0 | 418 | } |
michael@0 | 419 | |
michael@0 | 420 | |
michael@0 | 421 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 422 | utext_extract(UText *ut, |
michael@0 | 423 | int64_t start, int64_t limit, |
michael@0 | 424 | UChar *dest, int32_t destCapacity, |
michael@0 | 425 | UErrorCode *status) { |
michael@0 | 426 | return ut->pFuncs->extract(ut, start, limit, dest, destCapacity, status); |
michael@0 | 427 | } |
michael@0 | 428 | |
michael@0 | 429 | |
michael@0 | 430 | |
michael@0 | 431 | U_CAPI UBool U_EXPORT2 |
michael@0 | 432 | utext_equals(const UText *a, const UText *b) { |
michael@0 | 433 | if (a==NULL || b==NULL || |
michael@0 | 434 | a->magic != UTEXT_MAGIC || |
michael@0 | 435 | b->magic != UTEXT_MAGIC) { |
michael@0 | 436 | // Null or invalid arguments don't compare equal to anything. |
michael@0 | 437 | return FALSE; |
michael@0 | 438 | } |
michael@0 | 439 | |
michael@0 | 440 | if (a->pFuncs != b->pFuncs) { |
michael@0 | 441 | // Different types of text providers. |
michael@0 | 442 | return FALSE; |
michael@0 | 443 | } |
michael@0 | 444 | |
michael@0 | 445 | if (a->context != b->context) { |
michael@0 | 446 | // Different sources (different strings) |
michael@0 | 447 | return FALSE; |
michael@0 | 448 | } |
michael@0 | 449 | if (utext_getNativeIndex(a) != utext_getNativeIndex(b)) { |
michael@0 | 450 | // Different current position in the string. |
michael@0 | 451 | return FALSE; |
michael@0 | 452 | } |
michael@0 | 453 | |
michael@0 | 454 | return TRUE; |
michael@0 | 455 | } |
michael@0 | 456 | |
michael@0 | 457 | U_CAPI UBool U_EXPORT2 |
michael@0 | 458 | utext_isWritable(const UText *ut) |
michael@0 | 459 | { |
michael@0 | 460 | UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) != 0; |
michael@0 | 461 | return b; |
michael@0 | 462 | } |
michael@0 | 463 | |
michael@0 | 464 | |
michael@0 | 465 | U_CAPI void U_EXPORT2 |
michael@0 | 466 | utext_freeze(UText *ut) { |
michael@0 | 467 | // Zero out the WRITABLE flag. |
michael@0 | 468 | ut->providerProperties &= ~(I32_FLAG(UTEXT_PROVIDER_WRITABLE)); |
michael@0 | 469 | } |
michael@0 | 470 | |
michael@0 | 471 | |
michael@0 | 472 | U_CAPI UBool U_EXPORT2 |
michael@0 | 473 | utext_hasMetaData(const UText *ut) |
michael@0 | 474 | { |
michael@0 | 475 | UBool b = (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA)) != 0; |
michael@0 | 476 | return b; |
michael@0 | 477 | } |
michael@0 | 478 | |
michael@0 | 479 | |
michael@0 | 480 | |
michael@0 | 481 | U_CAPI int32_t U_EXPORT2 |
michael@0 | 482 | utext_replace(UText *ut, |
michael@0 | 483 | int64_t nativeStart, int64_t nativeLimit, |
michael@0 | 484 | const UChar *replacementText, int32_t replacementLength, |
michael@0 | 485 | UErrorCode *status) |
michael@0 | 486 | { |
michael@0 | 487 | if (U_FAILURE(*status)) { |
michael@0 | 488 | return 0; |
michael@0 | 489 | } |
michael@0 | 490 | if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { |
michael@0 | 491 | *status = U_NO_WRITE_PERMISSION; |
michael@0 | 492 | return 0; |
michael@0 | 493 | } |
michael@0 | 494 | int32_t i = ut->pFuncs->replace(ut, nativeStart, nativeLimit, replacementText, replacementLength, status); |
michael@0 | 495 | return i; |
michael@0 | 496 | } |
michael@0 | 497 | |
michael@0 | 498 | U_CAPI void U_EXPORT2 |
michael@0 | 499 | utext_copy(UText *ut, |
michael@0 | 500 | int64_t nativeStart, int64_t nativeLimit, |
michael@0 | 501 | int64_t destIndex, |
michael@0 | 502 | UBool move, |
michael@0 | 503 | UErrorCode *status) |
michael@0 | 504 | { |
michael@0 | 505 | if (U_FAILURE(*status)) { |
michael@0 | 506 | return; |
michael@0 | 507 | } |
michael@0 | 508 | if ((ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_WRITABLE)) == 0) { |
michael@0 | 509 | *status = U_NO_WRITE_PERMISSION; |
michael@0 | 510 | return; |
michael@0 | 511 | } |
michael@0 | 512 | ut->pFuncs->copy(ut, nativeStart, nativeLimit, destIndex, move, status); |
michael@0 | 513 | } |
michael@0 | 514 | |
michael@0 | 515 | |
michael@0 | 516 | |
michael@0 | 517 | U_CAPI UText * U_EXPORT2 |
michael@0 | 518 | utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status) { |
michael@0 | 519 | UText *result; |
michael@0 | 520 | result = src->pFuncs->clone(dest, src, deep, status); |
michael@0 | 521 | if (readOnly) { |
michael@0 | 522 | utext_freeze(result); |
michael@0 | 523 | } |
michael@0 | 524 | return result; |
michael@0 | 525 | } |
michael@0 | 526 | |
michael@0 | 527 | |
michael@0 | 528 | |
michael@0 | 529 | //------------------------------------------------------------------------------ |
michael@0 | 530 | // |
michael@0 | 531 | // UText common functions implementation |
michael@0 | 532 | // |
michael@0 | 533 | //------------------------------------------------------------------------------ |
michael@0 | 534 | |
michael@0 | 535 | // |
michael@0 | 536 | // UText.flags bit definitions |
michael@0 | 537 | // |
enum {
    // Set when ICU allocated this UText struct on the heap;
    // clear when the caller provided the storage for the UText.
    UTEXT_HEAP_ALLOCATED       = 0x1,

    // Set when ICU allocated the extra storage as a separate heap block.
    // Clear when there is no separate allocation: either no extra storage
    // was requested, or it is appended to the end of the main UText storage.
    UTEXT_EXTRA_HEAP_ALLOCATED = 0x2,

    // Set while this UText is open; clear when it is not open.
    UTEXT_OPEN                 = 0x4
};
michael@0 | 551 | |
michael@0 | 552 | |
michael@0 | 553 | // |
michael@0 | 554 | // Extended form of a UText. The purpose is to aid in computing the total size required |
michael@0 | 555 | // when a provider asks for a UText to be allocated with extra storage. |
michael@0 | 556 | |
michael@0 | 557 | struct ExtendedUText { |
michael@0 | 558 | UText ut; |
michael@0 | 559 | UAlignedMemory extension; |
michael@0 | 560 | }; |
michael@0 | 561 | |
michael@0 | 562 | static const UText emptyText = UTEXT_INITIALIZER; |
michael@0 | 563 | |
michael@0 | 564 | U_CAPI UText * U_EXPORT2 |
michael@0 | 565 | utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status) { |
michael@0 | 566 | if (U_FAILURE(*status)) { |
michael@0 | 567 | return ut; |
michael@0 | 568 | } |
michael@0 | 569 | |
michael@0 | 570 | if (ut == NULL) { |
michael@0 | 571 | // We need to heap-allocate storage for the new UText |
michael@0 | 572 | int32_t spaceRequired = sizeof(UText); |
michael@0 | 573 | if (extraSpace > 0) { |
michael@0 | 574 | spaceRequired = sizeof(ExtendedUText) + extraSpace - sizeof(UAlignedMemory); |
michael@0 | 575 | } |
michael@0 | 576 | ut = (UText *)uprv_malloc(spaceRequired); |
michael@0 | 577 | if (ut == NULL) { |
michael@0 | 578 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 579 | return NULL; |
michael@0 | 580 | } else { |
michael@0 | 581 | *ut = emptyText; |
michael@0 | 582 | ut->flags |= UTEXT_HEAP_ALLOCATED; |
michael@0 | 583 | if (spaceRequired>0) { |
michael@0 | 584 | ut->extraSize = extraSpace; |
michael@0 | 585 | ut->pExtra = &((ExtendedUText *)ut)->extension; |
michael@0 | 586 | } |
michael@0 | 587 | } |
michael@0 | 588 | } else { |
michael@0 | 589 | // We have been supplied with an already existing UText. |
michael@0 | 590 | // Verify that it really appears to be a UText. |
michael@0 | 591 | if (ut->magic != UTEXT_MAGIC) { |
michael@0 | 592 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 593 | return ut; |
michael@0 | 594 | } |
michael@0 | 595 | // If the ut is already open and there's a provider supplied close |
michael@0 | 596 | // function, call it. |
michael@0 | 597 | if ((ut->flags & UTEXT_OPEN) && ut->pFuncs->close != NULL) { |
michael@0 | 598 | ut->pFuncs->close(ut); |
michael@0 | 599 | } |
michael@0 | 600 | ut->flags &= ~UTEXT_OPEN; |
michael@0 | 601 | |
michael@0 | 602 | // If extra space was requested by our caller, check whether |
michael@0 | 603 | // sufficient already exists, and allocate new if needed. |
michael@0 | 604 | if (extraSpace > ut->extraSize) { |
michael@0 | 605 | // Need more space. If there is existing separately allocated space, |
michael@0 | 606 | // delete it first, then allocate new space. |
michael@0 | 607 | if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { |
michael@0 | 608 | uprv_free(ut->pExtra); |
michael@0 | 609 | ut->extraSize = 0; |
michael@0 | 610 | } |
michael@0 | 611 | ut->pExtra = uprv_malloc(extraSpace); |
michael@0 | 612 | if (ut->pExtra == NULL) { |
michael@0 | 613 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 614 | } else { |
michael@0 | 615 | ut->extraSize = extraSpace; |
michael@0 | 616 | ut->flags |= UTEXT_EXTRA_HEAP_ALLOCATED; |
michael@0 | 617 | } |
michael@0 | 618 | } |
michael@0 | 619 | } |
michael@0 | 620 | if (U_SUCCESS(*status)) { |
michael@0 | 621 | ut->flags |= UTEXT_OPEN; |
michael@0 | 622 | |
michael@0 | 623 | // Initialize all remaining fields of the UText. |
michael@0 | 624 | // |
michael@0 | 625 | ut->context = NULL; |
michael@0 | 626 | ut->chunkContents = NULL; |
michael@0 | 627 | ut->p = NULL; |
michael@0 | 628 | ut->q = NULL; |
michael@0 | 629 | ut->r = NULL; |
michael@0 | 630 | ut->a = 0; |
michael@0 | 631 | ut->b = 0; |
michael@0 | 632 | ut->c = 0; |
michael@0 | 633 | ut->chunkOffset = 0; |
michael@0 | 634 | ut->chunkLength = 0; |
michael@0 | 635 | ut->chunkNativeStart = 0; |
michael@0 | 636 | ut->chunkNativeLimit = 0; |
michael@0 | 637 | ut->nativeIndexingLimit = 0; |
michael@0 | 638 | ut->providerProperties = 0; |
michael@0 | 639 | ut->privA = 0; |
michael@0 | 640 | ut->privB = 0; |
michael@0 | 641 | ut->privC = 0; |
michael@0 | 642 | ut->privP = NULL; |
michael@0 | 643 | if (ut->pExtra!=NULL && ut->extraSize>0) |
michael@0 | 644 | uprv_memset(ut->pExtra, 0, ut->extraSize); |
michael@0 | 645 | |
michael@0 | 646 | } |
michael@0 | 647 | return ut; |
michael@0 | 648 | } |
michael@0 | 649 | |
michael@0 | 650 | |
michael@0 | 651 | U_CAPI UText * U_EXPORT2 |
michael@0 | 652 | utext_close(UText *ut) { |
michael@0 | 653 | if (ut==NULL || |
michael@0 | 654 | ut->magic != UTEXT_MAGIC || |
michael@0 | 655 | (ut->flags & UTEXT_OPEN) == 0) |
michael@0 | 656 | { |
michael@0 | 657 | // The supplied ut is not an open UText. |
michael@0 | 658 | // Do nothing. |
michael@0 | 659 | return ut; |
michael@0 | 660 | } |
michael@0 | 661 | |
michael@0 | 662 | // If the provider gave us a close function, call it now. |
michael@0 | 663 | // This will clean up anything allocated specifically by the provider. |
michael@0 | 664 | if (ut->pFuncs->close != NULL) { |
michael@0 | 665 | ut->pFuncs->close(ut); |
michael@0 | 666 | } |
michael@0 | 667 | ut->flags &= ~UTEXT_OPEN; |
michael@0 | 668 | |
michael@0 | 669 | // If we (the framework) allocated the UText or subsidiary storage, |
michael@0 | 670 | // delete it. |
michael@0 | 671 | if (ut->flags & UTEXT_EXTRA_HEAP_ALLOCATED) { |
michael@0 | 672 | uprv_free(ut->pExtra); |
michael@0 | 673 | ut->pExtra = NULL; |
michael@0 | 674 | ut->flags &= ~UTEXT_EXTRA_HEAP_ALLOCATED; |
michael@0 | 675 | ut->extraSize = 0; |
michael@0 | 676 | } |
michael@0 | 677 | |
michael@0 | 678 | // Zero out function table of the closed UText. This is a defensive move, |
michael@0 | 679 | // inteded to cause applications that inadvertantly use a closed |
michael@0 | 680 | // utext to crash with null pointer errors. |
michael@0 | 681 | ut->pFuncs = NULL; |
michael@0 | 682 | |
michael@0 | 683 | if (ut->flags & UTEXT_HEAP_ALLOCATED) { |
michael@0 | 684 | // This UText was allocated by UText setup. We need to free it. |
michael@0 | 685 | // Clear magic, so we can detect if the user messes up and immediately |
michael@0 | 686 | // tries to reopen another UText using the deleted storage. |
michael@0 | 687 | ut->magic = 0; |
michael@0 | 688 | uprv_free(ut); |
michael@0 | 689 | ut = NULL; |
michael@0 | 690 | } |
michael@0 | 691 | return ut; |
michael@0 | 692 | } |
michael@0 | 693 | |
michael@0 | 694 | |
michael@0 | 695 | |
michael@0 | 696 | |
michael@0 | 697 | // |
michael@0 | 698 | // invalidateChunk Reset a chunk to have no contents, so that the next call |
michael@0 | 699 | // to access will cause new data to load. |
michael@0 | 700 | // This is needed when copy/move/replace operate directly on the |
michael@0 | 701 | // backing text, potentially putting it out of sync with the |
michael@0 | 702 | // contents in the chunk. |
michael@0 | 703 | // Zeroes the chunk length, offset, native start/limit and nativeIndexingLimit. |
michael@0 | 704 | static void |
michael@0 | 705 | invalidateChunk(UText *ut) { |
michael@0 | 706 | ut->chunkLength = 0; |
michael@0 | 707 | ut->chunkNativeLimit = 0; |
michael@0 | 708 | ut->chunkNativeStart = 0; |
michael@0 | 709 | ut->chunkOffset = 0; |
michael@0 | 710 | ut->nativeIndexingLimit = 0; |
michael@0 | 711 | } |
michael@0 | 712 | |
michael@0 | 713 | // |
michael@0 | 714 | // pinIndex Pin a native index parameter to the range [0, limit]. |
michael@0 | 715 | // 64 bit pinning is done in place, through the reference parameter. |
michael@0 | 716 | // The pinned value truncated to 32 bits is returned as a convenience for |
michael@0 | 717 | // use in providers that don't need 64 bits. |
michael@0 | 718 | static int32_t |
michael@0 | 719 | pinIndex(int64_t &index, int64_t limit) { |
michael@0 | 720 | if (index<0) { |
michael@0 | 721 | index = 0; |
michael@0 | 722 | } else if (index > limit) { |
michael@0 | 723 | index = limit; |
michael@0 | 724 | } |
michael@0 | 725 | return (int32_t)index; |
michael@0 | 726 | } |
michael@0 | 727 | |
michael@0 | 728 | |
michael@0 | 729 | U_CDECL_BEGIN |
michael@0 | 730 | |
michael@0 | 731 | // |
michael@0 | 732 | // Pointer relocation function, |
michael@0 | 733 | // a utility used by shallow clone. |
michael@0 | 734 | // Adjust a pointer that refers to something within one UText (the source) |
michael@0 | 735 | // to refer to the same relative offset within another UText (the target). |
michael@0 | 736 | // A pointer into neither the source's pExtra storage nor the source struct is left unchanged. |
michael@0 | 737 | static void adjustPointer(UText *dest, const void **destPtr, const UText *src) { |
michael@0 | 738 | // convert all pointers to (char *) so that byte address arithmetic will work. |
michael@0 | 739 | char *dptr = (char *)*destPtr; |
michael@0 | 740 | char *dUText = (char *)dest; |
michael@0 | 741 | char *sUText = (char *)src; |
michael@0 | 742 | |
michael@0 | 743 | if (dptr >= (char *)src->pExtra && dptr < ((char*)src->pExtra)+src->extraSize) { |
michael@0 | 744 | // The pointer referred to something within the src UText's pExtra storage. |
michael@0 | 745 | // Relocate it to the same offset within the target UText's pExtra region. |
michael@0 | 746 | *destPtr = ((char *)dest->pExtra) + (dptr - (char *)src->pExtra); |
michael@0 | 747 | } else if (dptr>=sUText && dptr < sUText+src->sizeOfStruct) { |
michael@0 | 748 | // The pointer referred to somewhere within the source UText struct itself. |
michael@0 | 749 | // Move it to the same offset within the target UText. |
michael@0 | 750 | *destPtr = dUText + (dptr-sUText); |
michael@0 | 751 | } |
michael@0 | 752 | } |
michael@0 | 753 | |
michael@0 | 754 | |
michael@0 | 755 | // |
michael@0 | 756 | // Clone. This is a generic copy-the-utext-by-value clone function that can be |
michael@0 | 757 | // used as-is with some utext types, and as a helper by other clones. |
michael@0 | 758 | // Returns NULL if status is already a failure on entry; otherwise returns the UText from utext_setup. |
michael@0 | 759 | static UText * U_CALLCONV |
michael@0 | 760 | shallowTextClone(UText * dest, const UText * src, UErrorCode * status) { |
michael@0 | 761 | if (U_FAILURE(*status)) { |
michael@0 | 762 | return NULL; |
michael@0 | 763 | } |
michael@0 | 764 | int32_t srcExtraSize = src->extraSize; |
michael@0 | 765 | |
michael@0 | 766 | // |
michael@0 | 767 | // Use the generic utext_setup to allocate storage if required. |
michael@0 | 768 | // |
michael@0 | 769 | dest = utext_setup(dest, srcExtraSize, status); |
michael@0 | 770 | if (U_FAILURE(*status)) { |
michael@0 | 771 | return dest; |
michael@0 | 772 | } |
michael@0 | 773 | |
michael@0 | 774 | // |
michael@0 | 775 | // flags (how the UText was allocated) and the pointer to the |
michael@0 | 776 | // extra storage must retain the values in the cloned utext that |
michael@0 | 777 | // were set up by utext_setup. Save them separately before |
michael@0 | 778 | // copying the whole struct. |
michael@0 | 779 | // |
michael@0 | 780 | void *destExtra = dest->pExtra; |
michael@0 | 781 | int32_t flags = dest->flags; |
michael@0 | 782 | |
michael@0 | 783 | |
michael@0 | 784 | // |
michael@0 | 785 | // Copy the UText struct by value, limited to the smaller of the two sizeOfStruct values. |
michael@0 | 786 | // Any "Extra" storage is copied also. |
michael@0 | 787 | // |
michael@0 | 788 | int sizeToCopy = src->sizeOfStruct; |
michael@0 | 789 | if (sizeToCopy > dest->sizeOfStruct) { |
michael@0 | 790 | sizeToCopy = dest->sizeOfStruct; |
michael@0 | 791 | } |
michael@0 | 792 | uprv_memcpy(dest, src, sizeToCopy); |
michael@0 | 793 | dest->pExtra = destExtra; |
michael@0 | 794 | dest->flags = flags; |
michael@0 | 795 | if (srcExtraSize > 0) { |
michael@0 | 796 | uprv_memcpy(dest->pExtra, src->pExtra, srcExtraSize); |
michael@0 | 797 | } |
michael@0 | 798 | |
michael@0 | 799 | // |
michael@0 | 800 | // Relocate any pointers in the target that refer to the UText itself |
michael@0 | 801 | // to point to the cloned copy rather than the original source. |
michael@0 | 802 | // |
michael@0 | 803 | adjustPointer(dest, &dest->context, src); |
michael@0 | 804 | adjustPointer(dest, &dest->p, src); |
michael@0 | 805 | adjustPointer(dest, &dest->q, src); |
michael@0 | 806 | adjustPointer(dest, &dest->r, src); |
michael@0 | 807 | adjustPointer(dest, (const void **)&dest->chunkContents, src); |
michael@0 | 808 | |
michael@0 | 809 | return dest; |
michael@0 | 810 | } |
michael@0 | 811 | |
michael@0 | 812 | |
michael@0 | 813 | U_CDECL_END |
michael@0 | 814 | |
michael@0 | 815 | |
michael@0 | 816 | |
michael@0 | 817 | //------------------------------------------------------------------------------ |
michael@0 | 818 | // |
michael@0 | 819 | // UText implementation for UTF-8 char * strings (read-only) |
michael@0 | 820 | // Limitation: string length must be <= 0x7fffffff. |
michael@0 | 821 | // (length must fit in an int32_t variable) |
michael@0 | 822 | // |
michael@0 | 823 | // Use of UText data members: |
michael@0 | 824 | // context pointer to UTF-8 string |
michael@0 | 825 | // utext.b is the input string length (bytes). |
michael@0 | 826 | // utext.c Length scanned so far in string |
michael@0 | 827 | // (for optimizing finding length of zero terminated strings.) |
michael@0 | 828 | // utext.p pointer to the current buffer |
michael@0 | 829 | // utext.q pointer to the other buffer. |
michael@0 | 830 | // |
michael@0 | 831 | //------------------------------------------------------------------------------ |
michael@0 | 832 | |
michael@0 | 833 | // Chunk size. |
michael@0 | 834 | // Must be less than 85, because of byte mapping from UChar indexes to native indexes. |
michael@0 | 835 | // Worst case is three native bytes to one UChar. (Supplementaries are 4 native bytes |
michael@0 | 836 | // to two UChars.) |
michael@0 | 837 | // (The map arrays in UTF8Buf hold these offsets as uint8_t values.) |
michael@0 | 838 | enum { UTF8_TEXT_CHUNK_SIZE=32 }; |
michael@0 | 839 | |
michael@0 | 840 | // |
michael@0 | 841 | // UTF8Buf Two of these structs will be set up in the UText's extra allocated space. |
michael@0 | 842 | // Each contains the UChar chunk buffer, the to and from native maps, and |
michael@0 | 843 | // header info. |
michael@0 | 844 | // |
michael@0 | 845 | // Because backwards iteration fills the buffers starting at the end and |
michael@0 | 846 | // working towards the front, the filled part of the buffers may not begin |
michael@0 | 847 | // at the start of the available storage for the buffers. |
michael@0 | 848 | // |
michael@0 | 849 | // The UChar buffer is declared larger than UTF8_TEXT_CHUNK_SIZE (see buf[] below) to allow for |
michael@0 | 850 | // the last character added being a supplementary, and thus requiring a surrogate |
michael@0 | 851 | // pair. Doing this is simpler than checking for the edge case. |
michael@0 | 852 | // |
michael@0 | 853 | |
michael@0 | 854 | struct UTF8Buf { |
michael@0 | 855 | int32_t bufNativeStart; // Native index of first char in UChar buf |
michael@0 | 856 | int32_t bufNativeLimit; // Native index following last char in buf. |
michael@0 | 857 | int32_t bufStartIdx; // First filled position in buf. |
michael@0 | 858 | int32_t bufLimitIdx; // Limit of filled range in buf. |
michael@0 | 859 | int32_t bufNILimit; // Limit of native indexing part of buf |
michael@0 | 860 | int32_t toUCharsMapStart; // Native index corresponding to |
michael@0 | 861 | // mapToUChars[0]. |
michael@0 | 862 | // Set to bufNativeStart when filling forwards. |
michael@0 | 863 | // Set to computed value when filling backwards. |
michael@0 | 864 | |
michael@0 | 865 | UChar buf[UTF8_TEXT_CHUNK_SIZE+4]; // The UChar buffer. Requires one extra position beyond |
michael@0 | 866 | // the chunk size, to allow for a surrogate at the end. |
michael@0 | 867 | // Length must be identical to mapToNative array, below, |
michael@0 | 868 | // because of the way indexing works when the array is |
michael@0 | 869 | // filled backwards during a reverse iteration. Thus, |
michael@0 | 870 | // the additional extra size. |
michael@0 | 871 | uint8_t mapToNative[UTF8_TEXT_CHUNK_SIZE+4]; // map UChar index in buf to |
michael@0 | 872 | // native offset from bufNativeStart. |
michael@0 | 873 | // Requires two extra slots, |
michael@0 | 874 | // one for a supplementary starting in the last normal position, |
michael@0 | 875 | // and one for an entry for the buffer limit position. |
michael@0 | 876 | uint8_t mapToUChars[UTF8_TEXT_CHUNK_SIZE*3+6]; // Map native offset from toUCharsMapStart to |
michael@0 | 877 | // corresponding offset in filled part of buf. |
michael@0 | 878 | int32_t align; |
michael@0 | 879 | }; |
michael@0 | 880 | |
michael@0 | 881 | U_CDECL_BEGIN |
michael@0 | 882 | |
michael@0 | 883 | // |
michael@0 | 884 | // utf8TextLength |
michael@0 | 885 | // |
michael@0 | 886 | // Get the length of the string. If we don't already know it, |
michael@0 | 887 | // we'll need to scan for the trailing nul. |
michael@0 | 888 | // A scanned length is cached in ut->b, and the LENGTH_IS_EXPENSIVE provider property is cleared. |
michael@0 | 889 | static int64_t U_CALLCONV |
michael@0 | 890 | utf8TextLength(UText *ut) { |
michael@0 | 891 | if (ut->b < 0) { |
michael@0 | 892 | // Zero terminated string, and we haven't scanned to the end yet. |
michael@0 | 893 | // Scan it now, resuming from the offset already scanned (ut->c). |
michael@0 | 894 | const char *r = (const char *)ut->context + ut->c; |
michael@0 | 895 | while (*r != 0) { |
michael@0 | 896 | r++; |
michael@0 | 897 | } |
michael@0 | 898 | if ((r - (const char *)ut->context) < 0x7fffffff) { |
michael@0 | 899 | ut->b = (int32_t)(r - (const char *)ut->context); |
michael@0 | 900 | } else { |
michael@0 | 901 | // Actual string was bigger (more than 2 gig) than we |
michael@0 | 902 | // can handle. Clip it to 0x7fffffff bytes. |
michael@0 | 903 | ut->b = 0x7fffffff; |
michael@0 | 904 | } |
michael@0 | 905 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 906 | } |
michael@0 | 907 | return ut->b; |
michael@0 | 908 | } |
michael@0 | 909 | |
michael@0 | 910 | |
michael@0 | 911 | |
michael@0 | 912 | |
michael@0 | 913 | |
michael@0 | 914 | |
michael@0 | 915 | static UBool U_CALLCONV |
michael@0 | 916 | utf8TextAccess(UText *ut, int64_t index, UBool forward) { |
michael@0 | 917 | // Position the chunk for an access at native index 'index', iterating in the direction given by 'forward'. |
michael@0 | 918 | // Apologies to those who are allergic to goto statements. |
michael@0 | 919 | // Consider each goto to a labelled block to be the equivalent of |
michael@0 | 920 | // calling the named block as if it were a function(); |
michael@0 | 921 | // return; |
michael@0 | 922 | // Returns FALSE on the paths that cannot set up iteration in the requested direction (e.g. at string start or end). |
michael@0 | 923 | const uint8_t *s8=(const uint8_t *)ut->context; |
michael@0 | 924 | UTF8Buf *u8b = NULL; |
michael@0 | 925 | int32_t length = ut->b; // Length of original utf-8 |
michael@0 | 926 | int32_t ix= (int32_t)index; // Requested index, trimmed to 32 bits. |
michael@0 | 927 | int32_t mapIndex = 0; |
michael@0 | 928 | if (index<0) { |
michael@0 | 929 | ix=0; |
michael@0 | 930 | } else if (index > 0x7fffffff) { |
michael@0 | 931 | // Strings with 64 bit lengths not supported by this UTF-8 provider. |
michael@0 | 932 | ix = 0x7fffffff; |
michael@0 | 933 | } |
michael@0 | 934 | |
michael@0 | 935 | // Pin requested index to the string length. |
michael@0 | 936 | if (ix>length) { |
michael@0 | 937 | if (length>=0) { |
michael@0 | 938 | ix=length; |
michael@0 | 939 | } else if (ix>=ut->c) { |
michael@0 | 940 | // Zero terminated string, and requested index is beyond |
michael@0 | 941 | // the region that has already been scanned. |
michael@0 | 942 | // Scan up to either the end of the string or to the |
michael@0 | 943 | // requested position, whichever comes first. |
michael@0 | 944 | while (ut->c<ix && s8[ut->c]!=0) { |
michael@0 | 945 | ut->c++; |
michael@0 | 946 | } |
michael@0 | 947 | // TODO: support for null terminated string length > 32 bits. |
michael@0 | 948 | if (s8[ut->c] == 0) { |
michael@0 | 949 | // We just found the actual length of the string. |
michael@0 | 950 | // Trim the requested index back to that. |
michael@0 | 951 | ix = ut->c; |
michael@0 | 952 | ut->b = ut->c; |
michael@0 | 953 | length = ut->c; |
michael@0 | 954 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 955 | } |
michael@0 | 956 | } |
michael@0 | 957 | } |
michael@0 | 958 | |
michael@0 | 959 | // |
michael@0 | 960 | // Dispatch to the appropriate action for a forward iteration request. |
michael@0 | 961 | // |
michael@0 | 962 | if (forward) { |
michael@0 | 963 | if (ix==ut->chunkNativeLimit) { |
michael@0 | 964 | // Check for normal sequential iteration cases first. |
michael@0 | 965 | if (ix==length) { |
michael@0 | 966 | // Just reached end of string |
michael@0 | 967 | // Don't swap buffers, but do set the |
michael@0 | 968 | // current buffer position. |
michael@0 | 969 | ut->chunkOffset = ut->chunkLength; |
michael@0 | 970 | return FALSE; |
michael@0 | 971 | } else { |
michael@0 | 972 | // End of current buffer. |
michael@0 | 973 | // check whether other buffer already has what we need. |
michael@0 | 974 | UTF8Buf *altB = (UTF8Buf *)ut->q; |
michael@0 | 975 | if (ix>=altB->bufNativeStart && ix<altB->bufNativeLimit) { |
michael@0 | 976 | goto swapBuffers; |
michael@0 | 977 | } |
michael@0 | 978 | } |
michael@0 | 979 | } |
michael@0 | 980 | |
michael@0 | 981 | // A random access. Desired index could be in either or neither buf. |
michael@0 | 982 | // For optimizing the order of testing, first check for the index |
michael@0 | 983 | // being in the other buffer. This will be the case for uses that |
michael@0 | 984 | // move back and forth over a fairly limited range |
michael@0 | 985 | { |
michael@0 | 986 | u8b = (UTF8Buf *)ut->q; // the alternate buffer |
michael@0 | 987 | if (ix>=u8b->bufNativeStart && ix<u8b->bufNativeLimit) { |
michael@0 | 988 | // Requested index is in the other buffer. |
michael@0 | 989 | goto swapBuffers; |
michael@0 | 990 | } |
michael@0 | 991 | if (ix == length) { |
michael@0 | 992 | // Requested index is end-of-string. |
michael@0 | 993 | // (this is the case of randomly seeking to the end. |
michael@0 | 994 | // The case of iterating off the end is handled earlier.) |
michael@0 | 995 | if (ix == ut->chunkNativeLimit) { |
michael@0 | 996 | // Current buffer extends up to the end of the string. |
michael@0 | 997 | // Leave it as the current buffer. |
michael@0 | 998 | ut->chunkOffset = ut->chunkLength; |
michael@0 | 999 | return FALSE; |
michael@0 | 1000 | } |
michael@0 | 1001 | if (ix == u8b->bufNativeLimit) { |
michael@0 | 1002 | // Alternate buffer extends to the end of string. |
michael@0 | 1003 | // Swap it in as the current buffer. |
michael@0 | 1004 | goto swapBuffersAndFail; |
michael@0 | 1005 | } |
michael@0 | 1006 | |
michael@0 | 1007 | // Neither existing buffer extends to the end of the string. |
michael@0 | 1008 | goto makeStubBuffer; |
michael@0 | 1009 | } |
michael@0 | 1010 | |
michael@0 | 1011 | if (ix<ut->chunkNativeStart || ix>=ut->chunkNativeLimit) { |
michael@0 | 1012 | // Requested index is in neither buffer. |
michael@0 | 1013 | goto fillForward; |
michael@0 | 1014 | } |
michael@0 | 1015 | |
michael@0 | 1016 | // Requested index is in this buffer. |
michael@0 | 1017 | u8b = (UTF8Buf *)ut->p; // the current buffer |
michael@0 | 1018 | mapIndex = ix - u8b->toUCharsMapStart; |
michael@0 | 1019 | ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; |
michael@0 | 1020 | return TRUE; |
michael@0 | 1021 | |
michael@0 | 1022 | } |
michael@0 | 1023 | } |
michael@0 | 1024 | |
michael@0 | 1025 | |
michael@0 | 1026 | // |
michael@0 | 1027 | // Dispatch to the appropriate action for a |
michael@0 | 1028 | // Backwards Direction iteration request. |
michael@0 | 1029 | // |
michael@0 | 1030 | if (ix==ut->chunkNativeStart) { |
michael@0 | 1031 | // Check for normal sequential iteration cases first. |
michael@0 | 1032 | if (ix==0) { |
michael@0 | 1033 | // Just reached the start of string |
michael@0 | 1034 | // Don't swap buffers, but do set the |
michael@0 | 1035 | // current buffer position. |
michael@0 | 1036 | ut->chunkOffset = 0; |
michael@0 | 1037 | return FALSE; |
michael@0 | 1038 | } else { |
michael@0 | 1039 | // Start of current buffer. |
michael@0 | 1040 | // check whether other buffer already has what we need. |
michael@0 | 1041 | UTF8Buf *altB = (UTF8Buf *)ut->q; |
michael@0 | 1042 | if (ix>altB->bufNativeStart && ix<=altB->bufNativeLimit) { |
michael@0 | 1043 | goto swapBuffers; |
michael@0 | 1044 | } |
michael@0 | 1045 | } |
michael@0 | 1046 | } |
michael@0 | 1047 | |
michael@0 | 1048 | // A random access. Desired index could be in either or neither buf. |
michael@0 | 1049 | // For optimizing the order of testing, |
michael@0 | 1050 | // Most likely case: in the other buffer. |
michael@0 | 1051 | // Second most likely: in neither buffer. |
michael@0 | 1052 | // Unlikely, but must work: in the current buffer. |
michael@0 | 1053 | u8b = (UTF8Buf *)ut->q; // the alternate buffer |
michael@0 | 1054 | if (ix>u8b->bufNativeStart && ix<=u8b->bufNativeLimit) { |
michael@0 | 1055 | // Requested index is in the other buffer. |
michael@0 | 1056 | goto swapBuffers; |
michael@0 | 1057 | } |
michael@0 | 1058 | // Requested index is start-of-string. |
michael@0 | 1059 | // (this is the case of randomly seeking to the start. |
michael@0 | 1060 | // The case of iterating off the start is handled earlier.) |
michael@0 | 1061 | if (ix==0) { |
michael@0 | 1062 | if (u8b->bufNativeStart==0) { |
michael@0 | 1063 | // Alternate buffer contains the data for the start string. |
michael@0 | 1064 | // Make it be the current buffer. |
michael@0 | 1065 | goto swapBuffersAndFail; |
michael@0 | 1066 | } else { |
michael@0 | 1067 | // Request for data before the start of string, |
michael@0 | 1068 | // neither buffer is usable. |
michael@0 | 1069 | // set up a zero-length buffer. |
michael@0 | 1070 | goto makeStubBuffer; |
michael@0 | 1071 | } |
michael@0 | 1072 | } |
michael@0 | 1073 | |
michael@0 | 1074 | if (ix<=ut->chunkNativeStart || ix>ut->chunkNativeLimit) { |
michael@0 | 1075 | // Requested index is in neither buffer. |
michael@0 | 1076 | goto fillReverse; |
michael@0 | 1077 | } |
michael@0 | 1078 | |
michael@0 | 1079 | // Requested index is in this buffer. |
michael@0 | 1080 | // Set the utf16 buffer index. |
michael@0 | 1081 | u8b = (UTF8Buf *)ut->p; |
michael@0 | 1082 | mapIndex = ix - u8b->toUCharsMapStart; |
michael@0 | 1083 | ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; |
michael@0 | 1084 | if (ut->chunkOffset==0) { |
michael@0 | 1085 | // This occurs when the first character in the text is |
michael@0 | 1086 | // a multi-byte UTF-8 char, and the requested index is to |
michael@0 | 1087 | // one of the trailing bytes. Because there is no preceding |
michael@0 | 1088 | // character, this access fails. We can't pick up on the |
michael@0 | 1089 | // situation sooner because the requested index is not zero. |
michael@0 | 1090 | return FALSE; |
michael@0 | 1091 | } else { |
michael@0 | 1092 | return TRUE; |
michael@0 | 1093 | } |
michael@0 | 1094 | |
michael@0 | 1095 | |
michael@0 | 1096 | |
michael@0 | 1097 | swapBuffers: |
michael@0 | 1098 | // The alternate buffer (ut->q) has the string data that was requested. |
michael@0 | 1099 | // Swap the primary and alternate buffers, and set the |
michael@0 | 1100 | // chunk index into the new primary buffer. |
michael@0 | 1101 | { |
michael@0 | 1102 | u8b = (UTF8Buf *)ut->q; |
michael@0 | 1103 | ut->q = ut->p; |
michael@0 | 1104 | ut->p = u8b; |
michael@0 | 1105 | ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; |
michael@0 | 1106 | ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; |
michael@0 | 1107 | ut->chunkNativeStart = u8b->bufNativeStart; |
michael@0 | 1108 | ut->chunkNativeLimit = u8b->bufNativeLimit; |
michael@0 | 1109 | ut->nativeIndexingLimit = u8b->bufNILimit; |
michael@0 | 1110 | |
michael@0 | 1111 | // Index into the (now current) chunk |
michael@0 | 1112 | // Use the map to set the chunk index. It's more trouble than it's worth |
michael@0 | 1113 | // to check whether native indexing can be used. |
michael@0 | 1114 | U_ASSERT(ix>=u8b->bufNativeStart); |
michael@0 | 1115 | U_ASSERT(ix<=u8b->bufNativeLimit); |
michael@0 | 1116 | mapIndex = ix - u8b->toUCharsMapStart; |
michael@0 | 1117 | U_ASSERT(mapIndex>=0); |
michael@0 | 1118 | U_ASSERT(mapIndex<(int32_t)sizeof(u8b->mapToUChars)); |
michael@0 | 1119 | ut->chunkOffset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; |
michael@0 | 1120 | |
michael@0 | 1121 | return TRUE; |
michael@0 | 1122 | } |
michael@0 | 1123 | |
michael@0 | 1124 | |
michael@0 | 1125 | swapBuffersAndFail: |
michael@0 | 1126 | // We got a request for either the start or end of the string, |
michael@0 | 1127 | // with iteration continuing in the out-of-bounds direction. |
michael@0 | 1128 | // The alternate buffer already contains the data up to the |
michael@0 | 1129 | // start/end. |
michael@0 | 1130 | // Swap the buffers, then return failure, indicating that we couldn't |
michael@0 | 1131 | // make things correct for continuing the iteration in the requested |
michael@0 | 1132 | // direction. The position & buffer are correct should the |
michael@0 | 1133 | // user decide to iterate in the opposite direction. |
michael@0 | 1134 | u8b = (UTF8Buf *)ut->q; |
michael@0 | 1135 | ut->q = ut->p; |
michael@0 | 1136 | ut->p = u8b; |
michael@0 | 1137 | ut->chunkContents = &u8b->buf[u8b->bufStartIdx]; |
michael@0 | 1138 | ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; |
michael@0 | 1139 | ut->chunkNativeStart = u8b->bufNativeStart; |
michael@0 | 1140 | ut->chunkNativeLimit = u8b->bufNativeLimit; |
michael@0 | 1141 | ut->nativeIndexingLimit = u8b->bufNILimit; |
michael@0 | 1142 | |
michael@0 | 1143 | // Index into the (now current) chunk |
michael@0 | 1144 | // For this function (swapBuffersAndFail), the requested index |
michael@0 | 1145 | // will always be at either the start or end of the chunk. |
michael@0 | 1146 | if (ix==u8b->bufNativeLimit) { |
michael@0 | 1147 | ut->chunkOffset = ut->chunkLength; |
michael@0 | 1148 | } else { |
michael@0 | 1149 | ut->chunkOffset = 0; |
michael@0 | 1150 | U_ASSERT(ix == u8b->bufNativeStart); |
michael@0 | 1151 | } |
michael@0 | 1152 | return FALSE; |
michael@0 | 1153 | |
michael@0 | 1154 | makeStubBuffer: |
michael@0 | 1155 | // The user has done a seek/access past the start or end |
michael@0 | 1156 | // of the string. Rather than loading data that is likely |
michael@0 | 1157 | // to never be used, just set up a zero-length buffer at |
michael@0 | 1158 | // the position. |
michael@0 | 1159 | u8b = (UTF8Buf *)ut->q; |
michael@0 | 1160 | u8b->bufNativeStart = ix; |
michael@0 | 1161 | u8b->bufNativeLimit = ix; |
michael@0 | 1162 | u8b->bufStartIdx = 0; |
michael@0 | 1163 | u8b->bufLimitIdx = 0; |
michael@0 | 1164 | u8b->bufNILimit = 0; |
michael@0 | 1165 | u8b->toUCharsMapStart = ix; |
michael@0 | 1166 | u8b->mapToNative[0] = 0; |
michael@0 | 1167 | u8b->mapToUChars[0] = 0; |
michael@0 | 1168 | goto swapBuffersAndFail; |
michael@0 | 1169 | |
michael@0 | 1170 | |
michael@0 | 1171 | |
michael@0 | 1172 | fillForward: |
michael@0 | 1173 | { |
michael@0 | 1174 | // Move the incoming index to a code point boundary. |
michael@0 | 1175 | U8_SET_CP_START(s8, 0, ix); |
michael@0 | 1176 | |
michael@0 | 1177 | // Swap the UText buffers. |
michael@0 | 1178 | // We want to fill what was previously the alternate buffer, |
michael@0 | 1179 | // and make what was the current buffer be the new alternate. |
michael@0 | 1180 | UTF8Buf *u8b = (UTF8Buf *)ut->q; |
michael@0 | 1181 | ut->q = ut->p; |
michael@0 | 1182 | ut->p = u8b; |
michael@0 | 1183 | |
michael@0 | 1184 | int32_t strLen = ut->b; |
michael@0 | 1185 | UBool nulTerminated = FALSE; |
michael@0 | 1186 | if (strLen < 0) { |
michael@0 | 1187 | strLen = 0x7fffffff; |
michael@0 | 1188 | nulTerminated = TRUE; |
michael@0 | 1189 | } |
michael@0 | 1190 | |
michael@0 | 1191 | UChar *buf = u8b->buf; |
michael@0 | 1192 | uint8_t *mapToNative = u8b->mapToNative; |
michael@0 | 1193 | uint8_t *mapToUChars = u8b->mapToUChars; |
michael@0 | 1194 | int32_t destIx = 0; |
michael@0 | 1195 | int32_t srcIx = ix; |
michael@0 | 1196 | UBool seenNonAscii = FALSE; |
michael@0 | 1197 | UChar32 c = 0; |
michael@0 | 1198 | |
michael@0 | 1199 | // Fill the chunk buffer and mapping arrays. |
michael@0 | 1200 | while (destIx<UTF8_TEXT_CHUNK_SIZE) { |
michael@0 | 1201 | c = s8[srcIx]; |
michael@0 | 1202 | if (c>0 && c<0x80) { |
michael@0 | 1203 | // Special case ASCII range for speed. |
michael@0 | 1204 | // zero is excluded to simplify bounds checking. |
michael@0 | 1205 | buf[destIx] = (UChar)c; |
michael@0 | 1206 | mapToNative[destIx] = (uint8_t)(srcIx - ix); |
michael@0 | 1207 | mapToUChars[srcIx-ix] = (uint8_t)destIx; |
michael@0 | 1208 | srcIx++; |
michael@0 | 1209 | destIx++; |
michael@0 | 1210 | } else { |
michael@0 | 1211 | // General case, handle everything. |
michael@0 | 1212 | if (seenNonAscii == FALSE) { |
michael@0 | 1213 | seenNonAscii = TRUE; |
michael@0 | 1214 | u8b->bufNILimit = destIx; |
michael@0 | 1215 | } |
michael@0 | 1216 | |
michael@0 | 1217 | int32_t cIx = srcIx; |
michael@0 | 1218 | int32_t dIx = destIx; |
michael@0 | 1219 | int32_t dIxSaved = destIx; |
michael@0 | 1220 | U8_NEXT_OR_FFFD(s8, srcIx, strLen, c); |
michael@0 | 1221 | if (c==0 && nulTerminated) { |
michael@0 | 1222 | srcIx--; |
michael@0 | 1223 | break; |
michael@0 | 1224 | } |
michael@0 | 1225 | |
michael@0 | 1226 | U16_APPEND_UNSAFE(buf, destIx, c); |
michael@0 | 1227 | do { |
michael@0 | 1228 | mapToNative[dIx++] = (uint8_t)(cIx - ix); |
michael@0 | 1229 | } while (dIx < destIx); |
michael@0 | 1230 | |
michael@0 | 1231 | do { |
michael@0 | 1232 | mapToUChars[cIx++ - ix] = (uint8_t)dIxSaved; |
michael@0 | 1233 | } while (cIx < srcIx); |
michael@0 | 1234 | } |
michael@0 | 1235 | if (srcIx>=strLen) { |
michael@0 | 1236 | break; |
michael@0 | 1237 | } |
michael@0 | 1238 | |
michael@0 | 1239 | } |
michael@0 | 1240 | |
michael@0 | 1241 | // store Native <--> Chunk Map entries for the end of the buffer. |
michael@0 | 1242 | // There is no actual character here, but the index position is valid. |
michael@0 | 1243 | mapToNative[destIx] = (uint8_t)(srcIx - ix); |
michael@0 | 1244 | mapToUChars[srcIx - ix] = (uint8_t)destIx; |
michael@0 | 1245 | |
michael@0 | 1246 | // fill in Buffer descriptor |
michael@0 | 1247 | u8b->bufNativeStart = ix; |
michael@0 | 1248 | u8b->bufNativeLimit = srcIx; |
michael@0 | 1249 | u8b->bufStartIdx = 0; |
michael@0 | 1250 | u8b->bufLimitIdx = destIx; |
michael@0 | 1251 | if (seenNonAscii == FALSE) { |
michael@0 | 1252 | u8b->bufNILimit = destIx; |
michael@0 | 1253 | } |
michael@0 | 1254 | u8b->toUCharsMapStart = u8b->bufNativeStart; |
michael@0 | 1255 | |
michael@0 | 1256 | // Set UText chunk to refer to this buffer. |
michael@0 | 1257 | ut->chunkContents = buf; |
michael@0 | 1258 | ut->chunkOffset = 0; |
michael@0 | 1259 | ut->chunkLength = u8b->bufLimitIdx; |
michael@0 | 1260 | ut->chunkNativeStart = u8b->bufNativeStart; |
michael@0 | 1261 | ut->chunkNativeLimit = u8b->bufNativeLimit; |
michael@0 | 1262 | ut->nativeIndexingLimit = u8b->bufNILimit; |
michael@0 | 1263 | |
michael@0 | 1264 | // For zero terminated strings, keep track of the maximum point |
michael@0 | 1265 | // scanned so far. |
michael@0 | 1266 | if (nulTerminated && srcIx>ut->c) { |
michael@0 | 1267 | ut->c = srcIx; |
michael@0 | 1268 | if (c==0) { |
michael@0 | 1269 | // We scanned to the end. |
michael@0 | 1270 | // Remember the actual length. |
michael@0 | 1271 | ut->b = srcIx; |
michael@0 | 1272 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 1273 | } |
michael@0 | 1274 | } |
michael@0 | 1275 | return TRUE; |
michael@0 | 1276 | } |
michael@0 | 1277 | |
michael@0 | 1278 | |
michael@0 | 1279 | fillReverse: |
michael@0 | 1280 | { |
michael@0 | 1281 | // Move the incoming index to a code point boundary. |
michael@0 | 1282 | // Can only do this if the incoming index is somewhere in the interior of the string. |
michael@0 | 1283 | // If index is at the end, there is no character there to look at. |
michael@0 | 1284 | if (ix != ut->b) { |
michael@0 | 1285 | U8_SET_CP_START(s8, 0, ix); |
michael@0 | 1286 | } |
michael@0 | 1287 | |
michael@0 | 1288 | // Swap the UText buffers. |
michael@0 | 1289 | // We want to fill what was previously the alternate buffer, |
michael@0 | 1290 | // and make what was the current buffer be the new alternate. |
michael@0 | 1291 | UTF8Buf *u8b = (UTF8Buf *)ut->q; |
michael@0 | 1292 | ut->q = ut->p; |
michael@0 | 1293 | ut->p = u8b; |
michael@0 | 1294 | |
michael@0 | 1295 | UChar *buf = u8b->buf; |
michael@0 | 1296 | uint8_t *mapToNative = u8b->mapToNative; |
michael@0 | 1297 | uint8_t *mapToUChars = u8b->mapToUChars; |
michael@0 | 1298 | int32_t toUCharsMapStart = ix - (UTF8_TEXT_CHUNK_SIZE*3 + 1); |
michael@0 | 1299 | int32_t destIx = UTF8_TEXT_CHUNK_SIZE+2; // Start in the overflow region |
michael@0 | 1300 | // at end of buffer to leave room |
michael@0 | 1301 | // for a surrogate pair at the |
michael@0 | 1302 | // buffer start. |
michael@0 | 1303 | int32_t srcIx = ix; |
michael@0 | 1304 | int32_t bufNILimit = destIx; |
michael@0 | 1305 | UChar32 c; |
michael@0 | 1306 | |
michael@0 | 1307 | // Map to/from Native Indexes, fill in for the position at the end of |
michael@0 | 1308 | // the buffer. |
michael@0 | 1309 | // |
michael@0 | 1310 | mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); |
michael@0 | 1311 | mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; |
michael@0 | 1312 | |
michael@0 | 1313 | // Fill the chunk buffer |
michael@0 | 1314 | // Work backwards, filling from the end of the buffer towards the front. |
michael@0 | 1315 | // |
michael@0 | 1316 | while (destIx>2 && (srcIx - toUCharsMapStart > 5) && (srcIx > 0)) { |
michael@0 | 1317 | srcIx--; |
michael@0 | 1318 | destIx--; |
michael@0 | 1319 | |
michael@0 | 1320 | // Get last byte of the UTF-8 character |
michael@0 | 1321 | c = s8[srcIx]; |
michael@0 | 1322 | if (c<0x80) { |
michael@0 | 1323 | // Special case ASCII range for speed. |
michael@0 | 1324 | buf[destIx] = (UChar)c; |
michael@0 | 1325 | mapToUChars[srcIx - toUCharsMapStart] = (uint8_t)destIx; |
michael@0 | 1326 | mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); |
michael@0 | 1327 | } else { |
michael@0 | 1328 | // General case, handle everything non-ASCII. |
michael@0 | 1329 | |
michael@0 | 1330 | int32_t sIx = srcIx; // ix of last byte of multi-byte u8 char |
michael@0 | 1331 | |
michael@0 | 1332 | // Get the full character from the UTF8 string. |
michael@0 | 1333 | // use code derived from the macros in utf8.h |
michael@0 | 1334 | // Leaves srcIx pointing at the first byte of the UTF-8 char. |
michael@0 | 1335 | // |
michael@0 | 1336 | c=utf8_prevCharSafeBody(s8, 0, &srcIx, c, -3); |
michael@0 | 1337 | // leaves srcIx at first byte of the multi-byte char. |
michael@0 | 1338 | |
michael@0 | 1339 | // Store the character in UTF-16 buffer. |
michael@0 | 1340 | if (c<0x10000) { |
michael@0 | 1341 | buf[destIx] = (UChar)c; |
michael@0 | 1342 | mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); |
michael@0 | 1343 | } else { |
michael@0 | 1344 | buf[destIx] = U16_TRAIL(c); |
michael@0 | 1345 | mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); |
michael@0 | 1346 | buf[--destIx] = U16_LEAD(c); |
michael@0 | 1347 | mapToNative[destIx] = (uint8_t)(srcIx - toUCharsMapStart); |
michael@0 | 1348 | } |
michael@0 | 1349 | |
michael@0 | 1350 | // Fill in the map from native indexes to UChars buf index. |
michael@0 | 1351 | do { |
michael@0 | 1352 | mapToUChars[sIx-- - toUCharsMapStart] = (uint8_t)destIx; |
michael@0 | 1353 | } while (sIx >= srcIx); |
michael@0 | 1354 | |
michael@0 | 1355 | // Set native indexing limit to be the current position. |
michael@0 | 1356 | // We are processing a non-ascii, non-native-indexing char now; |
michael@0 | 1357 | // the limit will be here if the rest of the chars to be |
michael@0 | 1358 | // added to this buffer are ascii. |
michael@0 | 1359 | bufNILimit = destIx; |
michael@0 | 1360 | } |
michael@0 | 1361 | } |
michael@0 | 1362 | u8b->bufNativeStart = srcIx; |
michael@0 | 1363 | u8b->bufNativeLimit = ix; |
michael@0 | 1364 | u8b->bufStartIdx = destIx; |
michael@0 | 1365 | u8b->bufLimitIdx = UTF8_TEXT_CHUNK_SIZE+2; |
michael@0 | 1366 | u8b->bufNILimit = bufNILimit - u8b->bufStartIdx; |
michael@0 | 1367 | u8b->toUCharsMapStart = toUCharsMapStart; |
michael@0 | 1368 | |
michael@0 | 1369 | ut->chunkContents = &buf[u8b->bufStartIdx]; |
michael@0 | 1370 | ut->chunkLength = u8b->bufLimitIdx - u8b->bufStartIdx; |
michael@0 | 1371 | ut->chunkOffset = ut->chunkLength; |
michael@0 | 1372 | ut->chunkNativeStart = u8b->bufNativeStart; |
michael@0 | 1373 | ut->chunkNativeLimit = u8b->bufNativeLimit; |
michael@0 | 1374 | ut->nativeIndexingLimit = u8b->bufNILimit; |
michael@0 | 1375 | return TRUE; |
michael@0 | 1376 | } |
michael@0 | 1377 | |
michael@0 | 1378 | } |
michael@0 | 1379 | |
michael@0 | 1380 | |
michael@0 | 1381 | |
michael@0 | 1382 | // |
michael@0 | 1383 | // This is a slightly modified copy of u_strFromUTF8, |
michael@0 | 1384 | // Inserts a Replacement Char rather than failing on invalid UTF-8 |
michael@0 | 1385 | // Removes unnecessary features. |
michael@0 | 1386 | // |
michael@0 | 1387 | static UChar* |
michael@0 | 1388 | utext_strFromUTF8(UChar *dest, |
michael@0 | 1389 | int32_t destCapacity, |
michael@0 | 1390 | int32_t *pDestLength, |
michael@0 | 1391 | const char* src, |
michael@0 | 1392 | int32_t srcLength, // required. NUL terminated not supported. |
michael@0 | 1393 | UErrorCode *pErrorCode |
michael@0 | 1394 | ) |
michael@0 | 1395 | { |
michael@0 | 1396 | |
michael@0 | 1397 | UChar *pDest = dest; |
michael@0 | 1398 | UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; |
michael@0 | 1399 | UChar32 ch=0; |
michael@0 | 1400 | int32_t index = 0; |
michael@0 | 1401 | int32_t reqLength = 0; |
michael@0 | 1402 | uint8_t* pSrc = (uint8_t*) src; |
michael@0 | 1403 | |
michael@0 | 1404 | |
michael@0 | 1405 | while((index < srcLength)&&(pDest<pDestLimit)){ |
michael@0 | 1406 | ch = pSrc[index++]; |
michael@0 | 1407 | if(ch <=0x7f){ |
michael@0 | 1408 | *pDest++=(UChar)ch; |
michael@0 | 1409 | }else{ |
michael@0 | 1410 | ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); |
michael@0 | 1411 | if(U_IS_BMP(ch)){ |
michael@0 | 1412 | *(pDest++)=(UChar)ch; |
michael@0 | 1413 | }else{ |
michael@0 | 1414 | *(pDest++)=U16_LEAD(ch); |
michael@0 | 1415 | if(pDest<pDestLimit){ |
michael@0 | 1416 | *(pDest++)=U16_TRAIL(ch); |
michael@0 | 1417 | }else{ |
michael@0 | 1418 | reqLength++; |
michael@0 | 1419 | break; |
michael@0 | 1420 | } |
michael@0 | 1421 | } |
michael@0 | 1422 | } |
michael@0 | 1423 | } |
michael@0 | 1424 | /* donot fill the dest buffer just count the UChars needed */ |
michael@0 | 1425 | while(index < srcLength){ |
michael@0 | 1426 | ch = pSrc[index++]; |
michael@0 | 1427 | if(ch <= 0x7f){ |
michael@0 | 1428 | reqLength++; |
michael@0 | 1429 | }else{ |
michael@0 | 1430 | ch=utf8_nextCharSafeBody(pSrc, &index, srcLength, ch, -3); |
michael@0 | 1431 | reqLength+=U16_LENGTH(ch); |
michael@0 | 1432 | } |
michael@0 | 1433 | } |
michael@0 | 1434 | |
michael@0 | 1435 | reqLength+=(int32_t)(pDest - dest); |
michael@0 | 1436 | |
michael@0 | 1437 | if(pDestLength){ |
michael@0 | 1438 | *pDestLength = reqLength; |
michael@0 | 1439 | } |
michael@0 | 1440 | |
michael@0 | 1441 | /* Terminate the buffer */ |
michael@0 | 1442 | u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); |
michael@0 | 1443 | |
michael@0 | 1444 | return dest; |
michael@0 | 1445 | } |
michael@0 | 1446 | |
michael@0 | 1447 | |
michael@0 | 1448 | |
michael@0 | 1449 | static int32_t U_CALLCONV |
michael@0 | 1450 | utf8TextExtract(UText *ut, |
michael@0 | 1451 | int64_t start, int64_t limit, |
michael@0 | 1452 | UChar *dest, int32_t destCapacity, |
michael@0 | 1453 | UErrorCode *pErrorCode) { |
michael@0 | 1454 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 1455 | return 0; |
michael@0 | 1456 | } |
michael@0 | 1457 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { |
michael@0 | 1458 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1459 | return 0; |
michael@0 | 1460 | } |
michael@0 | 1461 | int32_t length = ut->b; |
michael@0 | 1462 | int32_t start32 = pinIndex(start, length); |
michael@0 | 1463 | int32_t limit32 = pinIndex(limit, length); |
michael@0 | 1464 | |
michael@0 | 1465 | if(start32>limit32) { |
michael@0 | 1466 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1467 | return 0; |
michael@0 | 1468 | } |
michael@0 | 1469 | |
michael@0 | 1470 | |
michael@0 | 1471 | // adjust the incoming indexes to land on code point boundaries if needed. |
michael@0 | 1472 | // adjust by no more than three, because that is the largest number of trail bytes |
michael@0 | 1473 | // in a well formed UTF8 character. |
michael@0 | 1474 | const uint8_t *buf = (const uint8_t *)ut->context; |
michael@0 | 1475 | int i; |
michael@0 | 1476 | if (start32 < ut->chunkNativeLimit) { |
michael@0 | 1477 | for (i=0; i<3; i++) { |
michael@0 | 1478 | if (U8_IS_SINGLE(buf[start32]) || U8_IS_LEAD(buf[start32]) || start32==0) { |
michael@0 | 1479 | break; |
michael@0 | 1480 | } |
michael@0 | 1481 | start32--; |
michael@0 | 1482 | } |
michael@0 | 1483 | } |
michael@0 | 1484 | |
michael@0 | 1485 | if (limit32 < ut->chunkNativeLimit) { |
michael@0 | 1486 | for (i=0; i<3; i++) { |
michael@0 | 1487 | if (U8_IS_SINGLE(buf[limit32]) || U8_IS_LEAD(buf[limit32]) || limit32==0) { |
michael@0 | 1488 | break; |
michael@0 | 1489 | } |
michael@0 | 1490 | limit32--; |
michael@0 | 1491 | } |
michael@0 | 1492 | } |
michael@0 | 1493 | |
michael@0 | 1494 | // Do the actual extract. |
michael@0 | 1495 | int32_t destLength=0; |
michael@0 | 1496 | utext_strFromUTF8(dest, destCapacity, &destLength, |
michael@0 | 1497 | (const char *)ut->context+start32, limit32-start32, |
michael@0 | 1498 | pErrorCode); |
michael@0 | 1499 | utf8TextAccess(ut, limit32, TRUE); |
michael@0 | 1500 | return destLength; |
michael@0 | 1501 | } |
michael@0 | 1502 | |
michael@0 | 1503 | // |
michael@0 | 1504 | // utf8TextMapOffsetToNative |
michael@0 | 1505 | // |
michael@0 | 1506 | // Map a chunk (UTF-16) offset to a native index. |
michael@0 | 1507 | static int64_t U_CALLCONV |
michael@0 | 1508 | utf8TextMapOffsetToNative(const UText *ut) { |
michael@0 | 1509 | // |
michael@0 | 1510 | UTF8Buf *u8b = (UTF8Buf *)ut->p; |
michael@0 | 1511 | U_ASSERT(ut->chunkOffset>ut->nativeIndexingLimit && ut->chunkOffset<=ut->chunkLength); |
michael@0 | 1512 | int32_t nativeOffset = u8b->mapToNative[ut->chunkOffset + u8b->bufStartIdx] + u8b->toUCharsMapStart; |
michael@0 | 1513 | U_ASSERT(nativeOffset >= ut->chunkNativeStart && nativeOffset <= ut->chunkNativeLimit); |
michael@0 | 1514 | return nativeOffset; |
michael@0 | 1515 | } |
michael@0 | 1516 | |
michael@0 | 1517 | // |
michael@0 | 1518 | // Map a native index to the corrsponding chunk offset |
michael@0 | 1519 | // |
michael@0 | 1520 | static int32_t U_CALLCONV |
michael@0 | 1521 | utf8TextMapIndexToUTF16(const UText *ut, int64_t index64) { |
michael@0 | 1522 | U_ASSERT(index64 <= 0x7fffffff); |
michael@0 | 1523 | int32_t index = (int32_t)index64; |
michael@0 | 1524 | UTF8Buf *u8b = (UTF8Buf *)ut->p; |
michael@0 | 1525 | U_ASSERT(index>=ut->chunkNativeStart+ut->nativeIndexingLimit); |
michael@0 | 1526 | U_ASSERT(index<=ut->chunkNativeLimit); |
michael@0 | 1527 | int32_t mapIndex = index - u8b->toUCharsMapStart; |
michael@0 | 1528 | int32_t offset = u8b->mapToUChars[mapIndex] - u8b->bufStartIdx; |
michael@0 | 1529 | U_ASSERT(offset>=0 && offset<=ut->chunkLength); |
michael@0 | 1530 | return offset; |
michael@0 | 1531 | } |
michael@0 | 1532 | |
michael@0 | 1533 | static UText * U_CALLCONV |
michael@0 | 1534 | utf8TextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) |
michael@0 | 1535 | { |
michael@0 | 1536 | // First do a generic shallow clone. Does everything needed for the UText struct itself. |
michael@0 | 1537 | dest = shallowTextClone(dest, src, status); |
michael@0 | 1538 | |
michael@0 | 1539 | // For deep clones, make a copy of the string. |
michael@0 | 1540 | // The copied storage is owned by the newly created clone. |
michael@0 | 1541 | // |
michael@0 | 1542 | // TODO: There is an isssue with using utext_nativeLength(). |
michael@0 | 1543 | // That function is non-const in cases where the input was NUL terminated |
michael@0 | 1544 | // and the length has not yet been determined. |
michael@0 | 1545 | // This function (clone()) is const. |
michael@0 | 1546 | // There potentially a thread safety issue lurking here. |
michael@0 | 1547 | // |
michael@0 | 1548 | if (deep && U_SUCCESS(*status)) { |
michael@0 | 1549 | int32_t len = (int32_t)utext_nativeLength((UText *)src); |
michael@0 | 1550 | char *copyStr = (char *)uprv_malloc(len+1); |
michael@0 | 1551 | if (copyStr == NULL) { |
michael@0 | 1552 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 1553 | } else { |
michael@0 | 1554 | uprv_memcpy(copyStr, src->context, len+1); |
michael@0 | 1555 | dest->context = copyStr; |
michael@0 | 1556 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); |
michael@0 | 1557 | } |
michael@0 | 1558 | } |
michael@0 | 1559 | return dest; |
michael@0 | 1560 | } |
michael@0 | 1561 | |
michael@0 | 1562 | |
michael@0 | 1563 | static void U_CALLCONV |
michael@0 | 1564 | utf8TextClose(UText *ut) { |
michael@0 | 1565 | // Most of the work of close is done by the generic UText framework close. |
michael@0 | 1566 | // All that needs to be done here is to delete the UTF8 string if the UText |
michael@0 | 1567 | // owns it. This occurs if the UText was created by cloning. |
michael@0 | 1568 | if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { |
michael@0 | 1569 | char *s = (char *)ut->context; |
michael@0 | 1570 | uprv_free(s); |
michael@0 | 1571 | ut->context = NULL; |
michael@0 | 1572 | } |
michael@0 | 1573 | } |
michael@0 | 1574 | |
michael@0 | 1575 | U_CDECL_END |
michael@0 | 1576 | |
michael@0 | 1577 | |
michael@0 | 1578 | static const struct UTextFuncs utf8Funcs = |
michael@0 | 1579 | { |
michael@0 | 1580 | sizeof(UTextFuncs), |
michael@0 | 1581 | 0, 0, 0, // Reserved alignment padding |
michael@0 | 1582 | utf8TextClone, |
michael@0 | 1583 | utf8TextLength, |
michael@0 | 1584 | utf8TextAccess, |
michael@0 | 1585 | utf8TextExtract, |
michael@0 | 1586 | NULL, /* replace*/ |
michael@0 | 1587 | NULL, /* copy */ |
michael@0 | 1588 | utf8TextMapOffsetToNative, |
michael@0 | 1589 | utf8TextMapIndexToUTF16, |
michael@0 | 1590 | utf8TextClose, |
michael@0 | 1591 | NULL, // spare 1 |
michael@0 | 1592 | NULL, // spare 2 |
michael@0 | 1593 | NULL // spare 3 |
michael@0 | 1594 | }; |
michael@0 | 1595 | |
michael@0 | 1596 | |
michael@0 | 1597 | static const char gEmptyString[] = {0}; |
michael@0 | 1598 | |
michael@0 | 1599 | U_CAPI UText * U_EXPORT2 |
michael@0 | 1600 | utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status) { |
michael@0 | 1601 | if(U_FAILURE(*status)) { |
michael@0 | 1602 | return NULL; |
michael@0 | 1603 | } |
michael@0 | 1604 | if(s==NULL && length==0) { |
michael@0 | 1605 | s = gEmptyString; |
michael@0 | 1606 | } |
michael@0 | 1607 | |
michael@0 | 1608 | if(s==NULL || length<-1 || length>INT32_MAX) { |
michael@0 | 1609 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1610 | return NULL; |
michael@0 | 1611 | } |
michael@0 | 1612 | |
michael@0 | 1613 | ut = utext_setup(ut, sizeof(UTF8Buf) * 2, status); |
michael@0 | 1614 | if (U_FAILURE(*status)) { |
michael@0 | 1615 | return ut; |
michael@0 | 1616 | } |
michael@0 | 1617 | |
michael@0 | 1618 | ut->pFuncs = &utf8Funcs; |
michael@0 | 1619 | ut->context = s; |
michael@0 | 1620 | ut->b = (int32_t)length; |
michael@0 | 1621 | ut->c = (int32_t)length; |
michael@0 | 1622 | if (ut->c < 0) { |
michael@0 | 1623 | ut->c = 0; |
michael@0 | 1624 | ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 1625 | } |
michael@0 | 1626 | ut->p = ut->pExtra; |
michael@0 | 1627 | ut->q = (char *)ut->pExtra + sizeof(UTF8Buf); |
michael@0 | 1628 | return ut; |
michael@0 | 1629 | |
michael@0 | 1630 | } |
michael@0 | 1631 | |
michael@0 | 1632 | |
michael@0 | 1633 | |
michael@0 | 1634 | |
michael@0 | 1635 | |
michael@0 | 1636 | |
michael@0 | 1637 | |
michael@0 | 1638 | |
michael@0 | 1639 | //------------------------------------------------------------------------------ |
michael@0 | 1640 | // |
michael@0 | 1641 | // UText implementation wrapper for Replaceable (read/write) |
michael@0 | 1642 | // |
michael@0 | 1643 | // Use of UText data members: |
michael@0 | 1644 | // context pointer to Replaceable. |
michael@0 | 1645 | // p not referenced by the functions below; the UText owns the Replaceable (and deletes it on close) when UTEXT_PROVIDER_OWNS_TEXT is set. |
michael@0 | 1646 | // |
michael@0 | 1647 | //------------------------------------------------------------------------------ |
michael@0 | 1648 | |
michael@0 | 1649 | |
michael@0 | 1650 | |
michael@0 | 1651 | // minimum chunk size for this implementation: 3 |
michael@0 | 1652 | // to allow for possible trimming for code point boundaries |
michael@0 | 1653 | enum { REP_TEXT_CHUNK_SIZE=10 }; |
michael@0 | 1654 | |
michael@0 | 1655 | struct ReplExtra { |
michael@0 | 1656 | /* |
michael@0 | 1657 | * Chunk UChars. |
michael@0 | 1658 | * +1 to simplify filling with surrogate pair at the end. |
michael@0 | 1659 | */ |
michael@0 | 1660 | UChar s[REP_TEXT_CHUNK_SIZE+1]; |
michael@0 | 1661 | }; |
michael@0 | 1662 | |
michael@0 | 1663 | |
michael@0 | 1664 | U_CDECL_BEGIN |
michael@0 | 1665 | |
michael@0 | 1666 | static UText * U_CALLCONV |
michael@0 | 1667 | repTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { |
michael@0 | 1668 | // First do a generic shallow clone. Does everything needed for the UText struct itself. |
michael@0 | 1669 | dest = shallowTextClone(dest, src, status); |
michael@0 | 1670 | |
michael@0 | 1671 | // For deep clones, make a copy of the Replaceable. |
michael@0 | 1672 | // The copied Replaceable storage is owned by the newly created UText clone. |
michael@0 | 1673 | // A non-NULL pointer in UText.p is the signal to the close() function to delete |
michael@0 | 1674 | // it. |
michael@0 | 1675 | // |
michael@0 | 1676 | if (deep && U_SUCCESS(*status)) { |
michael@0 | 1677 | const Replaceable *replSrc = (const Replaceable *)src->context; |
michael@0 | 1678 | dest->context = replSrc->clone(); |
michael@0 | 1679 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); |
michael@0 | 1680 | |
michael@0 | 1681 | // with deep clone, the copy is writable, even when the source is not. |
michael@0 | 1682 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); |
michael@0 | 1683 | } |
michael@0 | 1684 | return dest; |
michael@0 | 1685 | } |
michael@0 | 1686 | |
michael@0 | 1687 | |
michael@0 | 1688 | static void U_CALLCONV |
michael@0 | 1689 | repTextClose(UText *ut) { |
michael@0 | 1690 | // Most of the work of close is done by the generic UText framework close. |
michael@0 | 1691 | // All that needs to be done here is delete the Replaceable if the UText |
michael@0 | 1692 | // owns it. This occurs if the UText was created by cloning. |
michael@0 | 1693 | if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { |
michael@0 | 1694 | Replaceable *rep = (Replaceable *)ut->context; |
michael@0 | 1695 | delete rep; |
michael@0 | 1696 | ut->context = NULL; |
michael@0 | 1697 | } |
michael@0 | 1698 | } |
michael@0 | 1699 | |
michael@0 | 1700 | |
michael@0 | 1701 | static int64_t U_CALLCONV |
michael@0 | 1702 | repTextLength(UText *ut) { |
michael@0 | 1703 | const Replaceable *replSrc = (const Replaceable *)ut->context; |
michael@0 | 1704 | int32_t len = replSrc->length(); |
michael@0 | 1705 | return len; |
michael@0 | 1706 | } |
michael@0 | 1707 | |
michael@0 | 1708 | |
michael@0 | 1709 | static UBool U_CALLCONV |
michael@0 | 1710 | repTextAccess(UText *ut, int64_t index, UBool forward) { |
michael@0 | 1711 | const Replaceable *rep=(const Replaceable *)ut->context; |
michael@0 | 1712 | int32_t length=rep->length(); // Full length of the input text (bigger than a chunk) |
michael@0 | 1713 | |
michael@0 | 1714 | // clip the requested index to the limits of the text. |
michael@0 | 1715 | int32_t index32 = pinIndex(index, length); |
michael@0 | 1716 | U_ASSERT(index<=INT32_MAX); |
michael@0 | 1717 | |
michael@0 | 1718 | |
michael@0 | 1719 | /* |
michael@0 | 1720 | * Compute start/limit boundaries around index, for a segment of text |
michael@0 | 1721 | * to be extracted. |
michael@0 | 1722 | * To allow for the possibility that our user gave an index to the trailing |
michael@0 | 1723 | * half of a surrogate pair, we must request one extra preceding UChar when |
michael@0 | 1724 | * going in the forward direction. This will ensure that the buffer has the |
michael@0 | 1725 | * entire code point at the specified index. |
michael@0 | 1726 | */ |
michael@0 | 1727 | if(forward) { |
michael@0 | 1728 | |
michael@0 | 1729 | if (index32>=ut->chunkNativeStart && index32<ut->chunkNativeLimit) { |
michael@0 | 1730 | // Buffer already contains the requested position. |
michael@0 | 1731 | ut->chunkOffset = (int32_t)(index - ut->chunkNativeStart); |
michael@0 | 1732 | return TRUE; |
michael@0 | 1733 | } |
michael@0 | 1734 | if (index32>=length && ut->chunkNativeLimit==length) { |
michael@0 | 1735 | // Request for end of string, and buffer already extends up to it. |
michael@0 | 1736 | // Can't get the data, but don't change the buffer. |
michael@0 | 1737 | ut->chunkOffset = length - (int32_t)ut->chunkNativeStart; |
michael@0 | 1738 | return FALSE; |
michael@0 | 1739 | } |
michael@0 | 1740 | |
michael@0 | 1741 | ut->chunkNativeLimit = index + REP_TEXT_CHUNK_SIZE - 1; |
michael@0 | 1742 | // Going forward, so we want to have the buffer with stuff at and beyond |
michael@0 | 1743 | // the requested index. The -1 gets us one code point before the |
michael@0 | 1744 | // requested index also, to handle the case of the index being on |
michael@0 | 1745 | // a trail surrogate of a surrogate pair. |
michael@0 | 1746 | if(ut->chunkNativeLimit > length) { |
michael@0 | 1747 | ut->chunkNativeLimit = length; |
michael@0 | 1748 | } |
michael@0 | 1749 | // unless buffer ran off end, start is index-1. |
michael@0 | 1750 | ut->chunkNativeStart = ut->chunkNativeLimit - REP_TEXT_CHUNK_SIZE; |
michael@0 | 1751 | if(ut->chunkNativeStart < 0) { |
michael@0 | 1752 | ut->chunkNativeStart = 0; |
michael@0 | 1753 | } |
michael@0 | 1754 | } else { |
michael@0 | 1755 | // Reverse iteration. Fill buffer with data preceding the requested index. |
michael@0 | 1756 | if (index32>ut->chunkNativeStart && index32<=ut->chunkNativeLimit) { |
michael@0 | 1757 | // Requested position already in buffer. |
michael@0 | 1758 | ut->chunkOffset = index32 - (int32_t)ut->chunkNativeStart; |
michael@0 | 1759 | return TRUE; |
michael@0 | 1760 | } |
michael@0 | 1761 | if (index32==0 && ut->chunkNativeStart==0) { |
michael@0 | 1762 | // Request for start, buffer already begins at start. |
michael@0 | 1763 | // No data, but keep the buffer as is. |
michael@0 | 1764 | ut->chunkOffset = 0; |
michael@0 | 1765 | return FALSE; |
michael@0 | 1766 | } |
michael@0 | 1767 | |
michael@0 | 1768 | // Figure out the bounds of the chunk to extract for reverse iteration. |
michael@0 | 1769 | // Need to worry about chunk not splitting surrogate pairs, and while still |
michael@0 | 1770 | // containing the data we need. |
michael@0 | 1771 | // Fix by requesting a chunk that includes an extra UChar at the end. |
michael@0 | 1772 | // If this turns out to be a lead surrogate, we can lop it off and still have |
michael@0 | 1773 | // the data we wanted. |
michael@0 | 1774 | ut->chunkNativeStart = index32 + 1 - REP_TEXT_CHUNK_SIZE; |
michael@0 | 1775 | if (ut->chunkNativeStart < 0) { |
michael@0 | 1776 | ut->chunkNativeStart = 0; |
michael@0 | 1777 | } |
michael@0 | 1778 | |
michael@0 | 1779 | ut->chunkNativeLimit = index32 + 1; |
michael@0 | 1780 | if (ut->chunkNativeLimit > length) { |
michael@0 | 1781 | ut->chunkNativeLimit = length; |
michael@0 | 1782 | } |
michael@0 | 1783 | } |
michael@0 | 1784 | |
michael@0 | 1785 | // Extract the new chunk of text from the Replaceable source. |
michael@0 | 1786 | ReplExtra *ex = (ReplExtra *)ut->pExtra; |
michael@0 | 1787 | // UnicodeString with its buffer a writable alias to the chunk buffer |
michael@0 | 1788 | UnicodeString buffer(ex->s, 0 /*buffer length*/, REP_TEXT_CHUNK_SIZE /*buffer capacity*/); |
michael@0 | 1789 | rep->extractBetween((int32_t)ut->chunkNativeStart, (int32_t)ut->chunkNativeLimit, buffer); |
michael@0 | 1790 | |
michael@0 | 1791 | ut->chunkContents = ex->s; |
michael@0 | 1792 | ut->chunkLength = (int32_t)(ut->chunkNativeLimit - ut->chunkNativeStart); |
michael@0 | 1793 | ut->chunkOffset = (int32_t)(index32 - ut->chunkNativeStart); |
michael@0 | 1794 | |
michael@0 | 1795 | // Surrogate pairs from the input text must not span chunk boundaries. |
michael@0 | 1796 | // If end of chunk could be the start of a surrogate, trim it off. |
michael@0 | 1797 | if (ut->chunkNativeLimit < length && |
michael@0 | 1798 | U16_IS_LEAD(ex->s[ut->chunkLength-1])) { |
michael@0 | 1799 | ut->chunkLength--; |
michael@0 | 1800 | ut->chunkNativeLimit--; |
michael@0 | 1801 | if (ut->chunkOffset > ut->chunkLength) { |
michael@0 | 1802 | ut->chunkOffset = ut->chunkLength; |
michael@0 | 1803 | } |
michael@0 | 1804 | } |
michael@0 | 1805 | |
michael@0 | 1806 | // if the first UChar in the chunk could be the trailing half of a surrogate pair, |
michael@0 | 1807 | // trim it off. |
michael@0 | 1808 | if(ut->chunkNativeStart>0 && U16_IS_TRAIL(ex->s[0])) { |
michael@0 | 1809 | ++(ut->chunkContents); |
michael@0 | 1810 | ++(ut->chunkNativeStart); |
michael@0 | 1811 | --(ut->chunkLength); |
michael@0 | 1812 | --(ut->chunkOffset); |
michael@0 | 1813 | } |
michael@0 | 1814 | |
michael@0 | 1815 | // adjust the index/chunkOffset to a code point boundary |
michael@0 | 1816 | U16_SET_CP_START(ut->chunkContents, 0, ut->chunkOffset); |
michael@0 | 1817 | |
michael@0 | 1818 | // Use fast indexing for get/setNativeIndex() |
michael@0 | 1819 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 1820 | |
michael@0 | 1821 | return TRUE; |
michael@0 | 1822 | } |
michael@0 | 1823 | |
michael@0 | 1824 | |
michael@0 | 1825 | |
michael@0 | 1826 | static int32_t U_CALLCONV |
michael@0 | 1827 | repTextExtract(UText *ut, |
michael@0 | 1828 | int64_t start, int64_t limit, |
michael@0 | 1829 | UChar *dest, int32_t destCapacity, |
michael@0 | 1830 | UErrorCode *status) { |
michael@0 | 1831 | const Replaceable *rep=(const Replaceable *)ut->context; |
michael@0 | 1832 | int32_t length=rep->length(); |
michael@0 | 1833 | |
michael@0 | 1834 | if(U_FAILURE(*status)) { |
michael@0 | 1835 | return 0; |
michael@0 | 1836 | } |
michael@0 | 1837 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { |
michael@0 | 1838 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1839 | } |
michael@0 | 1840 | if(start>limit) { |
michael@0 | 1841 | *status=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1842 | return 0; |
michael@0 | 1843 | } |
michael@0 | 1844 | |
michael@0 | 1845 | int32_t start32 = pinIndex(start, length); |
michael@0 | 1846 | int32_t limit32 = pinIndex(limit, length); |
michael@0 | 1847 | |
michael@0 | 1848 | // adjust start, limit if they point to trail half of surrogates |
michael@0 | 1849 | if (start32<length && U16_IS_TRAIL(rep->charAt(start32)) && |
michael@0 | 1850 | U_IS_SUPPLEMENTARY(rep->char32At(start32))){ |
michael@0 | 1851 | start32--; |
michael@0 | 1852 | } |
michael@0 | 1853 | if (limit32<length && U16_IS_TRAIL(rep->charAt(limit32)) && |
michael@0 | 1854 | U_IS_SUPPLEMENTARY(rep->char32At(limit32))){ |
michael@0 | 1855 | limit32--; |
michael@0 | 1856 | } |
michael@0 | 1857 | |
michael@0 | 1858 | length=limit32-start32; |
michael@0 | 1859 | if(length>destCapacity) { |
michael@0 | 1860 | limit32 = start32 + destCapacity; |
michael@0 | 1861 | } |
michael@0 | 1862 | UnicodeString buffer(dest, 0, destCapacity); // writable alias |
michael@0 | 1863 | rep->extractBetween(start32, limit32, buffer); |
michael@0 | 1864 | repTextAccess(ut, limit32, TRUE); |
michael@0 | 1865 | |
michael@0 | 1866 | return u_terminateUChars(dest, destCapacity, length, status); |
michael@0 | 1867 | } |
michael@0 | 1868 | |
michael@0 | 1869 | static int32_t U_CALLCONV |
michael@0 | 1870 | repTextReplace(UText *ut, |
michael@0 | 1871 | int64_t start, int64_t limit, |
michael@0 | 1872 | const UChar *src, int32_t length, |
michael@0 | 1873 | UErrorCode *status) { |
michael@0 | 1874 | Replaceable *rep=(Replaceable *)ut->context; |
michael@0 | 1875 | int32_t oldLength; |
michael@0 | 1876 | |
michael@0 | 1877 | if(U_FAILURE(*status)) { |
michael@0 | 1878 | return 0; |
michael@0 | 1879 | } |
michael@0 | 1880 | if(src==NULL && length!=0) { |
michael@0 | 1881 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1882 | return 0; |
michael@0 | 1883 | } |
michael@0 | 1884 | oldLength=rep->length(); // will subtract from new length |
michael@0 | 1885 | if(start>limit ) { |
michael@0 | 1886 | *status=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1887 | return 0; |
michael@0 | 1888 | } |
michael@0 | 1889 | |
michael@0 | 1890 | int32_t start32 = pinIndex(start, oldLength); |
michael@0 | 1891 | int32_t limit32 = pinIndex(limit, oldLength); |
michael@0 | 1892 | |
michael@0 | 1893 | // Snap start & limit to code point boundaries. |
michael@0 | 1894 | if (start32<oldLength && U16_IS_TRAIL(rep->charAt(start32)) && |
michael@0 | 1895 | start32>0 && U16_IS_LEAD(rep->charAt(start32-1))) |
michael@0 | 1896 | { |
michael@0 | 1897 | start32--; |
michael@0 | 1898 | } |
michael@0 | 1899 | if (limit32<oldLength && U16_IS_LEAD(rep->charAt(limit32-1)) && |
michael@0 | 1900 | U16_IS_TRAIL(rep->charAt(limit32))) |
michael@0 | 1901 | { |
michael@0 | 1902 | limit32++; |
michael@0 | 1903 | } |
michael@0 | 1904 | |
michael@0 | 1905 | // Do the actual replace operation using methods of the Replaceable class |
michael@0 | 1906 | UnicodeString replStr((UBool)(length<0), src, length); // read-only alias |
michael@0 | 1907 | rep->handleReplaceBetween(start32, limit32, replStr); |
michael@0 | 1908 | int32_t newLength = rep->length(); |
michael@0 | 1909 | int32_t lengthDelta = newLength - oldLength; |
michael@0 | 1910 | |
michael@0 | 1911 | // Is the UText chunk buffer OK? |
michael@0 | 1912 | if (ut->chunkNativeLimit > start32) { |
michael@0 | 1913 | // this replace operation may have impacted the current chunk. |
michael@0 | 1914 | // invalidate it, which will force a reload on the next access. |
michael@0 | 1915 | invalidateChunk(ut); |
michael@0 | 1916 | } |
michael@0 | 1917 | |
michael@0 | 1918 | // set the iteration position to the end of the newly inserted replacement text. |
michael@0 | 1919 | int32_t newIndexPos = limit32 + lengthDelta; |
michael@0 | 1920 | repTextAccess(ut, newIndexPos, TRUE); |
michael@0 | 1921 | |
michael@0 | 1922 | return lengthDelta; |
michael@0 | 1923 | } |
michael@0 | 1924 | |
michael@0 | 1925 | |
michael@0 | 1926 | static void U_CALLCONV |
michael@0 | 1927 | repTextCopy(UText *ut, |
michael@0 | 1928 | int64_t start, int64_t limit, |
michael@0 | 1929 | int64_t destIndex, |
michael@0 | 1930 | UBool move, |
michael@0 | 1931 | UErrorCode *status) |
michael@0 | 1932 | { |
michael@0 | 1933 | Replaceable *rep=(Replaceable *)ut->context; |
michael@0 | 1934 | int32_t length=rep->length(); |
michael@0 | 1935 | |
michael@0 | 1936 | if(U_FAILURE(*status)) { |
michael@0 | 1937 | return; |
michael@0 | 1938 | } |
michael@0 | 1939 | if (start>limit || (start<destIndex && destIndex<limit)) |
michael@0 | 1940 | { |
michael@0 | 1941 | *status=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1942 | return; |
michael@0 | 1943 | } |
michael@0 | 1944 | |
michael@0 | 1945 | int32_t start32 = pinIndex(start, length); |
michael@0 | 1946 | int32_t limit32 = pinIndex(limit, length); |
michael@0 | 1947 | int32_t destIndex32 = pinIndex(destIndex, length); |
michael@0 | 1948 | |
michael@0 | 1949 | // TODO: snap input parameters to code point boundaries. |
michael@0 | 1950 | |
michael@0 | 1951 | if(move) { |
michael@0 | 1952 | // move: copy to destIndex, then replace original with nothing |
michael@0 | 1953 | int32_t segLength=limit32-start32; |
michael@0 | 1954 | rep->copy(start32, limit32, destIndex32); |
michael@0 | 1955 | if(destIndex32<start32) { |
michael@0 | 1956 | start32+=segLength; |
michael@0 | 1957 | limit32+=segLength; |
michael@0 | 1958 | } |
michael@0 | 1959 | rep->handleReplaceBetween(start32, limit32, UnicodeString()); |
michael@0 | 1960 | } else { |
michael@0 | 1961 | // copy |
michael@0 | 1962 | rep->copy(start32, limit32, destIndex32); |
michael@0 | 1963 | } |
michael@0 | 1964 | |
michael@0 | 1965 | // If the change to the text touched the region in the chunk buffer, |
michael@0 | 1966 | // invalidate the buffer. |
michael@0 | 1967 | int32_t firstAffectedIndex = destIndex32; |
michael@0 | 1968 | if (move && start32<firstAffectedIndex) { |
michael@0 | 1969 | firstAffectedIndex = start32; |
michael@0 | 1970 | } |
michael@0 | 1971 | if (firstAffectedIndex < ut->chunkNativeLimit) { |
michael@0 | 1972 | // changes may have affected range covered by the chunk |
michael@0 | 1973 | invalidateChunk(ut); |
michael@0 | 1974 | } |
michael@0 | 1975 | |
michael@0 | 1976 | // Put iteration position at the newly inserted (moved) block, |
michael@0 | 1977 | int32_t nativeIterIndex = destIndex32 + limit32 - start32; |
michael@0 | 1978 | if (move && destIndex32>start32) { |
michael@0 | 1979 | // moved a block of text towards the end of the string. |
michael@0 | 1980 | nativeIterIndex = destIndex32; |
michael@0 | 1981 | } |
michael@0 | 1982 | |
michael@0 | 1983 | // Set position, reload chunk if needed. |
michael@0 | 1984 | repTextAccess(ut, nativeIterIndex, TRUE); |
michael@0 | 1985 | } |
michael@0 | 1986 | |
michael@0 | 1987 | static const struct UTextFuncs repFuncs = |
michael@0 | 1988 | { |
michael@0 | 1989 | sizeof(UTextFuncs), |
michael@0 | 1990 | 0, 0, 0, // Reserved alignment padding |
michael@0 | 1991 | repTextClone, |
michael@0 | 1992 | repTextLength, |
michael@0 | 1993 | repTextAccess, |
michael@0 | 1994 | repTextExtract, |
michael@0 | 1995 | repTextReplace, |
michael@0 | 1996 | repTextCopy, |
michael@0 | 1997 | NULL, // MapOffsetToNative, |
michael@0 | 1998 | NULL, // MapIndexToUTF16, |
michael@0 | 1999 | repTextClose, |
michael@0 | 2000 | NULL, // spare 1 |
michael@0 | 2001 | NULL, // spare 2 |
michael@0 | 2002 | NULL // spare 3 |
michael@0 | 2003 | }; |
michael@0 | 2004 | |
michael@0 | 2005 | |
michael@0 | 2006 | U_CAPI UText * U_EXPORT2 |
michael@0 | 2007 | utext_openReplaceable(UText *ut, Replaceable *rep, UErrorCode *status) |
michael@0 | 2008 | { |
michael@0 | 2009 | if(U_FAILURE(*status)) { |
michael@0 | 2010 | return NULL; |
michael@0 | 2011 | } |
michael@0 | 2012 | if(rep==NULL) { |
michael@0 | 2013 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2014 | return NULL; |
michael@0 | 2015 | } |
michael@0 | 2016 | ut = utext_setup(ut, sizeof(ReplExtra), status); |
michael@0 | 2017 | |
michael@0 | 2018 | ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_WRITABLE); |
michael@0 | 2019 | if(rep->hasMetaData()) { |
michael@0 | 2020 | ut->providerProperties |=I32_FLAG(UTEXT_PROVIDER_HAS_META_DATA); |
michael@0 | 2021 | } |
michael@0 | 2022 | |
michael@0 | 2023 | ut->pFuncs = &repFuncs; |
michael@0 | 2024 | ut->context = rep; |
michael@0 | 2025 | return ut; |
michael@0 | 2026 | } |
michael@0 | 2027 | |
michael@0 | 2028 | U_CDECL_END |
michael@0 | 2029 | |
michael@0 | 2030 | |
michael@0 | 2031 | |
michael@0 | 2032 | |
michael@0 | 2033 | |
michael@0 | 2034 | |
michael@0 | 2035 | |
michael@0 | 2036 | |
//------------------------------------------------------------------------------
//
//     UText implementation for UnicodeString (read/write)  and
//                    for const UnicodeString (read only)
//             (same implementation, only the flags are different)
//
//         Use of UText data members:
//            context    pointer to UnicodeString
//            providerProperties
//                       UTEXT_PROVIDER_OWNS_TEXT is set IF this UText owns the
//                       string (it was created by a deep clone) and the string
//                       must be deleted on close().  Not set otherwise.
//
//------------------------------------------------------------------------------
michael@0 | 2049 | |
michael@0 | 2050 | U_CDECL_BEGIN |
michael@0 | 2051 | |
michael@0 | 2052 | |
michael@0 | 2053 | static UText * U_CALLCONV |
michael@0 | 2054 | unistrTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status) { |
michael@0 | 2055 | // First do a generic shallow clone. Does everything needed for the UText struct itself. |
michael@0 | 2056 | dest = shallowTextClone(dest, src, status); |
michael@0 | 2057 | |
michael@0 | 2058 | // For deep clones, make a copy of the UnicodeSring. |
michael@0 | 2059 | // The copied UnicodeString storage is owned by the newly created UText clone. |
michael@0 | 2060 | // A non-NULL pointer in UText.p is the signal to the close() function to delete |
michael@0 | 2061 | // the UText. |
michael@0 | 2062 | // |
michael@0 | 2063 | if (deep && U_SUCCESS(*status)) { |
michael@0 | 2064 | const UnicodeString *srcString = (const UnicodeString *)src->context; |
michael@0 | 2065 | dest->context = new UnicodeString(*srcString); |
michael@0 | 2066 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); |
michael@0 | 2067 | |
michael@0 | 2068 | // with deep clone, the copy is writable, even when the source is not. |
michael@0 | 2069 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); |
michael@0 | 2070 | } |
michael@0 | 2071 | return dest; |
michael@0 | 2072 | } |
michael@0 | 2073 | |
michael@0 | 2074 | static void U_CALLCONV |
michael@0 | 2075 | unistrTextClose(UText *ut) { |
michael@0 | 2076 | // Most of the work of close is done by the generic UText framework close. |
michael@0 | 2077 | // All that needs to be done here is delete the UnicodeString if the UText |
michael@0 | 2078 | // owns it. This occurs if the UText was created by cloning. |
michael@0 | 2079 | if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { |
michael@0 | 2080 | UnicodeString *str = (UnicodeString *)ut->context; |
michael@0 | 2081 | delete str; |
michael@0 | 2082 | ut->context = NULL; |
michael@0 | 2083 | } |
michael@0 | 2084 | } |
michael@0 | 2085 | |
michael@0 | 2086 | |
michael@0 | 2087 | static int64_t U_CALLCONV |
michael@0 | 2088 | unistrTextLength(UText *t) { |
michael@0 | 2089 | return ((const UnicodeString *)t->context)->length(); |
michael@0 | 2090 | } |
michael@0 | 2091 | |
michael@0 | 2092 | |
michael@0 | 2093 | static UBool U_CALLCONV |
michael@0 | 2094 | unistrTextAccess(UText *ut, int64_t index, UBool forward) { |
michael@0 | 2095 | int32_t length = ut->chunkLength; |
michael@0 | 2096 | ut->chunkOffset = pinIndex(index, length); |
michael@0 | 2097 | |
michael@0 | 2098 | // Check whether request is at the start or end |
michael@0 | 2099 | UBool retVal = (forward && index<length) || (!forward && index>0); |
michael@0 | 2100 | return retVal; |
michael@0 | 2101 | } |
michael@0 | 2102 | |
michael@0 | 2103 | |
michael@0 | 2104 | |
michael@0 | 2105 | static int32_t U_CALLCONV |
michael@0 | 2106 | unistrTextExtract(UText *t, |
michael@0 | 2107 | int64_t start, int64_t limit, |
michael@0 | 2108 | UChar *dest, int32_t destCapacity, |
michael@0 | 2109 | UErrorCode *pErrorCode) { |
michael@0 | 2110 | const UnicodeString *us=(const UnicodeString *)t->context; |
michael@0 | 2111 | int32_t length=us->length(); |
michael@0 | 2112 | |
michael@0 | 2113 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 2114 | return 0; |
michael@0 | 2115 | } |
michael@0 | 2116 | if(destCapacity<0 || (dest==NULL && destCapacity>0)) { |
michael@0 | 2117 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2118 | } |
michael@0 | 2119 | if(start<0 || start>limit) { |
michael@0 | 2120 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 2121 | return 0; |
michael@0 | 2122 | } |
michael@0 | 2123 | |
michael@0 | 2124 | int32_t start32 = start<length ? us->getChar32Start((int32_t)start) : length; |
michael@0 | 2125 | int32_t limit32 = limit<length ? us->getChar32Start((int32_t)limit) : length; |
michael@0 | 2126 | |
michael@0 | 2127 | length=limit32-start32; |
michael@0 | 2128 | if (destCapacity>0 && dest!=NULL) { |
michael@0 | 2129 | int32_t trimmedLength = length; |
michael@0 | 2130 | if(trimmedLength>destCapacity) { |
michael@0 | 2131 | trimmedLength=destCapacity; |
michael@0 | 2132 | } |
michael@0 | 2133 | us->extract(start32, trimmedLength, dest); |
michael@0 | 2134 | t->chunkOffset = start32+trimmedLength; |
michael@0 | 2135 | } else { |
michael@0 | 2136 | t->chunkOffset = start32; |
michael@0 | 2137 | } |
michael@0 | 2138 | u_terminateUChars(dest, destCapacity, length, pErrorCode); |
michael@0 | 2139 | return length; |
michael@0 | 2140 | } |
michael@0 | 2141 | |
michael@0 | 2142 | static int32_t U_CALLCONV |
michael@0 | 2143 | unistrTextReplace(UText *ut, |
michael@0 | 2144 | int64_t start, int64_t limit, |
michael@0 | 2145 | const UChar *src, int32_t length, |
michael@0 | 2146 | UErrorCode *pErrorCode) { |
michael@0 | 2147 | UnicodeString *us=(UnicodeString *)ut->context; |
michael@0 | 2148 | int32_t oldLength; |
michael@0 | 2149 | |
michael@0 | 2150 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 2151 | return 0; |
michael@0 | 2152 | } |
michael@0 | 2153 | if(src==NULL && length!=0) { |
michael@0 | 2154 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2155 | } |
michael@0 | 2156 | if(start>limit) { |
michael@0 | 2157 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 2158 | return 0; |
michael@0 | 2159 | } |
michael@0 | 2160 | oldLength=us->length(); |
michael@0 | 2161 | int32_t start32 = pinIndex(start, oldLength); |
michael@0 | 2162 | int32_t limit32 = pinIndex(limit, oldLength); |
michael@0 | 2163 | if (start32 < oldLength) { |
michael@0 | 2164 | start32 = us->getChar32Start(start32); |
michael@0 | 2165 | } |
michael@0 | 2166 | if (limit32 < oldLength) { |
michael@0 | 2167 | limit32 = us->getChar32Start(limit32); |
michael@0 | 2168 | } |
michael@0 | 2169 | |
michael@0 | 2170 | // replace |
michael@0 | 2171 | us->replace(start32, limit32-start32, src, length); |
michael@0 | 2172 | int32_t newLength = us->length(); |
michael@0 | 2173 | |
michael@0 | 2174 | // Update the chunk description. |
michael@0 | 2175 | ut->chunkContents = us->getBuffer(); |
michael@0 | 2176 | ut->chunkLength = newLength; |
michael@0 | 2177 | ut->chunkNativeLimit = newLength; |
michael@0 | 2178 | ut->nativeIndexingLimit = newLength; |
michael@0 | 2179 | |
michael@0 | 2180 | // Set iteration position to the point just following the newly inserted text. |
michael@0 | 2181 | int32_t lengthDelta = newLength - oldLength; |
michael@0 | 2182 | ut->chunkOffset = limit32 + lengthDelta; |
michael@0 | 2183 | |
michael@0 | 2184 | return lengthDelta; |
michael@0 | 2185 | } |
michael@0 | 2186 | |
michael@0 | 2187 | static void U_CALLCONV |
michael@0 | 2188 | unistrTextCopy(UText *ut, |
michael@0 | 2189 | int64_t start, int64_t limit, |
michael@0 | 2190 | int64_t destIndex, |
michael@0 | 2191 | UBool move, |
michael@0 | 2192 | UErrorCode *pErrorCode) { |
michael@0 | 2193 | UnicodeString *us=(UnicodeString *)ut->context; |
michael@0 | 2194 | int32_t length=us->length(); |
michael@0 | 2195 | |
michael@0 | 2196 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 2197 | return; |
michael@0 | 2198 | } |
michael@0 | 2199 | int32_t start32 = pinIndex(start, length); |
michael@0 | 2200 | int32_t limit32 = pinIndex(limit, length); |
michael@0 | 2201 | int32_t destIndex32 = pinIndex(destIndex, length); |
michael@0 | 2202 | |
michael@0 | 2203 | if( start32>limit32 || (start32<destIndex32 && destIndex32<limit32)) { |
michael@0 | 2204 | *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 2205 | return; |
michael@0 | 2206 | } |
michael@0 | 2207 | |
michael@0 | 2208 | if(move) { |
michael@0 | 2209 | // move: copy to destIndex, then replace original with nothing |
michael@0 | 2210 | int32_t segLength=limit32-start32; |
michael@0 | 2211 | us->copy(start32, limit32, destIndex32); |
michael@0 | 2212 | if(destIndex32<start32) { |
michael@0 | 2213 | start32+=segLength; |
michael@0 | 2214 | } |
michael@0 | 2215 | us->replace(start32, segLength, NULL, 0); |
michael@0 | 2216 | } else { |
michael@0 | 2217 | // copy |
michael@0 | 2218 | us->copy(start32, limit32, destIndex32); |
michael@0 | 2219 | } |
michael@0 | 2220 | |
michael@0 | 2221 | // update chunk description, set iteration position. |
michael@0 | 2222 | ut->chunkContents = us->getBuffer(); |
michael@0 | 2223 | if (move==FALSE) { |
michael@0 | 2224 | // copy operation, string length grows |
michael@0 | 2225 | ut->chunkLength += limit32-start32; |
michael@0 | 2226 | ut->chunkNativeLimit = ut->chunkLength; |
michael@0 | 2227 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 2228 | } |
michael@0 | 2229 | |
michael@0 | 2230 | // Iteration position to end of the newly inserted text. |
michael@0 | 2231 | ut->chunkOffset = destIndex32+limit32-start32; |
michael@0 | 2232 | if (move && destIndex32>start32) { |
michael@0 | 2233 | ut->chunkOffset = destIndex32; |
michael@0 | 2234 | } |
michael@0 | 2235 | |
michael@0 | 2236 | } |
michael@0 | 2237 | |
// Provider function table for UnicodeString based UTexts.  It is installed by
// utext_openConstUnicodeString(), so read-only and writable UTexts share it;
// whether writing is permitted is controlled by the provider property flags.
// The initializers are positional and must match struct UTextFuncs.
static const struct UTextFuncs unistrFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,             // Reserved alignment padding
    unistrTextClone,
    unistrTextLength,
    unistrTextAccess,
    unistrTextExtract,
    unistrTextReplace,
    unistrTextCopy,
    NULL,                // MapOffsetToNative,  not supplied by this provider
    NULL,                // MapIndexToUTF16,    not supplied by this provider
    unistrTextClose,
    NULL,                // spare 1
    NULL,                // spare 2
    NULL                 // spare 3
};
michael@0 | 2255 | |
michael@0 | 2256 | |
michael@0 | 2257 | |
michael@0 | 2258 | U_CDECL_END |
michael@0 | 2259 | |
michael@0 | 2260 | |
michael@0 | 2261 | U_CAPI UText * U_EXPORT2 |
michael@0 | 2262 | utext_openUnicodeString(UText *ut, UnicodeString *s, UErrorCode *status) { |
michael@0 | 2263 | ut = utext_openConstUnicodeString(ut, s, status); |
michael@0 | 2264 | if (U_SUCCESS(*status)) { |
michael@0 | 2265 | ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_WRITABLE); |
michael@0 | 2266 | } |
michael@0 | 2267 | return ut; |
michael@0 | 2268 | } |
michael@0 | 2269 | |
michael@0 | 2270 | |
michael@0 | 2271 | |
michael@0 | 2272 | U_CAPI UText * U_EXPORT2 |
michael@0 | 2273 | utext_openConstUnicodeString(UText *ut, const UnicodeString *s, UErrorCode *status) { |
michael@0 | 2274 | if (U_SUCCESS(*status) && s->isBogus()) { |
michael@0 | 2275 | // The UnicodeString is bogus, but we still need to detach the UText |
michael@0 | 2276 | // from whatever it was hooked to before, if anything. |
michael@0 | 2277 | utext_openUChars(ut, NULL, 0, status); |
michael@0 | 2278 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2279 | return ut; |
michael@0 | 2280 | } |
michael@0 | 2281 | ut = utext_setup(ut, 0, status); |
michael@0 | 2282 | // note: use the standard (writable) function table for UnicodeString. |
michael@0 | 2283 | // The flag settings disable writing, so having the functions in |
michael@0 | 2284 | // the table is harmless. |
michael@0 | 2285 | if (U_SUCCESS(*status)) { |
michael@0 | 2286 | ut->pFuncs = &unistrFuncs; |
michael@0 | 2287 | ut->context = s; |
michael@0 | 2288 | ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); |
michael@0 | 2289 | ut->chunkContents = s->getBuffer(); |
michael@0 | 2290 | ut->chunkLength = s->length(); |
michael@0 | 2291 | ut->chunkNativeStart = 0; |
michael@0 | 2292 | ut->chunkNativeLimit = ut->chunkLength; |
michael@0 | 2293 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 2294 | } |
michael@0 | 2295 | return ut; |
michael@0 | 2296 | } |
michael@0 | 2297 | |
//------------------------------------------------------------------------------
//
//     UText implementation for const UChar * strings
//
//         Use of UText data members:
//            context    pointer to the UChar string
//            a          length.  -1 if not yet known.
//
//         TODO:  support 64 bit lengths.
//
//------------------------------------------------------------------------------
michael@0 | 2309 | |
michael@0 | 2310 | U_CDECL_BEGIN |
michael@0 | 2311 | |
michael@0 | 2312 | |
michael@0 | 2313 | static UText * U_CALLCONV |
michael@0 | 2314 | ucstrTextClone(UText *dest, const UText * src, UBool deep, UErrorCode * status) { |
michael@0 | 2315 | // First do a generic shallow clone. |
michael@0 | 2316 | dest = shallowTextClone(dest, src, status); |
michael@0 | 2317 | |
michael@0 | 2318 | // For deep clones, make a copy of the string. |
michael@0 | 2319 | // The copied storage is owned by the newly created clone. |
michael@0 | 2320 | // A non-NULL pointer in UText.p is the signal to the close() function to delete |
michael@0 | 2321 | // it. |
michael@0 | 2322 | // |
michael@0 | 2323 | if (deep && U_SUCCESS(*status)) { |
michael@0 | 2324 | U_ASSERT(utext_nativeLength(dest) < INT32_MAX); |
michael@0 | 2325 | int32_t len = (int32_t)utext_nativeLength(dest); |
michael@0 | 2326 | |
michael@0 | 2327 | // The cloned string IS going to be NUL terminated, whether or not the original was. |
michael@0 | 2328 | const UChar *srcStr = (const UChar *)src->context; |
michael@0 | 2329 | UChar *copyStr = (UChar *)uprv_malloc((len+1) * sizeof(UChar)); |
michael@0 | 2330 | if (copyStr == NULL) { |
michael@0 | 2331 | *status = U_MEMORY_ALLOCATION_ERROR; |
michael@0 | 2332 | } else { |
michael@0 | 2333 | int64_t i; |
michael@0 | 2334 | for (i=0; i<len; i++) { |
michael@0 | 2335 | copyStr[i] = srcStr[i]; |
michael@0 | 2336 | } |
michael@0 | 2337 | copyStr[len] = 0; |
michael@0 | 2338 | dest->context = copyStr; |
michael@0 | 2339 | dest->providerProperties |= I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT); |
michael@0 | 2340 | } |
michael@0 | 2341 | } |
michael@0 | 2342 | return dest; |
michael@0 | 2343 | } |
michael@0 | 2344 | |
michael@0 | 2345 | |
michael@0 | 2346 | static void U_CALLCONV |
michael@0 | 2347 | ucstrTextClose(UText *ut) { |
michael@0 | 2348 | // Most of the work of close is done by the generic UText framework close. |
michael@0 | 2349 | // All that needs to be done here is delete the string if the UText |
michael@0 | 2350 | // owns it. This occurs if the UText was created by cloning. |
michael@0 | 2351 | if (ut->providerProperties & I32_FLAG(UTEXT_PROVIDER_OWNS_TEXT)) { |
michael@0 | 2352 | UChar *s = (UChar *)ut->context; |
michael@0 | 2353 | uprv_free(s); |
michael@0 | 2354 | ut->context = NULL; |
michael@0 | 2355 | } |
michael@0 | 2356 | } |
michael@0 | 2357 | |
michael@0 | 2358 | |
michael@0 | 2359 | |
michael@0 | 2360 | static int64_t U_CALLCONV |
michael@0 | 2361 | ucstrTextLength(UText *ut) { |
michael@0 | 2362 | if (ut->a < 0) { |
michael@0 | 2363 | // null terminated, we don't yet know the length. Scan for it. |
michael@0 | 2364 | // Access is not convenient for doing this |
michael@0 | 2365 | // because the current interation postion can't be changed. |
michael@0 | 2366 | const UChar *str = (const UChar *)ut->context; |
michael@0 | 2367 | for (;;) { |
michael@0 | 2368 | if (str[ut->chunkNativeLimit] == 0) { |
michael@0 | 2369 | break; |
michael@0 | 2370 | } |
michael@0 | 2371 | ut->chunkNativeLimit++; |
michael@0 | 2372 | } |
michael@0 | 2373 | ut->a = ut->chunkNativeLimit; |
michael@0 | 2374 | ut->chunkLength = (int32_t)ut->chunkNativeLimit; |
michael@0 | 2375 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 2376 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 2377 | } |
michael@0 | 2378 | return ut->a; |
michael@0 | 2379 | } |
michael@0 | 2380 | |
michael@0 | 2381 | |
michael@0 | 2382 | static UBool U_CALLCONV |
michael@0 | 2383 | ucstrTextAccess(UText *ut, int64_t index, UBool forward) { |
michael@0 | 2384 | const UChar *str = (const UChar *)ut->context; |
michael@0 | 2385 | |
michael@0 | 2386 | // pin the requested index to the bounds of the string, |
michael@0 | 2387 | // and set current iteration position. |
michael@0 | 2388 | if (index<0) { |
michael@0 | 2389 | index = 0; |
michael@0 | 2390 | } else if (index < ut->chunkNativeLimit) { |
michael@0 | 2391 | // The request data is within the chunk as it is known so far. |
michael@0 | 2392 | // Put index on a code point boundary. |
michael@0 | 2393 | U16_SET_CP_START(str, 0, index); |
michael@0 | 2394 | } else if (ut->a >= 0) { |
michael@0 | 2395 | // We know the length of this string, and the user is requesting something |
michael@0 | 2396 | // at or beyond the length. Pin the requested index to the length. |
michael@0 | 2397 | index = ut->a; |
michael@0 | 2398 | } else { |
michael@0 | 2399 | // Null terminated string, length not yet known, and the requested index |
michael@0 | 2400 | // is beyond where we have scanned so far. |
michael@0 | 2401 | // Scan to 32 UChars beyond the requested index. The strategy here is |
michael@0 | 2402 | // to avoid fully scanning a long string when the caller only wants to |
michael@0 | 2403 | // see a few characters at its beginning. |
michael@0 | 2404 | int32_t scanLimit = (int32_t)index + 32; |
michael@0 | 2405 | if ((index + 32)>INT32_MAX || (index + 32)<0 ) { // note: int64 expression |
michael@0 | 2406 | scanLimit = INT32_MAX; |
michael@0 | 2407 | } |
michael@0 | 2408 | |
michael@0 | 2409 | int32_t chunkLimit = (int32_t)ut->chunkNativeLimit; |
michael@0 | 2410 | for (; chunkLimit<scanLimit; chunkLimit++) { |
michael@0 | 2411 | if (str[chunkLimit] == 0) { |
michael@0 | 2412 | // We found the end of the string. Remember it, pin the requested index to it, |
michael@0 | 2413 | // and bail out of here. |
michael@0 | 2414 | ut->a = chunkLimit; |
michael@0 | 2415 | ut->chunkLength = chunkLimit; |
michael@0 | 2416 | ut->nativeIndexingLimit = chunkLimit; |
michael@0 | 2417 | if (index >= chunkLimit) { |
michael@0 | 2418 | index = chunkLimit; |
michael@0 | 2419 | } else { |
michael@0 | 2420 | U16_SET_CP_START(str, 0, index); |
michael@0 | 2421 | } |
michael@0 | 2422 | |
michael@0 | 2423 | ut->chunkNativeLimit = chunkLimit; |
michael@0 | 2424 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 2425 | goto breakout; |
michael@0 | 2426 | } |
michael@0 | 2427 | } |
michael@0 | 2428 | // We scanned through the next batch of UChars without finding the end. |
michael@0 | 2429 | U16_SET_CP_START(str, 0, index); |
michael@0 | 2430 | if (chunkLimit == INT32_MAX) { |
michael@0 | 2431 | // Scanned to the limit of a 32 bit length. |
michael@0 | 2432 | // Forceably trim the overlength string back so length fits in int32 |
michael@0 | 2433 | // TODO: add support for 64 bit strings. |
michael@0 | 2434 | ut->a = chunkLimit; |
michael@0 | 2435 | ut->chunkLength = chunkLimit; |
michael@0 | 2436 | ut->nativeIndexingLimit = chunkLimit; |
michael@0 | 2437 | if (index > chunkLimit) { |
michael@0 | 2438 | index = chunkLimit; |
michael@0 | 2439 | } |
michael@0 | 2440 | ut->chunkNativeLimit = chunkLimit; |
michael@0 | 2441 | ut->providerProperties &= ~I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 2442 | } else { |
michael@0 | 2443 | // The endpoint of a chunk must not be left in the middle of a surrogate pair. |
michael@0 | 2444 | // If the current end is on a lead surrogate, back the end up by one. |
michael@0 | 2445 | // It doesn't matter if the end char happens to be an unpaired surrogate, |
michael@0 | 2446 | // and it's simpler not to worry about it. |
michael@0 | 2447 | if (U16_IS_LEAD(str[chunkLimit-1])) { |
michael@0 | 2448 | --chunkLimit; |
michael@0 | 2449 | } |
michael@0 | 2450 | // Null-terminated chunk with end still unknown. |
michael@0 | 2451 | // Update the chunk length to reflect what has been scanned thus far. |
michael@0 | 2452 | // That the full length is still unknown is (still) flagged by |
michael@0 | 2453 | // ut->a being < 0. |
michael@0 | 2454 | ut->chunkNativeLimit = chunkLimit; |
michael@0 | 2455 | ut->nativeIndexingLimit = chunkLimit; |
michael@0 | 2456 | ut->chunkLength = chunkLimit; |
michael@0 | 2457 | } |
michael@0 | 2458 | |
michael@0 | 2459 | } |
michael@0 | 2460 | breakout: |
michael@0 | 2461 | U_ASSERT(index<=INT32_MAX); |
michael@0 | 2462 | ut->chunkOffset = (int32_t)index; |
michael@0 | 2463 | |
michael@0 | 2464 | // Check whether request is at the start or end |
michael@0 | 2465 | UBool retVal = (forward && index<ut->chunkNativeLimit) || (!forward && index>0); |
michael@0 | 2466 | return retVal; |
michael@0 | 2467 | } |
michael@0 | 2468 | |
michael@0 | 2469 | |
michael@0 | 2470 | |
michael@0 | 2471 | static int32_t U_CALLCONV |
michael@0 | 2472 | ucstrTextExtract(UText *ut, |
michael@0 | 2473 | int64_t start, int64_t limit, |
michael@0 | 2474 | UChar *dest, int32_t destCapacity, |
michael@0 | 2475 | UErrorCode *pErrorCode) |
michael@0 | 2476 | { |
michael@0 | 2477 | if(U_FAILURE(*pErrorCode)) { |
michael@0 | 2478 | return 0; |
michael@0 | 2479 | } |
michael@0 | 2480 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { |
michael@0 | 2481 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2482 | return 0; |
michael@0 | 2483 | } |
michael@0 | 2484 | |
michael@0 | 2485 | //const UChar *s=(const UChar *)ut->context; |
michael@0 | 2486 | int32_t si, di; |
michael@0 | 2487 | |
michael@0 | 2488 | int32_t start32; |
michael@0 | 2489 | int32_t limit32; |
michael@0 | 2490 | |
michael@0 | 2491 | // Access the start. Does two things we need: |
michael@0 | 2492 | // Pins 'start' to the length of the string, if it came in out-of-bounds. |
michael@0 | 2493 | // Snaps 'start' to the beginning of a code point. |
michael@0 | 2494 | ucstrTextAccess(ut, start, TRUE); |
michael@0 | 2495 | const UChar *s=ut->chunkContents; |
michael@0 | 2496 | start32 = ut->chunkOffset; |
michael@0 | 2497 | |
michael@0 | 2498 | int32_t strLength=(int32_t)ut->a; |
michael@0 | 2499 | if (strLength >= 0) { |
michael@0 | 2500 | limit32 = pinIndex(limit, strLength); |
michael@0 | 2501 | } else { |
michael@0 | 2502 | limit32 = pinIndex(limit, INT32_MAX); |
michael@0 | 2503 | } |
michael@0 | 2504 | di = 0; |
michael@0 | 2505 | for (si=start32; si<limit32; si++) { |
michael@0 | 2506 | if (strLength<0 && s[si]==0) { |
michael@0 | 2507 | // Just hit the end of a null-terminated string. |
michael@0 | 2508 | ut->a = si; // set string length for this UText |
michael@0 | 2509 | ut->chunkNativeLimit = si; |
michael@0 | 2510 | ut->chunkLength = si; |
michael@0 | 2511 | ut->nativeIndexingLimit = si; |
michael@0 | 2512 | strLength = si; |
michael@0 | 2513 | break; |
michael@0 | 2514 | } |
michael@0 | 2515 | U_ASSERT(di>=0); /* to ensure di never exceeds INT32_MAX, which must not happen logically */ |
michael@0 | 2516 | if (di<destCapacity) { |
michael@0 | 2517 | // only store if there is space. |
michael@0 | 2518 | dest[di] = s[si]; |
michael@0 | 2519 | } else { |
michael@0 | 2520 | if (strLength>=0) { |
michael@0 | 2521 | // We have filled the destination buffer, and the string length is known. |
michael@0 | 2522 | // Cut the loop short. There is no need to scan string termination. |
michael@0 | 2523 | di = limit32 - start32; |
michael@0 | 2524 | si = limit32; |
michael@0 | 2525 | break; |
michael@0 | 2526 | } |
michael@0 | 2527 | } |
michael@0 | 2528 | di++; |
michael@0 | 2529 | } |
michael@0 | 2530 | |
michael@0 | 2531 | // If the limit index points to a lead surrogate of a pair, |
michael@0 | 2532 | // add the corresponding trail surrogate to the destination. |
michael@0 | 2533 | if (si>0 && U16_IS_LEAD(s[si-1]) && |
michael@0 | 2534 | ((si<strLength || strLength<0) && U16_IS_TRAIL(s[si]))) |
michael@0 | 2535 | { |
michael@0 | 2536 | if (di<destCapacity) { |
michael@0 | 2537 | // store only if there is space in the output buffer. |
michael@0 | 2538 | dest[di++] = s[si++]; |
michael@0 | 2539 | } |
michael@0 | 2540 | } |
michael@0 | 2541 | |
michael@0 | 2542 | // Put iteration position at the point just following the extracted text |
michael@0 | 2543 | ut->chunkOffset = uprv_min(strLength, start32 + destCapacity); |
michael@0 | 2544 | |
michael@0 | 2545 | // Add a terminating NUL if space in the buffer permits, |
michael@0 | 2546 | // and set the error status as required. |
michael@0 | 2547 | u_terminateUChars(dest, destCapacity, di, pErrorCode); |
michael@0 | 2548 | return di; |
michael@0 | 2549 | } |
michael@0 | 2550 | |
// Provider function table installed by utext_openUChars() for UChar string
// based UTexts.  No replace or copy functions are supplied.  The initializers
// are positional and must match struct UTextFuncs.
static const struct UTextFuncs ucstrFuncs =
{
    sizeof(UTextFuncs),
    0, 0, 0,           // Reserved alignment padding
    ucstrTextClone,
    ucstrTextLength,
    ucstrTextAccess,
    ucstrTextExtract,
    NULL,              // Replace
    NULL,              // Copy
    NULL,              // MapOffsetToNative,
    NULL,              // MapIndexToUTF16,
    ucstrTextClose,
    NULL,              // spare 1
    NULL,              // spare 2
    NULL,              // spare 3
};
michael@0 | 2568 | |
michael@0 | 2569 | U_CDECL_END |
michael@0 | 2570 | |
michael@0 | 2571 | static const UChar gEmptyUString[] = {0}; |
michael@0 | 2572 | |
michael@0 | 2573 | U_CAPI UText * U_EXPORT2 |
michael@0 | 2574 | utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status) { |
michael@0 | 2575 | if (U_FAILURE(*status)) { |
michael@0 | 2576 | return NULL; |
michael@0 | 2577 | } |
michael@0 | 2578 | if(s==NULL && length==0) { |
michael@0 | 2579 | s = gEmptyUString; |
michael@0 | 2580 | } |
michael@0 | 2581 | if (s==NULL || length < -1 || length>INT32_MAX) { |
michael@0 | 2582 | *status = U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2583 | return NULL; |
michael@0 | 2584 | } |
michael@0 | 2585 | ut = utext_setup(ut, 0, status); |
michael@0 | 2586 | if (U_SUCCESS(*status)) { |
michael@0 | 2587 | ut->pFuncs = &ucstrFuncs; |
michael@0 | 2588 | ut->context = s; |
michael@0 | 2589 | ut->providerProperties = I32_FLAG(UTEXT_PROVIDER_STABLE_CHUNKS); |
michael@0 | 2590 | if (length==-1) { |
michael@0 | 2591 | ut->providerProperties |= I32_FLAG(UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE); |
michael@0 | 2592 | } |
michael@0 | 2593 | ut->a = length; |
michael@0 | 2594 | ut->chunkContents = s; |
michael@0 | 2595 | ut->chunkNativeStart = 0; |
michael@0 | 2596 | ut->chunkNativeLimit = length>=0? length : 0; |
michael@0 | 2597 | ut->chunkLength = (int32_t)ut->chunkNativeLimit; |
michael@0 | 2598 | ut->chunkOffset = 0; |
michael@0 | 2599 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 2600 | } |
michael@0 | 2601 | return ut; |
michael@0 | 2602 | } |
michael@0 | 2603 | |
michael@0 | 2604 | |
michael@0 | 2605 | //------------------------------------------------------------------------------ |
michael@0 | 2606 | // |
michael@0 | 2607 | // UText implementation for text from ICU CharacterIterators |
michael@0 | 2608 | // |
michael@0 | 2609 | // Use of UText data members: |
michael@0 | 2610 | // context pointer to the CharacterIterator |
michael@0 | 2611 | // a length of the full text. |
michael@0 | 2612 | // p pointer to buffer 1 |
michael@0 | 2613 | // b start index of local buffer 1 contents |
michael@0 | 2614 | // q pointer to buffer 2 |
michael@0 | 2615 | // c start index of local buffer 2 contents |
michael@0 | 2616 | // r pointer to the character iterator if the UText owns it. |
michael@0 | 2617 | // Null otherwise. |
michael@0 | 2618 | // |
michael@0 | 2619 | //------------------------------------------------------------------------------ |
michael@0 | 2620 | #define CIBufSize 16 |
michael@0 | 2621 | |
michael@0 | 2622 | U_CDECL_BEGIN |
michael@0 | 2623 | static void U_CALLCONV |
michael@0 | 2624 | charIterTextClose(UText *ut) { |
michael@0 | 2625 | // Most of the work of close is done by the generic UText framework close. |
michael@0 | 2626 | // All that needs to be done here is delete the CharacterIterator if the UText |
michael@0 | 2627 | // owns it. This occurs if the UText was created by cloning. |
michael@0 | 2628 | CharacterIterator *ci = (CharacterIterator *)ut->r; |
michael@0 | 2629 | delete ci; |
michael@0 | 2630 | ut->r = NULL; |
michael@0 | 2631 | } |
michael@0 | 2632 | |
michael@0 | 2633 | static int64_t U_CALLCONV |
michael@0 | 2634 | charIterTextLength(UText *ut) { |
michael@0 | 2635 | return (int32_t)ut->a; |
michael@0 | 2636 | } |
michael@0 | 2637 | |
michael@0 | 2638 | static UBool U_CALLCONV |
michael@0 | 2639 | charIterTextAccess(UText *ut, int64_t index, UBool forward) { |
michael@0 | 2640 | CharacterIterator *ci = (CharacterIterator *)ut->context; |
michael@0 | 2641 | |
michael@0 | 2642 | int32_t clippedIndex = (int32_t)index; |
michael@0 | 2643 | if (clippedIndex<0) { |
michael@0 | 2644 | clippedIndex=0; |
michael@0 | 2645 | } else if (clippedIndex>=ut->a) { |
michael@0 | 2646 | clippedIndex=(int32_t)ut->a; |
michael@0 | 2647 | } |
michael@0 | 2648 | int32_t neededIndex = clippedIndex; |
michael@0 | 2649 | if (!forward && neededIndex>0) { |
michael@0 | 2650 | // reverse iteration, want the position just before what was asked for. |
michael@0 | 2651 | neededIndex--; |
michael@0 | 2652 | } else if (forward && neededIndex==ut->a && neededIndex>0) { |
michael@0 | 2653 | // Forward iteration, don't ask for something past the end of the text. |
michael@0 | 2654 | neededIndex--; |
michael@0 | 2655 | } |
michael@0 | 2656 | |
michael@0 | 2657 | // Find the native index of the start of the buffer containing what we want. |
michael@0 | 2658 | neededIndex -= neededIndex % CIBufSize; |
michael@0 | 2659 | |
michael@0 | 2660 | UChar *buf = NULL; |
michael@0 | 2661 | UBool needChunkSetup = TRUE; |
michael@0 | 2662 | int i; |
michael@0 | 2663 | if (ut->chunkNativeStart == neededIndex) { |
michael@0 | 2664 | // The buffer we want is already the current chunk. |
michael@0 | 2665 | needChunkSetup = FALSE; |
michael@0 | 2666 | } else if (ut->b == neededIndex) { |
michael@0 | 2667 | // The first buffer (buffer p) has what we need. |
michael@0 | 2668 | buf = (UChar *)ut->p; |
michael@0 | 2669 | } else if (ut->c == neededIndex) { |
michael@0 | 2670 | // The second buffer (buffer q) has what we need. |
michael@0 | 2671 | buf = (UChar *)ut->q; |
michael@0 | 2672 | } else { |
michael@0 | 2673 | // Neither buffer already has what we need. |
michael@0 | 2674 | // Load new data from the character iterator. |
michael@0 | 2675 | // Use the buf that is not the current buffer. |
michael@0 | 2676 | buf = (UChar *)ut->p; |
michael@0 | 2677 | if (ut->p == ut->chunkContents) { |
michael@0 | 2678 | buf = (UChar *)ut->q; |
michael@0 | 2679 | } |
michael@0 | 2680 | ci->setIndex(neededIndex); |
michael@0 | 2681 | for (i=0; i<CIBufSize; i++) { |
michael@0 | 2682 | buf[i] = ci->nextPostInc(); |
michael@0 | 2683 | if (i+neededIndex > ut->a) { |
michael@0 | 2684 | break; |
michael@0 | 2685 | } |
michael@0 | 2686 | } |
michael@0 | 2687 | } |
michael@0 | 2688 | |
michael@0 | 2689 | // We have a buffer with the data we need. |
michael@0 | 2690 | // Set it up as the current chunk, if it wasn't already. |
michael@0 | 2691 | if (needChunkSetup) { |
michael@0 | 2692 | ut->chunkContents = buf; |
michael@0 | 2693 | ut->chunkLength = CIBufSize; |
michael@0 | 2694 | ut->chunkNativeStart = neededIndex; |
michael@0 | 2695 | ut->chunkNativeLimit = neededIndex + CIBufSize; |
michael@0 | 2696 | if (ut->chunkNativeLimit > ut->a) { |
michael@0 | 2697 | ut->chunkNativeLimit = ut->a; |
michael@0 | 2698 | ut->chunkLength = (int32_t)(ut->chunkNativeLimit)-(int32_t)(ut->chunkNativeStart); |
michael@0 | 2699 | } |
michael@0 | 2700 | ut->nativeIndexingLimit = ut->chunkLength; |
michael@0 | 2701 | U_ASSERT(ut->chunkOffset>=0 && ut->chunkOffset<=CIBufSize); |
michael@0 | 2702 | } |
michael@0 | 2703 | ut->chunkOffset = clippedIndex - (int32_t)ut->chunkNativeStart; |
michael@0 | 2704 | UBool success = (forward? ut->chunkOffset<ut->chunkLength : ut->chunkOffset>0); |
michael@0 | 2705 | return success; |
michael@0 | 2706 | } |
michael@0 | 2707 | |
michael@0 | 2708 | static UText * U_CALLCONV |
michael@0 | 2709 | charIterTextClone(UText *dest, const UText *src, UBool deep, UErrorCode * status) { |
michael@0 | 2710 | if (U_FAILURE(*status)) { |
michael@0 | 2711 | return NULL; |
michael@0 | 2712 | } |
michael@0 | 2713 | |
michael@0 | 2714 | if (deep) { |
michael@0 | 2715 | // There is no CharacterIterator API for cloning the underlying text storage. |
michael@0 | 2716 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 2717 | return NULL; |
michael@0 | 2718 | } else { |
michael@0 | 2719 | CharacterIterator *srcCI =(CharacterIterator *)src->context; |
michael@0 | 2720 | srcCI = srcCI->clone(); |
michael@0 | 2721 | dest = utext_openCharacterIterator(dest, srcCI, status); |
michael@0 | 2722 | // cast off const on getNativeIndex. |
michael@0 | 2723 | // For CharacterIterator based UTexts, this is safe, the operation is const. |
michael@0 | 2724 | int64_t ix = utext_getNativeIndex((UText *)src); |
michael@0 | 2725 | utext_setNativeIndex(dest, ix); |
michael@0 | 2726 | dest->r = srcCI; // flags that this UText owns the CharacterIterator |
michael@0 | 2727 | } |
michael@0 | 2728 | return dest; |
michael@0 | 2729 | } |
michael@0 | 2730 | |
michael@0 | 2731 | static int32_t U_CALLCONV |
michael@0 | 2732 | charIterTextExtract(UText *ut, |
michael@0 | 2733 | int64_t start, int64_t limit, |
michael@0 | 2734 | UChar *dest, int32_t destCapacity, |
michael@0 | 2735 | UErrorCode *status) |
michael@0 | 2736 | { |
michael@0 | 2737 | if(U_FAILURE(*status)) { |
michael@0 | 2738 | return 0; |
michael@0 | 2739 | } |
michael@0 | 2740 | if(destCapacity<0 || (dest==NULL && destCapacity>0) || start>limit) { |
michael@0 | 2741 | *status=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 2742 | return 0; |
michael@0 | 2743 | } |
michael@0 | 2744 | int32_t length = (int32_t)ut->a; |
michael@0 | 2745 | int32_t start32 = pinIndex(start, length); |
michael@0 | 2746 | int32_t limit32 = pinIndex(limit, length); |
michael@0 | 2747 | int32_t desti = 0; |
michael@0 | 2748 | int32_t srci; |
michael@0 | 2749 | int32_t copyLimit; |
michael@0 | 2750 | |
michael@0 | 2751 | CharacterIterator *ci = (CharacterIterator *)ut->context; |
michael@0 | 2752 | ci->setIndex32(start32); // Moves ix to lead of surrogate pair, if needed. |
michael@0 | 2753 | srci = ci->getIndex(); |
michael@0 | 2754 | copyLimit = srci; |
michael@0 | 2755 | while (srci<limit32) { |
michael@0 | 2756 | UChar32 c = ci->next32PostInc(); |
michael@0 | 2757 | int32_t len = U16_LENGTH(c); |
michael@0 | 2758 | U_ASSERT(desti+len>0); /* to ensure desti+len never exceeds MAX_INT32, which must not happen logically */ |
michael@0 | 2759 | if (desti+len <= destCapacity) { |
michael@0 | 2760 | U16_APPEND_UNSAFE(dest, desti, c); |
michael@0 | 2761 | copyLimit = srci+len; |
michael@0 | 2762 | } else { |
michael@0 | 2763 | desti += len; |
michael@0 | 2764 | *status = U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 2765 | } |
michael@0 | 2766 | srci += len; |
michael@0 | 2767 | } |
michael@0 | 2768 | |
michael@0 | 2769 | charIterTextAccess(ut, copyLimit, TRUE); |
michael@0 | 2770 | |
michael@0 | 2771 | u_terminateUChars(dest, destCapacity, desti, status); |
michael@0 | 2772 | return desti; |
michael@0 | 2773 | } |
michael@0 | 2774 | |
michael@0 | 2775 | static const struct UTextFuncs charIterFuncs = |
michael@0 | 2776 | { |
michael@0 | 2777 | sizeof(UTextFuncs), |
michael@0 | 2778 | 0, 0, 0, // Reserved alignment padding |
michael@0 | 2779 | charIterTextClone, |
michael@0 | 2780 | charIterTextLength, |
michael@0 | 2781 | charIterTextAccess, |
michael@0 | 2782 | charIterTextExtract, |
michael@0 | 2783 | NULL, // Replace |
michael@0 | 2784 | NULL, // Copy |
michael@0 | 2785 | NULL, // MapOffsetToNative, |
michael@0 | 2786 | NULL, // MapIndexToUTF16, |
michael@0 | 2787 | charIterTextClose, |
michael@0 | 2788 | NULL, // spare 1 |
michael@0 | 2789 | NULL, // spare 2 |
michael@0 | 2790 | NULL // spare 3 |
michael@0 | 2791 | }; |
michael@0 | 2792 | U_CDECL_END |
michael@0 | 2793 | |
michael@0 | 2794 | |
michael@0 | 2795 | U_CAPI UText * U_EXPORT2 |
michael@0 | 2796 | utext_openCharacterIterator(UText *ut, CharacterIterator *ci, UErrorCode *status) { |
michael@0 | 2797 | if (U_FAILURE(*status)) { |
michael@0 | 2798 | return NULL; |
michael@0 | 2799 | } |
michael@0 | 2800 | |
michael@0 | 2801 | if (ci->startIndex() > 0) { |
michael@0 | 2802 | // No support for CharacterIterators that do not start indexing from zero. |
michael@0 | 2803 | *status = U_UNSUPPORTED_ERROR; |
michael@0 | 2804 | return NULL; |
michael@0 | 2805 | } |
michael@0 | 2806 | |
michael@0 | 2807 | // Extra space in UText for 2 buffers of CIBufSize UChars each. |
michael@0 | 2808 | int32_t extraSpace = 2 * CIBufSize * sizeof(UChar); |
michael@0 | 2809 | ut = utext_setup(ut, extraSpace, status); |
michael@0 | 2810 | if (U_SUCCESS(*status)) { |
michael@0 | 2811 | ut->pFuncs = &charIterFuncs; |
michael@0 | 2812 | ut->context = ci; |
michael@0 | 2813 | ut->providerProperties = 0; |
michael@0 | 2814 | ut->a = ci->endIndex(); // Length of text |
michael@0 | 2815 | ut->p = ut->pExtra; // First buffer |
michael@0 | 2816 | ut->b = -1; // Native index of first buffer contents |
michael@0 | 2817 | ut->q = (UChar*)ut->pExtra+CIBufSize; // Second buffer |
michael@0 | 2818 | ut->c = -1; // Native index of second buffer contents |
michael@0 | 2819 | |
michael@0 | 2820 | // Initialize current chunk contents to be empty. |
michael@0 | 2821 | // First access will fault something in. |
michael@0 | 2822 | // Note: The initial nativeStart and chunkOffset must sum to zero |
michael@0 | 2823 | // so that getNativeIndex() will correctly compute to zero |
michael@0 | 2824 | // if no call to Access() has ever been made. They can't be both |
michael@0 | 2825 | // zero without Access() thinking that the chunk is valid. |
michael@0 | 2826 | ut->chunkContents = (UChar *)ut->p; |
michael@0 | 2827 | ut->chunkNativeStart = -1; |
michael@0 | 2828 | ut->chunkOffset = 1; |
michael@0 | 2829 | ut->chunkNativeLimit = 0; |
michael@0 | 2830 | ut->chunkLength = 0; |
michael@0 | 2831 | ut->nativeIndexingLimit = ut->chunkOffset; // enables native indexing |
michael@0 | 2832 | } |
michael@0 | 2833 | return ut; |
michael@0 | 2834 | } |
michael@0 | 2835 | |
michael@0 | 2836 | |
michael@0 | 2837 |