1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/uiter.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1106 @@ 1.4 +/* 1.5 +******************************************************************************* 1.6 +* 1.7 +* Copyright (C) 2002-2012, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +******************************************************************************* 1.11 +* file name: uiter.cpp 1.12 +* encoding: US-ASCII 1.13 +* tab size: 8 (not used) 1.14 +* indentation:4 1.15 +* 1.16 +* created on: 2002jan18 1.17 +* created by: Markus W. Scherer 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 +#include "unicode/ustring.h" 1.22 +#include "unicode/chariter.h" 1.23 +#include "unicode/rep.h" 1.24 +#include "unicode/uiter.h" 1.25 +#include "unicode/utf.h" 1.26 +#include "unicode/utf8.h" 1.27 +#include "unicode/utf16.h" 1.28 +#include "cstring.h" 1.29 + 1.30 +U_NAMESPACE_USE 1.31 + 1.32 +#define IS_EVEN(n) (((n)&1)==0) 1.33 +#define IS_POINTER_EVEN(p) IS_EVEN((size_t)p) 1.34 + 1.35 +U_CDECL_BEGIN 1.36 + 1.37 +/* No-Op UCharIterator implementation for illegal input --------------------- */ 1.38 + 1.39 +static int32_t U_CALLCONV 1.40 +noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) { 1.41 + return 0; 1.42 +} 1.43 + 1.44 +static int32_t U_CALLCONV 1.45 +noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) { 1.46 + return 0; 1.47 +} 1.48 + 1.49 +static UBool U_CALLCONV 1.50 +noopHasNext(UCharIterator * /*iter*/) { 1.51 + return FALSE; 1.52 +} 1.53 + 1.54 +static UChar32 U_CALLCONV 1.55 +noopCurrent(UCharIterator * /*iter*/) { 1.56 + return U_SENTINEL; 1.57 +} 1.58 + 1.59 +static uint32_t U_CALLCONV 1.60 +noopGetState(const UCharIterator * /*iter*/) { 1.61 + return UITER_NO_STATE; 1.62 +} 1.63 + 1.64 +static void U_CALLCONV 1.65 +noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) { 1.66 + *pErrorCode=U_UNSUPPORTED_ERROR; 1.67 +} 1.68 + 1.69 +static const UCharIterator noopIterator={ 1.70 + 0, 0, 0, 0, 0, 0, 1.71 + noopGetIndex, 1.72 + noopMove, 1.73 + noopHasNext, 1.74 + noopHasNext, 1.75 + noopCurrent, 1.76 + noopCurrent, 1.77 + noopCurrent, 1.78 + NULL, 1.79 + noopGetState, 1.80 + noopSetState 1.81 +}; 1.82 + 1.83 +/* UCharIterator implementation for simple strings -------------------------- */ 1.84 + 1.85 +/* 1.86 + * This is an implementation of a code unit (UChar) iterator 1.87 + * for UChar * strings. 1.88 + * 1.89 + * The UCharIterator.context field holds a pointer to the string. 1.90 + */ 1.91 + 1.92 +static int32_t U_CALLCONV 1.93 +stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 1.94 + switch(origin) { 1.95 + case UITER_ZERO: 1.96 + return 0; 1.97 + case UITER_START: 1.98 + return iter->start; 1.99 + case UITER_CURRENT: 1.100 + return iter->index; 1.101 + case UITER_LIMIT: 1.102 + return iter->limit; 1.103 + case UITER_LENGTH: 1.104 + return iter->length; 1.105 + default: 1.106 + /* not a valid origin */ 1.107 + /* Should never get here! */ 1.108 + return -1; 1.109 + } 1.110 +} 1.111 + 1.112 +static int32_t U_CALLCONV 1.113 +stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 1.114 + int32_t pos; 1.115 + 1.116 + switch(origin) { 1.117 + case UITER_ZERO: 1.118 + pos=delta; 1.119 + break; 1.120 + case UITER_START: 1.121 + pos=iter->start+delta; 1.122 + break; 1.123 + case UITER_CURRENT: 1.124 + pos=iter->index+delta; 1.125 + break; 1.126 + case UITER_LIMIT: 1.127 + pos=iter->limit+delta; 1.128 + break; 1.129 + case UITER_LENGTH: 1.130 + pos=iter->length+delta; 1.131 + break; 1.132 + default: 1.133 + return -1; /* Error */ 1.134 + } 1.135 + 1.136 + if(pos<iter->start) { 1.137 + pos=iter->start; 1.138 + } else if(pos>iter->limit) { 1.139 + pos=iter->limit; 1.140 + } 1.141 + 1.142 + return iter->index=pos; 1.143 +} 1.144 + 1.145 +static UBool U_CALLCONV 1.146 +stringIteratorHasNext(UCharIterator *iter) { 1.147 + return iter->index<iter->limit; 1.148 +} 1.149 + 1.150 +static UBool U_CALLCONV 1.151 +stringIteratorHasPrevious(UCharIterator *iter) { 1.152 + return iter->index>iter->start; 1.153 +} 1.154 + 1.155 +static UChar32 U_CALLCONV 1.156 +stringIteratorCurrent(UCharIterator *iter) { 1.157 + if(iter->index<iter->limit) { 1.158 + return ((const UChar *)(iter->context))[iter->index]; 1.159 + } else { 1.160 + return U_SENTINEL; 1.161 + } 1.162 +} 1.163 + 1.164 +static UChar32 U_CALLCONV 1.165 +stringIteratorNext(UCharIterator *iter) { 1.166 + if(iter->index<iter->limit) { 1.167 + return ((const UChar *)(iter->context))[iter->index++]; 1.168 + } else { 1.169 + return U_SENTINEL; 1.170 + } 1.171 +} 1.172 + 1.173 +static UChar32 U_CALLCONV 1.174 +stringIteratorPrevious(UCharIterator *iter) { 1.175 + if(iter->index>iter->start) { 1.176 + return ((const UChar *)(iter->context))[--iter->index]; 1.177 + } else { 1.178 + return U_SENTINEL; 1.179 + } 1.180 +} 1.181 + 1.182 +static uint32_t U_CALLCONV 1.183 +stringIteratorGetState(const UCharIterator *iter) { 1.184 + return (uint32_t)iter->index; 1.185 +} 1.186 + 1.187 +static void U_CALLCONV 1.188 +stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1.189 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.190 + /* do nothing */ 1.191 + } else if(iter==NULL) { 1.192 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.193 + } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) { 1.194 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.195 + } else { 1.196 + iter->index=(int32_t)state; 1.197 + } 1.198 +} 1.199 + 1.200 +static const UCharIterator stringIterator={ 1.201 + 0, 0, 0, 0, 0, 0, 1.202 + stringIteratorGetIndex, 1.203 + stringIteratorMove, 1.204 + stringIteratorHasNext, 1.205 + stringIteratorHasPrevious, 1.206 + stringIteratorCurrent, 1.207 + stringIteratorNext, 1.208 + stringIteratorPrevious, 1.209 + NULL, 1.210 + stringIteratorGetState, 1.211 + stringIteratorSetState 1.212 +}; 1.213 + 1.214 +U_CAPI void U_EXPORT2 1.215 +uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) { 1.216 + if(iter!=0) { 1.217 + if(s!=0 && length>=-1) { 1.218 + *iter=stringIterator; 1.219 + iter->context=s; 1.220 + if(length>=0) { 1.221 + iter->length=length; 1.222 + } else { 1.223 + iter->length=u_strlen(s); 1.224 + } 1.225 + iter->limit=iter->length; 1.226 + } else { 1.227 + *iter=noopIterator; 1.228 + } 1.229 + } 1.230 +} 1.231 + 1.232 +/* UCharIterator implementation for UTF-16BE strings ------------------------ */ 1.233 + 1.234 +/* 1.235 + * This is an implementation of a code unit (UChar) iterator 1.236 + * for UTF-16BE strings, i.e., strings in byte-vectors where 1.237 + * each UChar is stored as a big-endian pair of bytes. 1.238 + * 1.239 + * The UCharIterator.context field holds a pointer to the string. 1.240 + * Everything works just like with a normal UChar iterator (uiter_setString), 1.241 + * except that UChars are assembled from byte pairs. 1.242 + */ 1.243 + 1.244 +/* internal helper function */ 1.245 +static inline UChar32 1.246 +utf16BEIteratorGet(UCharIterator *iter, int32_t index) { 1.247 + const uint8_t *p=(const uint8_t *)iter->context; 1.248 + return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1]; 1.249 +} 1.250 + 1.251 +static UChar32 U_CALLCONV 1.252 +utf16BEIteratorCurrent(UCharIterator *iter) { 1.253 + int32_t index; 1.254 + 1.255 + if((index=iter->index)<iter->limit) { 1.256 + return utf16BEIteratorGet(iter, index); 1.257 + } else { 1.258 + return U_SENTINEL; 1.259 + } 1.260 +} 1.261 + 1.262 +static UChar32 U_CALLCONV 1.263 +utf16BEIteratorNext(UCharIterator *iter) { 1.264 + int32_t index; 1.265 + 1.266 + if((index=iter->index)<iter->limit) { 1.267 + iter->index=index+1; 1.268 + return utf16BEIteratorGet(iter, index); 1.269 + } else { 1.270 + return U_SENTINEL; 1.271 + } 1.272 +} 1.273 + 1.274 +static UChar32 U_CALLCONV 1.275 +utf16BEIteratorPrevious(UCharIterator *iter) { 1.276 + int32_t index; 1.277 + 1.278 + if((index=iter->index)>iter->start) { 1.279 + iter->index=--index; 1.280 + return utf16BEIteratorGet(iter, index); 1.281 + } else { 1.282 + return U_SENTINEL; 1.283 + } 1.284 +} 1.285 + 1.286 +static const UCharIterator utf16BEIterator={ 1.287 + 0, 0, 0, 0, 0, 0, 1.288 + stringIteratorGetIndex, 1.289 + stringIteratorMove, 1.290 + stringIteratorHasNext, 1.291 + stringIteratorHasPrevious, 1.292 + utf16BEIteratorCurrent, 1.293 + utf16BEIteratorNext, 1.294 + utf16BEIteratorPrevious, 1.295 + NULL, 1.296 + stringIteratorGetState, 1.297 + stringIteratorSetState 1.298 +}; 1.299 + 1.300 +/* 1.301 + * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL, 1.302 + * i.e., before a pair of 0 bytes where the first 0 byte is at an even 1.303 + * offset from s. 1.304 + */ 1.305 +static int32_t 1.306 +utf16BE_strlen(const char *s) { 1.307 + if(IS_POINTER_EVEN(s)) { 1.308 + /* 1.309 + * even-aligned, call u_strlen(s) 1.310 + * we are probably on a little-endian machine, but searching for UChar NUL 1.311 + * does not care about endianness 1.312 + */ 1.313 + return u_strlen((const UChar *)s); 1.314 + } else { 1.315 + /* odd-aligned, search for pair of 0 bytes */ 1.316 + const char *p=s; 1.317 + 1.318 + while(!(*p==0 && p[1]==0)) { 1.319 + p+=2; 1.320 + } 1.321 + return (int32_t)((p-s)/2); 1.322 + } 1.323 +} 1.324 + 1.325 +U_CAPI void U_EXPORT2 1.326 +uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) { 1.327 + if(iter!=NULL) { 1.328 + /* allow only even-length strings (the input length counts bytes) */ 1.329 + if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) { 1.330 + /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */ 1.331 + length>>=1; 1.332 + 1.333 + if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) { 1.334 + /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */ 1.335 + uiter_setString(iter, (const UChar *)s, length); 1.336 + return; 1.337 + } 1.338 + 1.339 + *iter=utf16BEIterator; 1.340 + iter->context=s; 1.341 + if(length>=0) { 1.342 + iter->length=length; 1.343 + } else { 1.344 + iter->length=utf16BE_strlen(s); 1.345 + } 1.346 + iter->limit=iter->length; 1.347 + } else { 1.348 + *iter=noopIterator; 1.349 + } 1.350 + } 1.351 +} 1.352 + 1.353 +/* UCharIterator wrapper around CharacterIterator --------------------------- */ 1.354 + 1.355 +/* 1.356 + * This is wrapper code around a C++ CharacterIterator to 1.357 + * look like a C UCharIterator. 1.358 + * 1.359 + * The UCharIterator.context field holds a pointer to the CharacterIterator. 1.360 + */ 1.361 + 1.362 +static int32_t U_CALLCONV 1.363 +characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 1.364 + switch(origin) { 1.365 + case UITER_ZERO: 1.366 + return 0; 1.367 + case UITER_START: 1.368 + return ((CharacterIterator *)(iter->context))->startIndex(); 1.369 + case UITER_CURRENT: 1.370 + return ((CharacterIterator *)(iter->context))->getIndex(); 1.371 + case UITER_LIMIT: 1.372 + return ((CharacterIterator *)(iter->context))->endIndex(); 1.373 + case UITER_LENGTH: 1.374 + return ((CharacterIterator *)(iter->context))->getLength(); 1.375 + default: 1.376 + /* not a valid origin */ 1.377 + /* Should never get here! */ 1.378 + return -1; 1.379 + } 1.380 +} 1.381 + 1.382 +static int32_t U_CALLCONV 1.383 +characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 1.384 + switch(origin) { 1.385 + case UITER_ZERO: 1.386 + ((CharacterIterator *)(iter->context))->setIndex(delta); 1.387 + return ((CharacterIterator *)(iter->context))->getIndex(); 1.388 + case UITER_START: 1.389 + case UITER_CURRENT: 1.390 + case UITER_LIMIT: 1.391 + return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin); 1.392 + case UITER_LENGTH: 1.393 + ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta); 1.394 + return ((CharacterIterator *)(iter->context))->getIndex(); 1.395 + default: 1.396 + /* not a valid origin */ 1.397 + /* Should never get here! */ 1.398 + return -1; 1.399 + } 1.400 +} 1.401 + 1.402 +static UBool U_CALLCONV 1.403 +characterIteratorHasNext(UCharIterator *iter) { 1.404 + return ((CharacterIterator *)(iter->context))->hasNext(); 1.405 +} 1.406 + 1.407 +static UBool U_CALLCONV 1.408 +characterIteratorHasPrevious(UCharIterator *iter) { 1.409 + return ((CharacterIterator *)(iter->context))->hasPrevious(); 1.410 +} 1.411 + 1.412 +static UChar32 U_CALLCONV 1.413 +characterIteratorCurrent(UCharIterator *iter) { 1.414 + UChar32 c; 1.415 + 1.416 + c=((CharacterIterator *)(iter->context))->current(); 1.417 + if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) { 1.418 + return c; 1.419 + } else { 1.420 + return U_SENTINEL; 1.421 + } 1.422 +} 1.423 + 1.424 +static UChar32 U_CALLCONV 1.425 +characterIteratorNext(UCharIterator *iter) { 1.426 + if(((CharacterIterator *)(iter->context))->hasNext()) { 1.427 + return ((CharacterIterator *)(iter->context))->nextPostInc(); 1.428 + } else { 1.429 + return U_SENTINEL; 1.430 + } 1.431 +} 1.432 + 1.433 +static UChar32 U_CALLCONV 1.434 +characterIteratorPrevious(UCharIterator *iter) { 1.435 + if(((CharacterIterator *)(iter->context))->hasPrevious()) { 1.436 + return ((CharacterIterator *)(iter->context))->previous(); 1.437 + } else { 1.438 + return U_SENTINEL; 1.439 + } 1.440 +} 1.441 + 1.442 +static uint32_t U_CALLCONV 1.443 +characterIteratorGetState(const UCharIterator *iter) { 1.444 + return ((CharacterIterator *)(iter->context))->getIndex(); 1.445 +} 1.446 + 1.447 +static void U_CALLCONV 1.448 +characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1.449 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.450 + /* do nothing */ 1.451 + } else if(iter==NULL || iter->context==NULL) { 1.452 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.453 + } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) { 1.454 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.455 + } else { 1.456 + ((CharacterIterator *)(iter->context))->setIndex((int32_t)state); 1.457 + } 1.458 +} 1.459 + 1.460 +static const UCharIterator characterIteratorWrapper={ 1.461 + 0, 0, 0, 0, 0, 0, 1.462 + characterIteratorGetIndex, 1.463 + characterIteratorMove, 1.464 + characterIteratorHasNext, 1.465 + characterIteratorHasPrevious, 1.466 + characterIteratorCurrent, 1.467 + characterIteratorNext, 1.468 + characterIteratorPrevious, 1.469 + NULL, 1.470 + characterIteratorGetState, 1.471 + characterIteratorSetState 1.472 +}; 1.473 + 1.474 +U_CAPI void U_EXPORT2 1.475 +uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) { 1.476 + if(iter!=0) { 1.477 + if(charIter!=0) { 1.478 + *iter=characterIteratorWrapper; 1.479 + iter->context=charIter; 1.480 + } else { 1.481 + *iter=noopIterator; 1.482 + } 1.483 + } 1.484 +} 1.485 + 1.486 +/* UCharIterator wrapper around Replaceable --------------------------------- */ 1.487 + 1.488 +/* 1.489 + * This is an implementation of a code unit (UChar) iterator 1.490 + * based on a Replaceable object. 1.491 + * 1.492 + * The UCharIterator.context field holds a pointer to the Replaceable. 1.493 + * UCharIterator.length and UCharIterator.index hold Replaceable.length() 1.494 + * and the iteration index. 1.495 + */ 1.496 + 1.497 +static UChar32 U_CALLCONV 1.498 +replaceableIteratorCurrent(UCharIterator *iter) { 1.499 + if(iter->index<iter->limit) { 1.500 + return ((Replaceable *)(iter->context))->charAt(iter->index); 1.501 + } else { 1.502 + return U_SENTINEL; 1.503 + } 1.504 +} 1.505 + 1.506 +static UChar32 U_CALLCONV 1.507 +replaceableIteratorNext(UCharIterator *iter) { 1.508 + if(iter->index<iter->limit) { 1.509 + return ((Replaceable *)(iter->context))->charAt(iter->index++); 1.510 + } else { 1.511 + return U_SENTINEL; 1.512 + } 1.513 +} 1.514 + 1.515 +static UChar32 U_CALLCONV 1.516 +replaceableIteratorPrevious(UCharIterator *iter) { 1.517 + if(iter->index>iter->start) { 1.518 + return ((Replaceable *)(iter->context))->charAt(--iter->index); 1.519 + } else { 1.520 + return U_SENTINEL; 1.521 + } 1.522 +} 1.523 + 1.524 +static const UCharIterator replaceableIterator={ 1.525 + 0, 0, 0, 0, 0, 0, 1.526 + stringIteratorGetIndex, 1.527 + stringIteratorMove, 1.528 + stringIteratorHasNext, 1.529 + stringIteratorHasPrevious, 1.530 + replaceableIteratorCurrent, 1.531 + replaceableIteratorNext, 1.532 + replaceableIteratorPrevious, 1.533 + NULL, 1.534 + stringIteratorGetState, 1.535 + stringIteratorSetState 1.536 +}; 1.537 + 1.538 +U_CAPI void U_EXPORT2 1.539 +uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) { 1.540 + if(iter!=0) { 1.541 + if(rep!=0) { 1.542 + *iter=replaceableIterator; 1.543 + iter->context=rep; 1.544 + iter->limit=iter->length=rep->length(); 1.545 + } else { 1.546 + *iter=noopIterator; 1.547 + } 1.548 + } 1.549 +} 1.550 + 1.551 +/* UCharIterator implementation for UTF-8 strings --------------------------- */ 1.552 + 1.553 +/* 1.554 + * Possible, probably necessary only for an implementation for arbitrary 1.555 + * converters: 1.556 + * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text. 1.557 + * This would require to turn reservedFn into a close function and 1.558 + * to introduce a uiter_close(iter). 1.559 + */ 1.560 + 1.561 +#define UITER_CNV_CAPACITY 16 1.562 + 1.563 +/* 1.564 + * Minimal implementation: 1.565 + * Maintain a single-UChar buffer for an additional surrogate. 1.566 + * The caller must not modify start and limit because they are used internally. 1.567 + * 1.568 + * Use UCharIterator fields as follows: 1.569 + * context pointer to UTF-8 string 1.570 + * length UTF-16 length of the string; -1 until lazy evaluation 1.571 + * start current UTF-8 index 1.572 + * index current UTF-16 index; may be -1="unknown" after setState() 1.573 + * limit UTF-8 length of the string 1.574 + * reservedField supplementary code point 1.575 + * 1.576 + * Since UCharIterator delivers 16-bit code units, the iteration can be 1.577 + * currently in the middle of the byte sequence for a supplementary code point. 1.578 + * In this case, reservedField will contain that code point and start will 1.579 + * point to after the corresponding byte sequence. The UTF-16 index will be 1.580 + * one less than what it would otherwise be corresponding to the UTF-8 index. 1.581 + * Otherwise, reservedField will be 0. 1.582 + */ 1.583 + 1.584 +/* 1.585 + * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings: 1.586 + * Add implementations that do not call strlen() for iteration but check for NUL. 1.587 + */ 1.588 + 1.589 +static int32_t U_CALLCONV 1.590 +utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) { 1.591 + switch(origin) { 1.592 + case UITER_ZERO: 1.593 + case UITER_START: 1.594 + return 0; 1.595 + case UITER_CURRENT: 1.596 + if(iter->index<0) { 1.597 + /* the current UTF-16 index is unknown after setState(), count from the beginning */ 1.598 + const uint8_t *s; 1.599 + UChar32 c; 1.600 + int32_t i, limit, index; 1.601 + 1.602 + s=(const uint8_t *)iter->context; 1.603 + i=index=0; 1.604 + limit=iter->start; /* count up to the UTF-8 index */ 1.605 + while(i<limit) { 1.606 + U8_NEXT_OR_FFFD(s, i, limit, c); 1.607 + index+=U16_LENGTH(c); 1.608 + } 1.609 + 1.610 + iter->start=i; /* just in case setState() did not get us to a code point boundary */ 1.611 + if(i==iter->limit) { 1.612 + iter->length=index; /* in case it was <0 or wrong */ 1.613 + } 1.614 + if(iter->reservedField!=0) { 1.615 + --index; /* we are in the middle of a supplementary code point */ 1.616 + } 1.617 + iter->index=index; 1.618 + } 1.619 + return iter->index; 1.620 + case UITER_LIMIT: 1.621 + case UITER_LENGTH: 1.622 + if(iter->length<0) { 1.623 + const uint8_t *s; 1.624 + UChar32 c; 1.625 + int32_t i, limit, length; 1.626 + 1.627 + s=(const uint8_t *)iter->context; 1.628 + if(iter->index<0) { 1.629 + /* 1.630 + * the current UTF-16 index is unknown after setState(), 1.631 + * we must first count from the beginning to here 1.632 + */ 1.633 + i=length=0; 1.634 + limit=iter->start; 1.635 + 1.636 + /* count from the beginning to the current index */ 1.637 + while(i<limit) { 1.638 + U8_NEXT_OR_FFFD(s, i, limit, c); 1.639 + length+=U16_LENGTH(c); 1.640 + } 1.641 + 1.642 + /* assume i==limit==iter->start, set the UTF-16 index */ 1.643 + iter->start=i; /* just in case setState() did not get us to a code point boundary */ 1.644 + iter->index= iter->reservedField!=0 ? length-1 : length; 1.645 + } else { 1.646 + i=iter->start; 1.647 + length=iter->index; 1.648 + if(iter->reservedField!=0) { 1.649 + ++length; 1.650 + } 1.651 + } 1.652 + 1.653 + /* count from the current index to the end */ 1.654 + limit=iter->limit; 1.655 + while(i<limit) { 1.656 + U8_NEXT_OR_FFFD(s, i, limit, c); 1.657 + length+=U16_LENGTH(c); 1.658 + } 1.659 + iter->length=length; 1.660 + } 1.661 + return iter->length; 1.662 + default: 1.663 + /* not a valid origin */ 1.664 + /* Should never get here! */ 1.665 + return -1; 1.666 + } 1.667 +} 1.668 + 1.669 +static int32_t U_CALLCONV 1.670 +utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) { 1.671 + const uint8_t *s; 1.672 + UChar32 c; 1.673 + int32_t pos; /* requested UTF-16 index */ 1.674 + int32_t i; /* UTF-8 index */ 1.675 + UBool havePos; 1.676 + 1.677 + /* calculate the requested UTF-16 index */ 1.678 + switch(origin) { 1.679 + case UITER_ZERO: 1.680 + case UITER_START: 1.681 + pos=delta; 1.682 + havePos=TRUE; 1.683 + /* iter->index<0 (unknown) is possible */ 1.684 + break; 1.685 + case UITER_CURRENT: 1.686 + if(iter->index>=0) { 1.687 + pos=iter->index+delta; 1.688 + havePos=TRUE; 1.689 + } else { 1.690 + /* the current UTF-16 index is unknown after setState(), use only delta */ 1.691 + pos=0; 1.692 + havePos=FALSE; 1.693 + } 1.694 + break; 1.695 + case UITER_LIMIT: 1.696 + case UITER_LENGTH: 1.697 + if(iter->length>=0) { 1.698 + pos=iter->length+delta; 1.699 + havePos=TRUE; 1.700 + } else { 1.701 + /* pin to the end, avoid counting the length */ 1.702 + iter->index=-1; 1.703 + iter->start=iter->limit; 1.704 + iter->reservedField=0; 1.705 + if(delta>=0) { 1.706 + return UITER_UNKNOWN_INDEX; 1.707 + } else { 1.708 + /* the current UTF-16 index is unknown, use only delta */ 1.709 + pos=0; 1.710 + havePos=FALSE; 1.711 + } 1.712 + } 1.713 + break; 1.714 + default: 1.715 + return -1; /* Error */ 1.716 + } 1.717 + 1.718 + if(havePos) { 1.719 + /* shortcuts: pinning to the edges of the string */ 1.720 + if(pos<=0) { 1.721 + iter->index=iter->start=iter->reservedField=0; 1.722 + return 0; 1.723 + } else if(iter->length>=0 && pos>=iter->length) { 1.724 + iter->index=iter->length; 1.725 + iter->start=iter->limit; 1.726 + iter->reservedField=0; 1.727 + return iter->index; 1.728 + } 1.729 + 1.730 + /* minimize the number of U8_NEXT/PREV operations */ 1.731 + if(iter->index<0 || pos<iter->index/2) { 1.732 + /* go forward from the start instead of backward from the current index */ 1.733 + iter->index=iter->start=iter->reservedField=0; 1.734 + } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) { 1.735 + /* 1.736 + * if we have the UTF-16 index and length and the new position is 1.737 + * closer to the end than the current index, 1.738 + * then go backward from the end instead of forward from the current index 1.739 + */ 1.740 + iter->index=iter->length; 1.741 + iter->start=iter->limit; 1.742 + iter->reservedField=0; 1.743 + } 1.744 + 1.745 + delta=pos-iter->index; 1.746 + if(delta==0) { 1.747 + return iter->index; /* nothing to do */ 1.748 + } 1.749 + } else { 1.750 + /* move relative to unknown UTF-16 index */ 1.751 + if(delta==0) { 1.752 + return UITER_UNKNOWN_INDEX; /* nothing to do */ 1.753 + } else if(-delta>=iter->start) { 1.754 + /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */ 1.755 + iter->index=iter->start=iter->reservedField=0; 1.756 + return 0; 1.757 + } else if(delta>=(iter->limit-iter->start)) { 1.758 + /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */ 1.759 + iter->index=iter->length; /* may or may not be <0 (unknown) */ 1.760 + iter->start=iter->limit; 1.761 + iter->reservedField=0; 1.762 + return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX; 1.763 + } 1.764 + } 1.765 + 1.766 + /* delta!=0 */ 1.767 + 1.768 + /* move towards the requested position, pin to the edges of the string */ 1.769 + s=(const uint8_t *)iter->context; 1.770 + pos=iter->index; /* could be <0 (unknown) */ 1.771 + i=iter->start; 1.772 + if(delta>0) { 1.773 + /* go forward */ 1.774 + int32_t limit=iter->limit; 1.775 + if(iter->reservedField!=0) { 1.776 + iter->reservedField=0; 1.777 + ++pos; 1.778 + --delta; 1.779 + } 1.780 + while(delta>0 && i<limit) { 1.781 + U8_NEXT_OR_FFFD(s, i, limit, c); 1.782 + if(c<=0xffff) { 1.783 + ++pos; 1.784 + --delta; 1.785 + } else if(delta>=2) { 1.786 + pos+=2; 1.787 + delta-=2; 1.788 + } else /* delta==1 */ { 1.789 + /* stop in the middle of a supplementary code point */ 1.790 + iter->reservedField=c; 1.791 + ++pos; 1.792 + break; /* delta=0; */ 1.793 + } 1.794 + } 1.795 + if(i==limit) { 1.796 + if(iter->length<0 && iter->index>=0) { 1.797 + iter->length= iter->reservedField==0 ? pos : pos+1; 1.798 + } else if(iter->index<0 && iter->length>=0) { 1.799 + iter->index= iter->reservedField==0 ? iter->length : iter->length-1; 1.800 + } 1.801 + } 1.802 + } else /* delta<0 */ { 1.803 + /* go backward */ 1.804 + if(iter->reservedField!=0) { 1.805 + iter->reservedField=0; 1.806 + i-=4; /* we stayed behind the supplementary code point; go before it now */ 1.807 + --pos; 1.808 + ++delta; 1.809 + } 1.810 + while(delta<0 && i>0) { 1.811 + U8_PREV_OR_FFFD(s, 0, i, c); 1.812 + if(c<=0xffff) { 1.813 + --pos; 1.814 + ++delta; 1.815 + } else if(delta<=-2) { 1.816 + pos-=2; 1.817 + delta+=2; 1.818 + } else /* delta==-1 */ { 1.819 + /* stop in the middle of a supplementary code point */ 1.820 + i+=4; /* back to behind this supplementary code point for consistent state */ 1.821 + iter->reservedField=c; 1.822 + --pos; 1.823 + break; /* delta=0; */ 1.824 + } 1.825 + } 1.826 + } 1.827 + 1.828 + iter->start=i; 1.829 + if(iter->index>=0) { 1.830 + return iter->index=pos; 1.831 + } else { 1.832 + /* we started with index<0 (unknown) so pos is bogus */ 1.833 + if(i<=1) { 1.834 + return iter->index=i; /* reached the beginning */ 1.835 + } else { 1.836 + /* we still don't know the UTF-16 index */ 1.837 + return UITER_UNKNOWN_INDEX; 1.838 + } 1.839 + } 1.840 +} 1.841 + 1.842 +static UBool U_CALLCONV 1.843 +utf8IteratorHasNext(UCharIterator *iter) { 1.844 + return iter->start<iter->limit || iter->reservedField!=0; 1.845 +} 1.846 + 1.847 +static UBool U_CALLCONV 1.848 +utf8IteratorHasPrevious(UCharIterator *iter) { 1.849 + return iter->start>0; 1.850 +} 1.851 + 1.852 +static UChar32 U_CALLCONV 1.853 +utf8IteratorCurrent(UCharIterator *iter) { 1.854 + if(iter->reservedField!=0) { 1.855 + return U16_TRAIL(iter->reservedField); 1.856 + } else if(iter->start<iter->limit) { 1.857 + const uint8_t *s=(const uint8_t *)iter->context; 1.858 + UChar32 c; 1.859 + int32_t i=iter->start; 1.860 + 1.861 + U8_NEXT_OR_FFFD(s, i, iter->limit, c); 1.862 + if(c<=0xffff) { 1.863 + return c; 1.864 + } else { 1.865 + return U16_LEAD(c); 1.866 + } 1.867 + } else { 1.868 + return U_SENTINEL; 1.869 + } 1.870 +} 1.871 + 1.872 +static UChar32 U_CALLCONV 1.873 +utf8IteratorNext(UCharIterator *iter) { 1.874 + int32_t index; 1.875 + 1.876 + if(iter->reservedField!=0) { 1.877 + UChar trail=U16_TRAIL(iter->reservedField); 1.878 + iter->reservedField=0; 1.879 + if((index=iter->index)>=0) { 1.880 + iter->index=index+1; 1.881 + } 1.882 + return trail; 1.883 + } else if(iter->start<iter->limit) { 1.884 + const uint8_t *s=(const uint8_t *)iter->context; 1.885 + UChar32 c; 1.886 + 1.887 + U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c); 1.888 + if((index=iter->index)>=0) { 1.889 + iter->index=++index; 1.890 + if(iter->length<0 && iter->start==iter->limit) { 1.891 + iter->length= c<=0xffff ? index : index+1; 1.892 + } 1.893 + } else if(iter->start==iter->limit && iter->length>=0) { 1.894 + iter->index= c<=0xffff ? iter->length : iter->length-1; 1.895 + } 1.896 + if(c<=0xffff) { 1.897 + return c; 1.898 + } else { 1.899 + iter->reservedField=c; 1.900 + return U16_LEAD(c); 1.901 + } 1.902 + } else { 1.903 + return U_SENTINEL; 1.904 + } 1.905 +} 1.906 + 1.907 +static UChar32 U_CALLCONV 1.908 +utf8IteratorPrevious(UCharIterator *iter) { 1.909 + int32_t index; 1.910 + 1.911 + if(iter->reservedField!=0) { 1.912 + UChar lead=U16_LEAD(iter->reservedField); 1.913 + iter->reservedField=0; 1.914 + iter->start-=4; /* we stayed behind the supplementary code point; go before it now */ 1.915 + if((index=iter->index)>0) { 1.916 + iter->index=index-1; 1.917 + } 1.918 + return lead; 1.919 + } else if(iter->start>0) { 1.920 + const uint8_t *s=(const uint8_t *)iter->context; 1.921 + UChar32 c; 1.922 + 1.923 + U8_PREV_OR_FFFD(s, 0, iter->start, c); 1.924 + if((index=iter->index)>0) { 1.925 + iter->index=index-1; 1.926 + } else if(iter->start<=1) { 1.927 + iter->index= c<=0xffff ? iter->start : iter->start+1; 1.928 + } 1.929 + if(c<=0xffff) { 1.930 + return c; 1.931 + } else { 1.932 + iter->start+=4; /* back to behind this supplementary code point for consistent state */ 1.933 + iter->reservedField=c; 1.934 + return U16_TRAIL(c); 1.935 + } 1.936 + } else { 1.937 + return U_SENTINEL; 1.938 + } 1.939 +} 1.940 + 1.941 +static uint32_t U_CALLCONV 1.942 +utf8IteratorGetState(const UCharIterator *iter) { 1.943 + uint32_t state=(uint32_t)(iter->start<<1); 1.944 + if(iter->reservedField!=0) { 1.945 + state|=1; 1.946 + } 1.947 + return state; 1.948 +} 1.949 + 1.950 +static void U_CALLCONV 1.951 +utf8IteratorSetState(UCharIterator *iter, 1.952 + uint32_t state, 1.953 + UErrorCode *pErrorCode) 1.954 +{ 1.955 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.956 + /* do nothing */ 1.957 + } else if(iter==NULL) { 1.958 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.959 + } else if(state==utf8IteratorGetState(iter)) { 1.960 + /* setting to the current state: no-op */ 1.961 + } else { 1.962 + int32_t index=(int32_t)(state>>1); /* UTF-8 index */ 1.963 + state&=1; /* 1 if in surrogate pair, must be index>=4 */ 1.964 + 1.965 + if((state==0 ? index<0 : index<4) || iter->limit<index) { 1.966 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.967 + } else { 1.968 + iter->start=index; /* restore UTF-8 byte index */ 1.969 + if(index<=1) { 1.970 + iter->index=index; 1.971 + } else { 1.972 + iter->index=-1; /* unknown UTF-16 index */ 1.973 + } 1.974 + if(state==0) { 1.975 + iter->reservedField=0; 1.976 + } else { 1.977 + /* verified index>=4 above */ 1.978 + UChar32 c; 1.979 + U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c); 1.980 + if(c<=0xffff) { 1.981 + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; 1.982 + } else { 1.983 + iter->reservedField=c; 1.984 + } 1.985 + } 1.986 + } 1.987 + } 1.988 +} 1.989 + 1.990 +static const UCharIterator utf8Iterator={ 1.991 + 0, 0, 0, 0, 0, 0, 1.992 + utf8IteratorGetIndex, 1.993 + utf8IteratorMove, 1.994 + utf8IteratorHasNext, 1.995 + utf8IteratorHasPrevious, 1.996 + utf8IteratorCurrent, 1.997 + utf8IteratorNext, 1.998 + utf8IteratorPrevious, 1.999 + NULL, 1.1000 + utf8IteratorGetState, 1.1001 + utf8IteratorSetState 1.1002 +}; 1.1003 + 1.1004 +U_CAPI void U_EXPORT2 1.1005 +uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) { 1.1006 + if(iter!=0) { 1.1007 + if(s!=0 && length>=-1) { 1.1008 + *iter=utf8Iterator; 1.1009 + iter->context=s; 1.1010 + if(length>=0) { 1.1011 + iter->limit=length; 1.1012 + } else { 1.1013 + iter->limit=(int32_t)uprv_strlen(s); 1.1014 + } 1.1015 + iter->length= iter->limit<=1 ? iter->limit : -1; 1.1016 + } else { 1.1017 + *iter=noopIterator; 1.1018 + } 1.1019 + } 1.1020 +} 1.1021 + 1.1022 +/* Helper functions --------------------------------------------------------- */ 1.1023 + 1.1024 +U_CAPI UChar32 U_EXPORT2 1.1025 +uiter_current32(UCharIterator *iter) { 1.1026 + UChar32 c, c2; 1.1027 + 1.1028 + c=iter->current(iter); 1.1029 + if(U16_IS_SURROGATE(c)) { 1.1030 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1031 + /* 1.1032 + * go to the next code unit 1.1033 + * we know that we are not at the limit because c!=U_SENTINEL 1.1034 + */ 1.1035 + iter->move(iter, 1, UITER_CURRENT); 1.1036 + if(U16_IS_TRAIL(c2=iter->current(iter))) { 1.1037 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.1038 + } 1.1039 + 1.1040 + /* undo index movement */ 1.1041 + iter->move(iter, -1, UITER_CURRENT); 1.1042 + } else { 1.1043 + if(U16_IS_LEAD(c2=iter->previous(iter))) { 1.1044 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.1045 + } 1.1046 + if(c2>=0) { 1.1047 + /* undo index movement */ 1.1048 + iter->move(iter, 1, UITER_CURRENT); 1.1049 + } 1.1050 + } 1.1051 + } 1.1052 + return c; 1.1053 +} 1.1054 + 1.1055 +U_CAPI UChar32 U_EXPORT2 1.1056 +uiter_next32(UCharIterator *iter) { 1.1057 + UChar32 c, c2; 1.1058 + 1.1059 + c=iter->next(iter); 1.1060 + if(U16_IS_LEAD(c)) { 1.1061 + if(U16_IS_TRAIL(c2=iter->next(iter))) { 1.1062 + c=U16_GET_SUPPLEMENTARY(c, c2); 1.1063 + } else if(c2>=0) { 1.1064 + /* unmatched first surrogate, undo index movement */ 1.1065 + iter->move(iter, -1, UITER_CURRENT); 1.1066 + } 1.1067 + } 1.1068 + return c; 1.1069 +} 1.1070 + 1.1071 +U_CAPI UChar32 U_EXPORT2 1.1072 +uiter_previous32(UCharIterator *iter) { 1.1073 + UChar32 c, c2; 1.1074 + 1.1075 + c=iter->previous(iter); 1.1076 + if(U16_IS_TRAIL(c)) { 1.1077 + if(U16_IS_LEAD(c2=iter->previous(iter))) { 1.1078 + c=U16_GET_SUPPLEMENTARY(c2, c); 1.1079 + } else if(c2>=0) { 1.1080 + /* unmatched second surrogate, undo index movement */ 1.1081 + iter->move(iter, 1, UITER_CURRENT); 1.1082 + } 1.1083 + } 1.1084 + return c; 1.1085 +} 1.1086 + 1.1087 +U_CAPI uint32_t U_EXPORT2 1.1088 +uiter_getState(const UCharIterator *iter) { 1.1089 + if(iter==NULL || iter->getState==NULL) { 1.1090 + return UITER_NO_STATE; 1.1091 + } else { 1.1092 + return iter->getState(iter); 1.1093 + } 1.1094 +} 1.1095 + 1.1096 +U_CAPI void U_EXPORT2 1.1097 +uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) { 1.1098 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 1.1099 + /* do nothing */ 1.1100 + } else if(iter==NULL) { 1.1101 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1102 + } else if(iter->setState==NULL) { 1.1103 + *pErrorCode=U_UNSUPPORTED_ERROR; 1.1104 + } else { 1.1105 + iter->setState(iter, state, pErrorCode); 1.1106 + } 1.1107 +} 1.1108 + 1.1109 +U_CDECL_END