intl/icu/source/common/uiter.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 2002-2012, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  uiter.cpp
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 2002jan18
    14 *   created by: Markus W. Scherer
    15 */
    17 #include "unicode/utypes.h"
    18 #include "unicode/ustring.h"
    19 #include "unicode/chariter.h"
    20 #include "unicode/rep.h"
    21 #include "unicode/uiter.h"
    22 #include "unicode/utf.h"
    23 #include "unicode/utf8.h"
    24 #include "unicode/utf16.h"
    25 #include "cstring.h"
    27 U_NAMESPACE_USE
    /* Nonzero if the integer n has its lowest bit clear. */
    #define IS_EVEN(n) (((n)&1)==0)
    /*
     * Nonzero if the address stored in pointer p is even.
     * p is parenthesized so that an expression argument (for example q+1)
     * is evaluated as a pointer before it is converted to size_t.
     */
    #define IS_POINTER_EVEN(p) IS_EVEN((size_t)(p))
    32 U_CDECL_BEGIN
    34 /* No-Op UCharIterator implementation for illegal input --------------------- */
    36 static int32_t U_CALLCONV
    37 noopGetIndex(UCharIterator * /*iter*/, UCharIteratorOrigin /*origin*/) {
    38     return 0;
    39 }
    41 static int32_t U_CALLCONV
    42 noopMove(UCharIterator * /*iter*/, int32_t /*delta*/, UCharIteratorOrigin /*origin*/) {
    43     return 0;
    44 }
    46 static UBool U_CALLCONV
    47 noopHasNext(UCharIterator * /*iter*/) {
    48     return FALSE;
    49 }
    51 static UChar32 U_CALLCONV
    52 noopCurrent(UCharIterator * /*iter*/) {
    53     return U_SENTINEL;
    54 }
    56 static uint32_t U_CALLCONV
    57 noopGetState(const UCharIterator * /*iter*/) {
    58     return UITER_NO_STATE;
    59 }
    61 static void U_CALLCONV
    62 noopSetState(UCharIterator * /*iter*/, uint32_t /*state*/, UErrorCode *pErrorCode) {
    63     *pErrorCode=U_UNSUPPORTED_ERROR;
    64 }
    66 static const UCharIterator noopIterator={
    67     0, 0, 0, 0, 0, 0,
    68     noopGetIndex,
    69     noopMove,
    70     noopHasNext,
    71     noopHasNext,
    72     noopCurrent,
    73     noopCurrent,
    74     noopCurrent,
    75     NULL,
    76     noopGetState,
    77     noopSetState
    78 };
    80 /* UCharIterator implementation for simple strings -------------------------- */
    82 /*
    83  * This is an implementation of a code unit (UChar) iterator
    84  * for UChar * strings.
    85  *
    86  * The UCharIterator.context field holds a pointer to the string.
    87  */
    89 static int32_t U_CALLCONV
    90 stringIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
    91     switch(origin) {
    92     case UITER_ZERO:
    93         return 0;
    94     case UITER_START:
    95         return iter->start;
    96     case UITER_CURRENT:
    97         return iter->index;
    98     case UITER_LIMIT:
    99         return iter->limit;
   100     case UITER_LENGTH:
   101         return iter->length;
   102     default:
   103         /* not a valid origin */
   104         /* Should never get here! */
   105         return -1;
   106     }
   107 }
   109 static int32_t U_CALLCONV
   110 stringIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
   111     int32_t pos;
   113     switch(origin) {
   114     case UITER_ZERO:
   115         pos=delta;
   116         break;
   117     case UITER_START:
   118         pos=iter->start+delta;
   119         break;
   120     case UITER_CURRENT:
   121         pos=iter->index+delta;
   122         break;
   123     case UITER_LIMIT:
   124         pos=iter->limit+delta;
   125         break;
   126     case UITER_LENGTH:
   127         pos=iter->length+delta;
   128         break;
   129     default:
   130         return -1;  /* Error */
   131     }
   133     if(pos<iter->start) {
   134         pos=iter->start;
   135     } else if(pos>iter->limit) {
   136         pos=iter->limit;
   137     }
   139     return iter->index=pos;
   140 }
   142 static UBool U_CALLCONV
   143 stringIteratorHasNext(UCharIterator *iter) {
   144     return iter->index<iter->limit;
   145 }
   147 static UBool U_CALLCONV
   148 stringIteratorHasPrevious(UCharIterator *iter) {
   149     return iter->index>iter->start;
   150 }
   152 static UChar32 U_CALLCONV
   153 stringIteratorCurrent(UCharIterator *iter) {
   154     if(iter->index<iter->limit) {
   155         return ((const UChar *)(iter->context))[iter->index];
   156     } else {
   157         return U_SENTINEL;
   158     }
   159 }
   161 static UChar32 U_CALLCONV
   162 stringIteratorNext(UCharIterator *iter) {
   163     if(iter->index<iter->limit) {
   164         return ((const UChar *)(iter->context))[iter->index++];
   165     } else {
   166         return U_SENTINEL;
   167     }
   168 }
   170 static UChar32 U_CALLCONV
   171 stringIteratorPrevious(UCharIterator *iter) {
   172     if(iter->index>iter->start) {
   173         return ((const UChar *)(iter->context))[--iter->index];
   174     } else {
   175         return U_SENTINEL;
   176     }
   177 }
   179 static uint32_t U_CALLCONV
   180 stringIteratorGetState(const UCharIterator *iter) {
   181     return (uint32_t)iter->index;
   182 }
   184 static void U_CALLCONV
   185 stringIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
   186     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   187         /* do nothing */
   188     } else if(iter==NULL) {
   189         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   190     } else if((int32_t)state<iter->start || iter->limit<(int32_t)state) {
   191         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   192     } else {
   193         iter->index=(int32_t)state;
   194     }
   195 }
   197 static const UCharIterator stringIterator={
   198     0, 0, 0, 0, 0, 0,
   199     stringIteratorGetIndex,
   200     stringIteratorMove,
   201     stringIteratorHasNext,
   202     stringIteratorHasPrevious,
   203     stringIteratorCurrent,
   204     stringIteratorNext,
   205     stringIteratorPrevious,
   206     NULL,
   207     stringIteratorGetState,
   208     stringIteratorSetState
   209 };
   211 U_CAPI void U_EXPORT2
   212 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length) {
   213     if(iter!=0) {
   214         if(s!=0 && length>=-1) {
   215             *iter=stringIterator;
   216             iter->context=s;
   217             if(length>=0) {
   218                 iter->length=length;
   219             } else {
   220                 iter->length=u_strlen(s);
   221             }
   222             iter->limit=iter->length;
   223         } else {
   224             *iter=noopIterator;
   225         }
   226     }
   227 }
   229 /* UCharIterator implementation for UTF-16BE strings ------------------------ */
   231 /*
   232  * This is an implementation of a code unit (UChar) iterator
   233  * for UTF-16BE strings, i.e., strings in byte-vectors where
   234  * each UChar is stored as a big-endian pair of bytes.
   235  *
   236  * The UCharIterator.context field holds a pointer to the string.
   237  * Everything works just like with a normal UChar iterator (uiter_setString),
   238  * except that UChars are assembled from byte pairs.
   239  */
   241 /* internal helper function */
   242 static inline UChar32
   243 utf16BEIteratorGet(UCharIterator *iter, int32_t index) {
   244     const uint8_t *p=(const uint8_t *)iter->context;
   245     return ((UChar)p[2*index]<<8)|(UChar)p[2*index+1];
   246 }
   248 static UChar32 U_CALLCONV
   249 utf16BEIteratorCurrent(UCharIterator *iter) {
   250     int32_t index;
   252     if((index=iter->index)<iter->limit) {
   253         return utf16BEIteratorGet(iter, index);
   254     } else {
   255         return U_SENTINEL;
   256     }
   257 }
   259 static UChar32 U_CALLCONV
   260 utf16BEIteratorNext(UCharIterator *iter) {
   261     int32_t index;
   263     if((index=iter->index)<iter->limit) {
   264         iter->index=index+1;
   265         return utf16BEIteratorGet(iter, index);
   266     } else {
   267         return U_SENTINEL;
   268     }
   269 }
   271 static UChar32 U_CALLCONV
   272 utf16BEIteratorPrevious(UCharIterator *iter) {
   273     int32_t index;
   275     if((index=iter->index)>iter->start) {
   276         iter->index=--index;
   277         return utf16BEIteratorGet(iter, index);
   278     } else {
   279         return U_SENTINEL;
   280     }
   281 }
   283 static const UCharIterator utf16BEIterator={
   284     0, 0, 0, 0, 0, 0,
   285     stringIteratorGetIndex,
   286     stringIteratorMove,
   287     stringIteratorHasNext,
   288     stringIteratorHasPrevious,
   289     utf16BEIteratorCurrent,
   290     utf16BEIteratorNext,
   291     utf16BEIteratorPrevious,
   292     NULL,
   293     stringIteratorGetState,
   294     stringIteratorSetState
   295 };
   297 /*
   298  * Count the number of UChars in a UTF-16BE string before a terminating UChar NUL,
   299  * i.e., before a pair of 0 bytes where the first 0 byte is at an even
   300  * offset from s.
   301  */
   302 static int32_t
   303 utf16BE_strlen(const char *s) {
   304     if(IS_POINTER_EVEN(s)) {
   305         /*
   306          * even-aligned, call u_strlen(s)
   307          * we are probably on a little-endian machine, but searching for UChar NUL
   308          * does not care about endianness
   309          */
   310         return u_strlen((const UChar *)s);
   311     } else {
   312         /* odd-aligned, search for pair of 0 bytes */
   313         const char *p=s;
   315         while(!(*p==0 && p[1]==0)) {
   316             p+=2;
   317         }
   318         return (int32_t)((p-s)/2);
   319     }
   320 }
   322 U_CAPI void U_EXPORT2
   323 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length) {
   324     if(iter!=NULL) {
   325         /* allow only even-length strings (the input length counts bytes) */
   326         if(s!=NULL && (length==-1 || (length>=0 && IS_EVEN(length)))) {
   327             /* length/=2, except that >>=1 also works for -1 (-1/2==0, -1>>1==-1) */
   328             length>>=1;
   330             if(U_IS_BIG_ENDIAN && IS_POINTER_EVEN(s)) {
   331                 /* big-endian machine and 2-aligned UTF-16BE string: use normal UChar iterator */
   332                 uiter_setString(iter, (const UChar *)s, length);
   333                 return;
   334             }
   336             *iter=utf16BEIterator;
   337             iter->context=s;
   338             if(length>=0) {
   339                 iter->length=length;
   340             } else {
   341                 iter->length=utf16BE_strlen(s);
   342             }
   343             iter->limit=iter->length;
   344         } else {
   345             *iter=noopIterator;
   346         }
   347     }
   348 }
   350 /* UCharIterator wrapper around CharacterIterator --------------------------- */
   352 /*
   353  * This is wrapper code around a C++ CharacterIterator to
   354  * look like a C UCharIterator.
   355  *
   356  * The UCharIterator.context field holds a pointer to the CharacterIterator.
   357  */
   359 static int32_t U_CALLCONV
   360 characterIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
   361     switch(origin) {
   362     case UITER_ZERO:
   363         return 0;
   364     case UITER_START:
   365         return ((CharacterIterator *)(iter->context))->startIndex();
   366     case UITER_CURRENT:
   367         return ((CharacterIterator *)(iter->context))->getIndex();
   368     case UITER_LIMIT:
   369         return ((CharacterIterator *)(iter->context))->endIndex();
   370     case UITER_LENGTH:
   371         return ((CharacterIterator *)(iter->context))->getLength();
   372     default:
   373         /* not a valid origin */
   374         /* Should never get here! */
   375         return -1;
   376     }
   377 }
   379 static int32_t U_CALLCONV
   380 characterIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
   381     switch(origin) {
   382     case UITER_ZERO:
   383         ((CharacterIterator *)(iter->context))->setIndex(delta);
   384         return ((CharacterIterator *)(iter->context))->getIndex();
   385     case UITER_START:
   386     case UITER_CURRENT:
   387     case UITER_LIMIT:
   388         return ((CharacterIterator *)(iter->context))->move(delta, (CharacterIterator::EOrigin)origin);
   389     case UITER_LENGTH:
   390         ((CharacterIterator *)(iter->context))->setIndex(((CharacterIterator *)(iter->context))->getLength()+delta);
   391         return ((CharacterIterator *)(iter->context))->getIndex();
   392     default:
   393         /* not a valid origin */
   394         /* Should never get here! */
   395         return -1;
   396     }
   397 }
   399 static UBool U_CALLCONV
   400 characterIteratorHasNext(UCharIterator *iter) {
   401     return ((CharacterIterator *)(iter->context))->hasNext();
   402 }
   404 static UBool U_CALLCONV
   405 characterIteratorHasPrevious(UCharIterator *iter) {
   406     return ((CharacterIterator *)(iter->context))->hasPrevious();
   407 }
   409 static UChar32 U_CALLCONV
   410 characterIteratorCurrent(UCharIterator *iter) {
   411     UChar32 c;
   413     c=((CharacterIterator *)(iter->context))->current();
   414     if(c!=0xffff || ((CharacterIterator *)(iter->context))->hasNext()) {
   415         return c;
   416     } else {
   417         return U_SENTINEL;
   418     }
   419 }
   421 static UChar32 U_CALLCONV
   422 characterIteratorNext(UCharIterator *iter) {
   423     if(((CharacterIterator *)(iter->context))->hasNext()) {
   424         return ((CharacterIterator *)(iter->context))->nextPostInc();
   425     } else {
   426         return U_SENTINEL;
   427     }
   428 }
   430 static UChar32 U_CALLCONV
   431 characterIteratorPrevious(UCharIterator *iter) {
   432     if(((CharacterIterator *)(iter->context))->hasPrevious()) {
   433         return ((CharacterIterator *)(iter->context))->previous();
   434     } else {
   435         return U_SENTINEL;
   436     }
   437 }
   439 static uint32_t U_CALLCONV
   440 characterIteratorGetState(const UCharIterator *iter) {
   441     return ((CharacterIterator *)(iter->context))->getIndex();
   442 }
   444 static void U_CALLCONV
   445 characterIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
   446     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   447         /* do nothing */
   448     } else if(iter==NULL || iter->context==NULL) {
   449         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   450     } else if((int32_t)state<((CharacterIterator *)(iter->context))->startIndex() || ((CharacterIterator *)(iter->context))->endIndex()<(int32_t)state) {
   451         *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   452     } else {
   453         ((CharacterIterator *)(iter->context))->setIndex((int32_t)state);
   454     }
   455 }
   457 static const UCharIterator characterIteratorWrapper={
   458     0, 0, 0, 0, 0, 0,
   459     characterIteratorGetIndex,
   460     characterIteratorMove,
   461     characterIteratorHasNext,
   462     characterIteratorHasPrevious,
   463     characterIteratorCurrent,
   464     characterIteratorNext,
   465     characterIteratorPrevious,
   466     NULL,
   467     characterIteratorGetState,
   468     characterIteratorSetState
   469 };
   471 U_CAPI void U_EXPORT2
   472 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter) {
   473     if(iter!=0) {
   474         if(charIter!=0) {
   475             *iter=characterIteratorWrapper;
   476             iter->context=charIter;
   477         } else {
   478             *iter=noopIterator;
   479         }
   480     }
   481 }
   483 /* UCharIterator wrapper around Replaceable --------------------------------- */
   485 /*
   486  * This is an implementation of a code unit (UChar) iterator
   487  * based on a Replaceable object.
   488  *
   489  * The UCharIterator.context field holds a pointer to the Replaceable.
   490  * UCharIterator.length and UCharIterator.index hold Replaceable.length()
   491  * and the iteration index.
   492  */
   494 static UChar32 U_CALLCONV
   495 replaceableIteratorCurrent(UCharIterator *iter) {
   496     if(iter->index<iter->limit) {
   497         return ((Replaceable *)(iter->context))->charAt(iter->index);
   498     } else {
   499         return U_SENTINEL;
   500     }
   501 }
   503 static UChar32 U_CALLCONV
   504 replaceableIteratorNext(UCharIterator *iter) {
   505     if(iter->index<iter->limit) {
   506         return ((Replaceable *)(iter->context))->charAt(iter->index++);
   507     } else {
   508         return U_SENTINEL;
   509     }
   510 }
   512 static UChar32 U_CALLCONV
   513 replaceableIteratorPrevious(UCharIterator *iter) {
   514     if(iter->index>iter->start) {
   515         return ((Replaceable *)(iter->context))->charAt(--iter->index);
   516     } else {
   517         return U_SENTINEL;
   518     }
   519 }
   521 static const UCharIterator replaceableIterator={
   522     0, 0, 0, 0, 0, 0,
   523     stringIteratorGetIndex,
   524     stringIteratorMove,
   525     stringIteratorHasNext,
   526     stringIteratorHasPrevious,
   527     replaceableIteratorCurrent,
   528     replaceableIteratorNext,
   529     replaceableIteratorPrevious,
   530     NULL,
   531     stringIteratorGetState,
   532     stringIteratorSetState
   533 };
   535 U_CAPI void U_EXPORT2
   536 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep) {
   537     if(iter!=0) {
   538         if(rep!=0) {
   539             *iter=replaceableIterator;
   540             iter->context=rep;
   541             iter->limit=iter->length=rep->length();
   542         } else {
   543             *iter=noopIterator;
   544         }
   545     }
   546 }
   548 /* UCharIterator implementation for UTF-8 strings --------------------------- */
   550 /*
   551  * Possible, probably necessary only for an implementation for arbitrary
   552  * converters:
   553  * Maintain a buffer (ring buffer?) for a piece of converted 16-bit text.
   554  * This would require turning reservedFn into a close function and
   555  * introducing a uiter_close(iter).
   556  */
   /* NOTE(review): presumably the capacity of the conversion buffer sketched in the
    * comment above; no code in the visible part of this file uses it - confirm. */
   558 #define UITER_CNV_CAPACITY 16
   560 /*
   561  * Minimal implementation:
   562  * Maintain a single-UChar buffer for an additional surrogate.
   563  * The caller must not modify start and limit because they are used internally.
   564  *
   565  * Use UCharIterator fields as follows:
   566  *   context        pointer to UTF-8 string
   567  *   length         UTF-16 length of the string; -1 until lazy evaluation
   568  *   start          current UTF-8 index
   569  *   index          current UTF-16 index; may be -1="unknown" after setState()
   570  *   limit          UTF-8 length of the string
   571  *   reservedField  supplementary code point
   572  *
   573  * Since UCharIterator delivers 16-bit code units, the iteration can be
   574  * currently in the middle of the byte sequence for a supplementary code point.
   575  * In this case, reservedField will contain that code point and start will
   576  * point to after the corresponding byte sequence. The UTF-16 index will be
   577  * one less than what it would otherwise be corresponding to the UTF-8 index.
   578  * Otherwise, reservedField will be 0.
   579  */
   581 /*
   582  * Possible optimization for NUL-terminated UTF-8 and UTF-16 strings:
   583  * Add implementations that do not call strlen() for iteration but check for NUL.
   584  */
   /*
    * getIndex callback of the UTF-8 iterator.
    *
    * UITER_ZERO, UITER_START: always 0.
    * UITER_CURRENT: the current UTF-16 index. If it is unknown (iter->index<0),
    *   it is recomputed by decoding the UTF-8 text from offset 0 up to
    *   iter->start and is then cached in iter->index.
    * UITER_LIMIT, UITER_LENGTH: the UTF-16 length of the text. If it is unknown
    *   (iter->length<0), it is computed by decoding the rest of the text and is
    *   then cached in iter->length.
    * Any other origin: -1.
    *
    * Note that this "getter" may write iter->start, iter->index and iter->length.
    */
   586 static int32_t U_CALLCONV
   587 utf8IteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin) {
   588     switch(origin) {
   589     case UITER_ZERO:
   590     case UITER_START:
   591         return 0;
   592     case UITER_CURRENT:
   593         if(iter->index<0) {
   594             /* the current UTF-16 index is unknown after setState(), count from the beginning */
   595             const uint8_t *s;
   596             UChar32 c;
   597             int32_t i, limit, index;
   599             s=(const uint8_t *)iter->context;
   600             i=index=0;
   601             limit=iter->start; /* count up to the UTF-8 index */
                   /* each decoded code point adds 1 or 2 UTF-16 code units */
   602             while(i<limit) {
   603                 U8_NEXT_OR_FFFD(s, i, limit, c);
   604                 index+=U16_LENGTH(c);
   605             }
   607             iter->start=i; /* just in case setState() did not get us to a code point boundary */
   608             if(i==iter->limit) {
   609                 iter->length=index; /* in case it was <0 or wrong */
   610             }
   611             if(iter->reservedField!=0) {
   612                 --index; /* we are in the middle of a supplementary code point */
   613             }
   614             iter->index=index;
   615         }
   616         return iter->index;
   617     case UITER_LIMIT:
   618     case UITER_LENGTH:
   619         if(iter->length<0) {
   620             const uint8_t *s;
   621             UChar32 c;
   622             int32_t i, limit, length;
   624             s=(const uint8_t *)iter->context;
   625             if(iter->index<0) {
   626                 /*
   627                  * the current UTF-16 index is unknown after setState(),
   628                  * we must first count from the beginning to here
   629                  */
   630                 i=length=0;
   631                 limit=iter->start;
   633                 /* count from the beginning to the current index */
   634                 while(i<limit) {
   635                     U8_NEXT_OR_FFFD(s, i, limit, c);
   636                     length+=U16_LENGTH(c);
   637                 }
   639                 /* assume i==limit==iter->start, set the UTF-16 index */
   640                 iter->start=i; /* just in case setState() did not get us to a code point boundary */
   641                 iter->index= iter->reservedField!=0 ? length-1 : length;
   642             } else {
                       /* the index is known: start counting at the current position */
   643                 i=iter->start;
   644                 length=iter->index;
   645                 if(iter->reservedField!=0) {
                           /* iter->start is already behind the code point whose lead unit was delivered */
   646                     ++length;
   647                 }
   648             }
   650             /* count from the current index to the end */
   651             limit=iter->limit;
   652             while(i<limit) {
   653                 U8_NEXT_OR_FFFD(s, i, limit, c);
   654                 length+=U16_LENGTH(c);
   655             }
   656             iter->length=length;
   657         }
   658         return iter->length;
   659     default:
   660         /* not a valid origin */
   661         /* Should never get here! */
   662         return -1;
   663     }
   664 }
   /*
    * move callback of the UTF-8 iterator.
    *
    * Moves the iterator by delta UTF-16 code units relative to origin, pinning
    * to the beginning and the end of the text, and keeps the UTF-8 byte offset
    * (iter->start), the UTF-16 index (iter->index) and the pending
    * supplementary code point (iter->reservedField) consistent.
    *
    * Returns the new UTF-16 index, UITER_UNKNOWN_INDEX if that index is still
    * unknown after the move, or -1 for an unknown origin.
    *
    * The function works in three stages:
    *   1. turn (origin, delta) into an absolute UTF-16 position pos if the
    *      base index is known (havePos), otherwise keep only the relative delta;
    *   2. handle the cheap cases (pinning, no movement) and pick the closest
    *      known starting point;
    *   3. decode forward or backward one code point at a time until delta
    *      is used up.
    */
   666 static int32_t U_CALLCONV
   667 utf8IteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
   668     const uint8_t *s;
   669     UChar32 c;
   670     int32_t pos; /* requested UTF-16 index */
   671     int32_t i; /* UTF-8 index */
   672     UBool havePos;
   674     /* calculate the requested UTF-16 index */
   675     switch(origin) {
   676     case UITER_ZERO:
   677     case UITER_START:
   678         pos=delta;
   679         havePos=TRUE;
   680         /* iter->index<0 (unknown) is possible */
   681         break;
   682     case UITER_CURRENT:
   683         if(iter->index>=0) {
   684             pos=iter->index+delta;
   685             havePos=TRUE;
   686         } else {
   687             /* the current UTF-16 index is unknown after setState(), use only delta */
   688             pos=0;
   689             havePos=FALSE;
   690         }
   691         break;
   692     case UITER_LIMIT:
   693     case UITER_LENGTH:
   694         if(iter->length>=0) {
   695             pos=iter->length+delta;
   696             havePos=TRUE;
   697         } else {
   698             /* pin to the end, avoid counting the length */
   699             iter->index=-1;
   700             iter->start=iter->limit;
   701             iter->reservedField=0;
   702             if(delta>=0) {
   703                 return UITER_UNKNOWN_INDEX;
   704             } else {
   705                 /* the current UTF-16 index is unknown, use only delta */
   706                 pos=0;
   707                 havePos=FALSE;
   708             }
   709         }
   710         break;
   711     default:
   712         return -1;  /* Error */
   713     }
   715     if(havePos) {
   716         /* shortcuts: pinning to the edges of the string */
   717         if(pos<=0) {
   718             iter->index=iter->start=iter->reservedField=0;
   719             return 0;
   720         } else if(iter->length>=0 && pos>=iter->length) {
   721             iter->index=iter->length;
   722             iter->start=iter->limit;
   723             iter->reservedField=0;
   724             return iter->index;
   725         }
   727         /* minimize the number of U8_NEXT/PREV operations */
   728         if(iter->index<0 || pos<iter->index/2) {
   729             /* go forward from the start instead of backward from the current index */
   730             iter->index=iter->start=iter->reservedField=0;
   731         } else if(iter->length>=0 && (iter->length-pos)<(pos-iter->index)) {
   732             /*
   733              * if we have the UTF-16 index and length and the new position is
   734              * closer to the end than the current index,
   735              * then go backward from the end instead of forward from the current index
   736              */
   737             iter->index=iter->length;
   738             iter->start=iter->limit;
   739             iter->reservedField=0;
   740         }
               /* from here on delta is relative to the starting point chosen above */
   742         delta=pos-iter->index;
   743         if(delta==0) {
   744             return iter->index; /* nothing to do */
   745         }
   746     } else {
   747         /* move relative to unknown UTF-16 index */
               /* NOTE(review): -delta below overflows if delta is the most negative int32_t - confirm callers never pass it */
   748         if(delta==0) {
   749             return UITER_UNKNOWN_INDEX; /* nothing to do */
   750         } else if(-delta>=iter->start) {
   751             /* moving backwards by more UChars than there are UTF-8 bytes, pin to 0 */
   752             iter->index=iter->start=iter->reservedField=0;
   753             return 0;
   754         } else if(delta>=(iter->limit-iter->start)) {
   755             /* moving forward by more UChars than the remaining UTF-8 bytes, pin to the end */
   756             iter->index=iter->length; /* may or may not be <0 (unknown) */
   757             iter->start=iter->limit;
   758             iter->reservedField=0;
   759             return iter->index>=0 ? iter->index : (int32_t)UITER_UNKNOWN_INDEX;
   760         }
   761     }
   763     /* delta!=0 */
   765     /* move towards the requested position, pin to the edges of the string */
   766     s=(const uint8_t *)iter->context;
   767     pos=iter->index; /* could be <0 (unknown) */
   768     i=iter->start;
   769     if(delta>0) {
   770         /* go forward */
   771         int32_t limit=iter->limit;
   772         if(iter->reservedField!=0) {
                   /* consume the pending trail surrogate; its bytes are already behind i */
   773             iter->reservedField=0;
   774             ++pos;
   775             --delta;
   776         }
   777         while(delta>0 && i<limit) {
   778             U8_NEXT_OR_FFFD(s, i, limit, c);
   779             if(c<=0xffff) {
   780                 ++pos;
   781                 --delta;
   782             } else if(delta>=2) {
   783                 pos+=2;
   784                 delta-=2;
   785             } else /* delta==1 */ {
   786                 /* stop in the middle of a supplementary code point */
   787                 iter->reservedField=c;
   788                 ++pos;
   789                 break; /* delta=0; */
   790             }
   791         }
   792         if(i==limit) {
                   /* reached the end of the text: derive whichever of length/index was unknown from the other */
   793             if(iter->length<0 && iter->index>=0) {
   794                 iter->length= iter->reservedField==0 ? pos : pos+1;
   795             } else if(iter->index<0 && iter->length>=0) {
   796                 iter->index= iter->reservedField==0 ? iter->length : iter->length-1;
   797             }
   798         }
   799     } else /* delta<0 */ {
   800         /* go backward */
   801         if(iter->reservedField!=0) {
   802             iter->reservedField=0;
   803             i-=4; /* we stayed behind the supplementary code point; go before it now */
   804             --pos;
   805             ++delta;
   806         }
   807         while(delta<0 && i>0) {
   808             U8_PREV_OR_FFFD(s, 0, i, c);
   809             if(c<=0xffff) {
   810                 --pos;
   811                 ++delta;
   812             } else if(delta<=-2) {
   813                 pos-=2;
   814                 delta+=2;
   815             } else /* delta==-1 */ {
   816                 /* stop in the middle of a supplementary code point */
   817                 i+=4; /* back to behind this supplementary code point for consistent state */
   818                 iter->reservedField=c;
   819                 --pos;
   820                 break; /* delta=0; */
   821             }
   822         }
   823     }
   825     iter->start=i;
   826     if(iter->index>=0) {
   827         return iter->index=pos;
   828     } else {
   829         /* we started with index<0 (unknown) so pos is bogus */
   830         if(i<=1) {
   831             return iter->index=i; /* reached the beginning */
   832         } else {
   833             /* we still don't know the UTF-16 index */
   834             return UITER_UNKNOWN_INDEX;
   835         }
   836     }
   837 }
   839 static UBool U_CALLCONV
   840 utf8IteratorHasNext(UCharIterator *iter) {
   841     return iter->start<iter->limit || iter->reservedField!=0;
   842 }
   844 static UBool U_CALLCONV
   845 utf8IteratorHasPrevious(UCharIterator *iter) {
   846     return iter->start>0;
   847 }
   849 static UChar32 U_CALLCONV
   850 utf8IteratorCurrent(UCharIterator *iter) {
   851     if(iter->reservedField!=0) {
   852         return U16_TRAIL(iter->reservedField);
   853     } else if(iter->start<iter->limit) {
   854         const uint8_t *s=(const uint8_t *)iter->context;
   855         UChar32 c;
   856         int32_t i=iter->start;
   858         U8_NEXT_OR_FFFD(s, i, iter->limit, c);
   859         if(c<=0xffff) {
   860             return c;
   861         } else {
   862             return U16_LEAD(c);
   863         }
   864     } else {
   865         return U_SENTINEL;
   866     }
   867 }
   869 static UChar32 U_CALLCONV
   870 utf8IteratorNext(UCharIterator *iter) {
   871     int32_t index;
   873     if(iter->reservedField!=0) {
   874         UChar trail=U16_TRAIL(iter->reservedField);
   875         iter->reservedField=0;
   876         if((index=iter->index)>=0) {
   877             iter->index=index+1;
   878         }
   879         return trail;
   880     } else if(iter->start<iter->limit) {
   881         const uint8_t *s=(const uint8_t *)iter->context;
   882         UChar32 c;
   884         U8_NEXT_OR_FFFD(s, iter->start, iter->limit, c);
   885         if((index=iter->index)>=0) {
   886             iter->index=++index;
   887             if(iter->length<0 && iter->start==iter->limit) {
   888                 iter->length= c<=0xffff ? index : index+1;
   889             }
   890         } else if(iter->start==iter->limit && iter->length>=0) {
   891             iter->index= c<=0xffff ? iter->length : iter->length-1;
   892         }
   893         if(c<=0xffff) {
   894             return c;
   895         } else {
   896             iter->reservedField=c;
   897             return U16_LEAD(c);
   898         }
   899     } else {
   900         return U_SENTINEL;
   901     }
   902 }
   904 static UChar32 U_CALLCONV
   905 utf8IteratorPrevious(UCharIterator *iter) {
   906     int32_t index;
   908     if(iter->reservedField!=0) {
   909         UChar lead=U16_LEAD(iter->reservedField);
   910         iter->reservedField=0;
   911         iter->start-=4; /* we stayed behind the supplementary code point; go before it now */
   912         if((index=iter->index)>0) {
   913             iter->index=index-1;
   914         }
   915         return lead;
   916     } else if(iter->start>0) {
   917         const uint8_t *s=(const uint8_t *)iter->context;
   918         UChar32 c;
   920         U8_PREV_OR_FFFD(s, 0, iter->start, c);
   921         if((index=iter->index)>0) {
   922             iter->index=index-1;
   923         } else if(iter->start<=1) {
   924             iter->index= c<=0xffff ? iter->start : iter->start+1;
   925         }
   926         if(c<=0xffff) {
   927             return c;
   928         } else {
   929             iter->start+=4; /* back to behind this supplementary code point for consistent state */
   930             iter->reservedField=c;
   931             return U16_TRAIL(c);
   932         }
   933     } else {
   934         return U_SENTINEL;
   935     }
   936 }
   938 static uint32_t U_CALLCONV
   939 utf8IteratorGetState(const UCharIterator *iter) {
   940     uint32_t state=(uint32_t)(iter->start<<1);
   941     if(iter->reservedField!=0) {
   942         state|=1;
   943     }
   944     return state;
   945 }
   947 static void U_CALLCONV
   948 utf8IteratorSetState(UCharIterator *iter,
   949                      uint32_t state,
   950                      UErrorCode *pErrorCode)
   951 {
   952     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
   953         /* do nothing */
   954     } else if(iter==NULL) {
   955         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   956     } else if(state==utf8IteratorGetState(iter)) {
   957         /* setting to the current state: no-op */
   958     } else {
   959         int32_t index=(int32_t)(state>>1); /* UTF-8 index */
   960         state&=1; /* 1 if in surrogate pair, must be index>=4 */
   962         if((state==0 ? index<0 : index<4) || iter->limit<index) {
   963             *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   964         } else {
   965             iter->start=index; /* restore UTF-8 byte index */
   966             if(index<=1) {
   967                 iter->index=index;
   968             } else {
   969                 iter->index=-1; /* unknown UTF-16 index */
   970             }
   971             if(state==0) {
   972                 iter->reservedField=0;
   973             } else {
   974                 /* verified index>=4 above */
   975                 UChar32 c;
   976                 U8_PREV_OR_FFFD((const uint8_t *)iter->context, 0, index, c);
   977                 if(c<=0xffff) {
   978                     *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
   979                 } else {
   980                     iter->reservedField=c;
   981                 }
   982             }
   983         }
   984     }
   985 }
   987 static const UCharIterator utf8Iterator={
   988     0, 0, 0, 0, 0, 0,
   989     utf8IteratorGetIndex,
   990     utf8IteratorMove,
   991     utf8IteratorHasNext,
   992     utf8IteratorHasPrevious,
   993     utf8IteratorCurrent,
   994     utf8IteratorNext,
   995     utf8IteratorPrevious,
   996     NULL,
   997     utf8IteratorGetState,
   998     utf8IteratorSetState
   999 };
  1001 U_CAPI void U_EXPORT2
  1002 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length) {
  1003     if(iter!=0) {
  1004         if(s!=0 && length>=-1) {
  1005             *iter=utf8Iterator;
  1006             iter->context=s;
  1007             if(length>=0) {
  1008                 iter->limit=length;
  1009             } else {
  1010                 iter->limit=(int32_t)uprv_strlen(s);
  1012             iter->length= iter->limit<=1 ? iter->limit : -1;
  1013         } else {
  1014             *iter=noopIterator;
  1019 /* Helper functions --------------------------------------------------------- */
  1021 U_CAPI UChar32 U_EXPORT2
  1022 uiter_current32(UCharIterator *iter) {
  1023     UChar32 c, c2;
  1025     c=iter->current(iter);
  1026     if(U16_IS_SURROGATE(c)) {
  1027         if(U16_IS_SURROGATE_LEAD(c)) {
  1028             /*
  1029              * go to the next code unit
  1030              * we know that we are not at the limit because c!=U_SENTINEL
  1031              */
  1032             iter->move(iter, 1, UITER_CURRENT);
  1033             if(U16_IS_TRAIL(c2=iter->current(iter))) {
  1034                 c=U16_GET_SUPPLEMENTARY(c, c2);
  1037             /* undo index movement */
  1038             iter->move(iter, -1, UITER_CURRENT);
  1039         } else {
  1040             if(U16_IS_LEAD(c2=iter->previous(iter))) {
  1041                 c=U16_GET_SUPPLEMENTARY(c2, c);
  1043             if(c2>=0) {
  1044                 /* undo index movement */
  1045                 iter->move(iter, 1, UITER_CURRENT);
  1049     return c;
  1052 U_CAPI UChar32 U_EXPORT2
  1053 uiter_next32(UCharIterator *iter) {
  1054     UChar32 c, c2;
  1056     c=iter->next(iter);
  1057     if(U16_IS_LEAD(c)) {
  1058         if(U16_IS_TRAIL(c2=iter->next(iter))) {
  1059             c=U16_GET_SUPPLEMENTARY(c, c2);
  1060         } else if(c2>=0) {
  1061             /* unmatched first surrogate, undo index movement */
  1062             iter->move(iter, -1, UITER_CURRENT);
  1065     return c;
  1068 U_CAPI UChar32 U_EXPORT2
  1069 uiter_previous32(UCharIterator *iter) {
  1070     UChar32 c, c2;
  1072     c=iter->previous(iter);
  1073     if(U16_IS_TRAIL(c)) {
  1074         if(U16_IS_LEAD(c2=iter->previous(iter))) {
  1075             c=U16_GET_SUPPLEMENTARY(c2, c);
  1076         } else if(c2>=0) {
  1077             /* unmatched second surrogate, undo index movement */
  1078             iter->move(iter, 1, UITER_CURRENT);
  1081     return c;
  1084 U_CAPI uint32_t U_EXPORT2
  1085 uiter_getState(const UCharIterator *iter) {
  1086     if(iter==NULL || iter->getState==NULL) {
  1087         return UITER_NO_STATE;
  1088     } else {
  1089         return iter->getState(iter);
  1093 U_CAPI void U_EXPORT2
  1094 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode) {
  1095     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
  1096         /* do nothing */
  1097     } else if(iter==NULL) {
  1098         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1099     } else if(iter->setState==NULL) {
  1100         *pErrorCode=U_UNSUPPORTED_ERROR;
  1101     } else {
  1102         iter->setState(iter, state, pErrorCode);
  1106 U_CDECL_END

mercurial