intl/icu/source/common/ustrtrns.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*
     2 ******************************************************************************
     3 *
     4 *   Copyright (C) 2001-2013, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 ******************************************************************************
     8 *
     9 * File ustrtrns.cpp
    10 *
    11 * Modification History:
    12 *
    13 *   Date        Name        Description
    14 *   9/10/2001    Ram    Creation.
    15 ******************************************************************************
    16 */
    18 /*******************************************************************************
    19  *
    20  * u_strTo* and u_strFrom* APIs
    21  * WCS functions moved to ustr_wcs.c for better modularization
    22  *
    23  *******************************************************************************
    24  */
    27 #include "unicode/putil.h"
    28 #include "unicode/ustring.h"
    29 #include "unicode/utf.h"
    30 #include "unicode/utf8.h"
    31 #include "unicode/utf16.h"
    32 #include "cstring.h"
    33 #include "cmemory.h"
    34 #include "ustr_imp.h"
    35 #include "uassert.h"
    37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
    39 U_CAPI UChar* U_EXPORT2 
    40 u_strFromUTF32WithSub(UChar *dest,
    41                int32_t destCapacity,
    42                int32_t *pDestLength,
    43                const UChar32 *src,
    44                int32_t srcLength,
    45                UChar32 subchar, int32_t *pNumSubstitutions,
    46                UErrorCode *pErrorCode) {
    47     const UChar32 *srcLimit;
    48     UChar32 ch;
    49     UChar *destLimit;
    50     UChar *pDest;
    51     int32_t reqLength;
    52     int32_t numSubstitutions;
    54     /* args check */
    55     if(U_FAILURE(*pErrorCode)){
    56         return NULL;
    57     }
    58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    61     ) {
    62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    63         return NULL;
    64     }
    66     if(pNumSubstitutions != NULL) {
    67         *pNumSubstitutions = 0;
    68     }
    70     pDest = dest;
    71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
    72     reqLength = 0;
    73     numSubstitutions = 0;
    75     if(srcLength < 0) {
    76         /* simple loop for conversion of a NUL-terminated BMP string */
    77         while((ch=*src) != 0 &&
    78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
    79             ++src;
    80             if(pDest < destLimit) {
    81                 *pDest++ = (UChar)ch;
    82             } else {
    83                 ++reqLength;
    84             }
    85         }
    86         srcLimit = src;
    87         if(ch != 0) {
    88             /* "complicated" case, find the end of the remaining string */
    89             while(*++srcLimit != 0) {}
    90         }
    91     } else {
    92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
    93     }
    95     /* convert with length */
    96     while(src < srcLimit) {
    97         ch = *src++;
    98         do {
    99             /* usually "loops" once; twice only for writing subchar */
   100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
   101                 if(pDest < destLimit) {
   102                     *pDest++ = (UChar)ch;
   103                 } else {
   104                     ++reqLength;
   105                 }
   106                 break;
   107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
   108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
   109                     *pDest++ = U16_LEAD(ch);
   110                     *pDest++ = U16_TRAIL(ch);
   111                 } else {
   112                     reqLength += 2;
   113                 }
   114                 break;
   115             } else if((ch = subchar) < 0) {
   116                 /* surrogate code point, or not a Unicode code point at all */
   117                 *pErrorCode = U_INVALID_CHAR_FOUND;
   118                 return NULL;
   119             } else {
   120                 ++numSubstitutions;
   121             }
   122         } while(TRUE);
   123     }
   125     reqLength += (int32_t)(pDest - dest);
   126     if(pDestLength) {
   127         *pDestLength = reqLength;
   128     }
   129     if(pNumSubstitutions != NULL) {
   130         *pNumSubstitutions = numSubstitutions;
   131     }
   133     /* Terminate the buffer */
   134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   136     return dest;
   137 }
   139 U_CAPI UChar* U_EXPORT2 
   140 u_strFromUTF32(UChar *dest,
   141                int32_t destCapacity, 
   142                int32_t *pDestLength,
   143                const UChar32 *src,
   144                int32_t srcLength,
   145                UErrorCode *pErrorCode) {
   146     return u_strFromUTF32WithSub(
   147             dest, destCapacity, pDestLength,
   148             src, srcLength,
   149             U_SENTINEL, NULL,
   150             pErrorCode);
   151 }
   153 U_CAPI UChar32* U_EXPORT2 
   154 u_strToUTF32WithSub(UChar32 *dest,
   155              int32_t destCapacity,
   156              int32_t *pDestLength,
   157              const UChar *src,
   158              int32_t srcLength,
   159              UChar32 subchar, int32_t *pNumSubstitutions,
   160              UErrorCode *pErrorCode) {
   161     const UChar *srcLimit;
   162     UChar32 ch;
   163     UChar ch2;
   164     UChar32 *destLimit;
   165     UChar32 *pDest;
   166     int32_t reqLength;
   167     int32_t numSubstitutions;
   169     /* args check */
   170     if(U_FAILURE(*pErrorCode)){
   171         return NULL;
   172     }
   173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
   175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   176     ) {
   177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   178         return NULL;
   179     }
   181     if(pNumSubstitutions != NULL) {
   182         *pNumSubstitutions = 0;
   183     }
   185     pDest = dest;
   186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
   187     reqLength = 0;
   188     numSubstitutions = 0;
   190     if(srcLength < 0) {
   191         /* simple loop for conversion of a NUL-terminated BMP string */
   192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
   193             ++src;
   194             if(pDest < destLimit) {
   195                 *pDest++ = ch;
   196             } else {
   197                 ++reqLength;
   198             }
   199         }
   200         srcLimit = src;
   201         if(ch != 0) {
   202             /* "complicated" case, find the end of the remaining string */
   203             while(*++srcLimit != 0) {}
   204         }
   205     } else {
   206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
   207     }
   209     /* convert with length */
   210     while(src < srcLimit) {
   211         ch = *src++;
   212         if(!U16_IS_SURROGATE(ch)) {
   213             /* write or count ch below */
   214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
   215             ++src;
   216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
   217         } else if((ch = subchar) < 0) {
   218             /* unpaired surrogate */
   219             *pErrorCode = U_INVALID_CHAR_FOUND;
   220             return NULL;
   221         } else {
   222             ++numSubstitutions;
   223         }
   224         if(pDest < destLimit) {
   225             *pDest++ = ch;
   226         } else {
   227             ++reqLength;
   228         }
   229     }
   231     reqLength += (int32_t)(pDest - dest);
   232     if(pDestLength) {
   233         *pDestLength = reqLength;
   234     }
   235     if(pNumSubstitutions != NULL) {
   236         *pNumSubstitutions = numSubstitutions;
   237     }
   239     /* Terminate the buffer */
   240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
   242     return dest;
   243 }
   245 U_CAPI UChar32* U_EXPORT2 
   246 u_strToUTF32(UChar32 *dest, 
   247              int32_t destCapacity,
   248              int32_t *pDestLength,
   249              const UChar *src, 
   250              int32_t srcLength,
   251              UErrorCode *pErrorCode) {
   252     return u_strToUTF32WithSub(
   253             dest, destCapacity, pDestLength,
   254             src, srcLength,
   255             U_SENTINEL, NULL,
   256             pErrorCode);
   257 }
   259 /* for utf8_nextCharSafeBodyTerminated() */
   260 static const UChar32
   261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
   263 /*
   264  * Version of utf8_nextCharSafeBody() with the following differences:
   265  * - checks for NUL termination instead of length
   266  * - works with pointers instead of indexes
   267  * - always strict (strict==-1)
   268  *
   269  * *ps points to after the lead byte and will be moved to after the last trail byte.
   270  * c is the lead byte.
   271  * @return the code point, or U_SENTINEL
   272  */
   273 static UChar32
   274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
   275     const uint8_t *s=*ps;
   276     uint8_t trail, illegal=0;
   277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
   278     U_ASSERT(count<6);
   279     U8_MASK_LEAD_BYTE((c), count);
   280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
   281     switch(count) {
   282     /* each branch falls through to the next one */
   283     case 5:
   284     case 4:
   285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
   286         illegal=1;
   287         break;
   288     case 3:
   289         trail=(uint8_t)(*s++ - 0x80);
   290         c=(c<<6)|trail;
   291         if(trail>0x3f || c>=0x110) {
   292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
   293             illegal=1;
   294             break;
   295         }
   296     case 2: /*fall through*/
   297         trail=(uint8_t)(*s++ - 0x80);
   298         if(trail>0x3f) {
   299             /* not a trail byte */
   300             illegal=1;
   301             break;
   302         }
   303         c=(c<<6)|trail;
   304     case 1: /*fall through*/
   305         trail=(uint8_t)(*s++ - 0x80);
   306         if(trail>0x3f) {
   307             /* not a trail byte */
   308             illegal=1;
   309         }
   310         c=(c<<6)|trail;
   311         break;
   312     case 0:
   313         return U_SENTINEL;
   314     /* no default branch to optimize switch()  - all values are covered */
   315     }
   317     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
   318     /* illegal is also set if count>=4 */
   319     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
   320         /* error handling */
   321         /* don't go beyond this sequence */
   322         s=*ps;
   323         while(count>0 && U8_IS_TRAIL(*s)) {
   324             ++s;
   325             --count;
   326         }
   327         c=U_SENTINEL;
   328     }
   329     *ps=s;
   330     return c;
   331 }
   333 /*
   334  * Version of utf8_nextCharSafeBody() with the following differences:
   335  * - works with pointers instead of indexes
   336  * - always strict (strict==-1)
   337  *
   338  * *ps points to after the lead byte and will be moved to after the last trail byte.
   339  * c is the lead byte.
   340  * @return the code point, or U_SENTINEL
   341  */
   342 static UChar32
   343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
   344     const uint8_t *s=*ps;
   345     uint8_t trail, illegal=0;
   346     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
   347     if((limit-s)>=count) {
   348         U8_MASK_LEAD_BYTE((c), count);
   349         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
   350         switch(count) {
   351         /* each branch falls through to the next one */
   352         case 5:
   353         case 4:
   354             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
   355             illegal=1;
   356             break;
   357         case 3:
   358             trail=*s++;
   359             c=(c<<6)|(trail&0x3f);
   360             if(c<0x110) {
   361                 illegal|=(trail&0xc0)^0x80;
   362             } else {
   363                 /* code point>0x10ffff, outside Unicode */
   364                 illegal=1;
   365                 break;
   366             }
   367         case 2: /*fall through*/
   368             trail=*s++;
   369             c=(c<<6)|(trail&0x3f);
   370             illegal|=(trail&0xc0)^0x80;
   371         case 1: /*fall through*/
   372             trail=*s++;
   373             c=(c<<6)|(trail&0x3f);
   374             illegal|=(trail&0xc0)^0x80;
   375             break;
   376         case 0:
   377             return U_SENTINEL;
   378         /* no default branch to optimize switch()  - all values are covered */
   379         }
   380     } else {
   381         illegal=1; /* too few bytes left */
   382     }
   384     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
   385     /* illegal is also set if count>=4 */
   386     U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
   387     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
   388         /* error handling */
   389         /* don't go beyond this sequence */
   390         s=*ps;
   391         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
   392             ++s;
   393             --count;
   394         }
   395         c=U_SENTINEL;
   396     }
   397     *ps=s;
   398     return c;
   399 }
   401 U_CAPI UChar* U_EXPORT2
   402 u_strFromUTF8WithSub(UChar *dest,
   403               int32_t destCapacity,
   404               int32_t *pDestLength,
   405               const char* src,
   406               int32_t srcLength,
   407               UChar32 subchar, int32_t *pNumSubstitutions,
   408               UErrorCode *pErrorCode){
   409     UChar *pDest = dest;
   410     UChar *pDestLimit = dest+destCapacity;
   411     UChar32 ch;
   412     int32_t reqLength = 0;
   413     const uint8_t* pSrc = (const uint8_t*) src;
   414     uint8_t t1, t2; /* trail bytes */
   415     int32_t numSubstitutions;
   417     /* args check */
   418     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
   419         return NULL;
   420     }
   422     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   423         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
   424         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   425     ) {
   426         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   427         return NULL;
   428     }
   430     if(pNumSubstitutions!=NULL) {
   431         *pNumSubstitutions=0;
   432     }
   433     numSubstitutions=0;
   435     /*
   436      * Inline processing of UTF-8 byte sequences:
   437      *
   438      * Byte sequences for the most common characters are handled inline in
   439      * the conversion loops. In order to reduce the path lengths for those
   440      * characters, the tests are arranged in a kind of binary search.
   441      * ASCII (<=0x7f) is checked first, followed by the dividing point
   442      * between 2- and 3-byte sequences (0xe0).
   443      * The 3-byte branch is tested first to speed up CJK text.
   444      * The compiler should combine the subtractions for the two tests for 0xe0.
   445      * Each branch then tests for the other end of its range.
   446      */
   448     if(srcLength < 0){
   449         /*
   450          * Transform a NUL-terminated string.
   451          * The code explicitly checks for NULs only in the lead byte position.
   452          * A NUL byte in the trail byte position fails the trail byte range check anyway.
   453          */
   454         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
   455             if(ch <= 0x7f){
   456                 *pDest++=(UChar)ch;
   457                 ++pSrc;
   458             } else {
   459                 if(ch > 0xe0) {
   460                     if( /* handle U+1000..U+CFFF inline */
   461                         ch <= 0xec &&
   462                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   463                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   464                     ) {
   465                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   466                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   467                         pSrc += 3;
   468                         continue;
   469                     }
   470                 } else if(ch < 0xe0) {
   471                     if( /* handle U+0080..U+07FF inline */
   472                         ch >= 0xc2 &&
   473                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   474                     ) {
   475                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   476                         pSrc += 2;
   477                         continue;
   478                     }
   479                 }
   481                 /* function call for "complicated" and error cases */
   482                 ++pSrc; /* continue after the lead byte */
   483                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
   484                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
   485                     *pErrorCode = U_INVALID_CHAR_FOUND;
   486                     return NULL;
   487                 } else if(ch<=0xFFFF) {
   488                     *(pDest++)=(UChar)ch;
   489                 } else {
   490                     *(pDest++)=U16_LEAD(ch);
   491                     if(pDest<pDestLimit) {
   492                         *(pDest++)=U16_TRAIL(ch);
   493                     } else {
   494                         reqLength++;
   495                         break;
   496                     }
   497                 }
   498             }
   499         }
   501         /* Pre-flight the rest of the string. */
   502         while((ch = *pSrc) != 0) {
   503             if(ch <= 0x7f){
   504                 ++reqLength;
   505                 ++pSrc;
   506             } else {
   507                 if(ch > 0xe0) {
   508                     if( /* handle U+1000..U+CFFF inline */
   509                         ch <= 0xec &&
   510                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   511                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   512                     ) {
   513                         ++reqLength;
   514                         pSrc += 3;
   515                         continue;
   516                     }
   517                 } else if(ch < 0xe0) {
   518                     if( /* handle U+0080..U+07FF inline */
   519                         ch >= 0xc2 &&
   520                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   521                     ) {
   522                         ++reqLength;
   523                         pSrc += 2;
   524                         continue;
   525                     }
   526                 }
   528                 /* function call for "complicated" and error cases */
   529                 ++pSrc; /* continue after the lead byte */
   530                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
   531                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
   532                     *pErrorCode = U_INVALID_CHAR_FOUND;
   533                     return NULL;
   534                 }
   535                 reqLength += U16_LENGTH(ch);
   536             }
   537         }
   538     } else /* srcLength >= 0 */ {
   539         const uint8_t *pSrcLimit = pSrc + srcLength;
   540         int32_t count;
   542         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   543         for(;;) {
   544             /*
   545              * Each iteration of the inner loop progresses by at most 3 UTF-8
   546              * bytes and one UChar, for most characters.
   547              * For supplementary code points (4 & 2), which are rare,
   548              * there is an additional adjustment.
   549              */
   550             count = (int32_t)(pDestLimit - pDest);
   551             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
   552             if(count > srcLength) {
   553                 count = srcLength; /* min(remaining dest, remaining src/3) */
   554             }
   555             if(count < 3) {
   556                 /*
   557                  * Too much overhead if we get near the end of the string,
   558                  * continue with the next loop.
   559                  */
   560                 break;
   561             }
   563             do {
   564                 ch = *pSrc;
   565                 if(ch <= 0x7f){
   566                     *pDest++=(UChar)ch;
   567                     ++pSrc;
   568                 } else {
   569                     if(ch > 0xe0) {
   570                         if( /* handle U+1000..U+CFFF inline */
   571                             ch <= 0xec &&
   572                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   573                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   574                         ) {
   575                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   576                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   577                             pSrc += 3;
   578                             continue;
   579                         }
   580                     } else if(ch < 0xe0) {
   581                         if( /* handle U+0080..U+07FF inline */
   582                             ch >= 0xc2 &&
   583                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   584                         ) {
   585                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   586                             pSrc += 2;
   587                             continue;
   588                         }
   589                     }
   591                     if(ch >= 0xf0 || subchar > 0xffff) {
   592                         /*
   593                          * We may read up to six bytes and write up to two UChars,
   594                          * which we didn't account for with computing count,
   595                          * so we adjust it here.
   596                          */
   597                         if(--count == 0) {
   598                             break;
   599                         }
   600                     }
   602                     /* function call for "complicated" and error cases */
   603                     ++pSrc; /* continue after the lead byte */
   604                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   605                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
   606                         *pErrorCode = U_INVALID_CHAR_FOUND;
   607                         return NULL;
   608                     }else if(ch<=0xFFFF){
   609                         *(pDest++)=(UChar)ch;
   610                     }else{
   611                         *(pDest++)=U16_LEAD(ch);
   612                         *(pDest++)=U16_TRAIL(ch);
   613                     }
   614                 }
   615             } while(--count > 0);
   616         }
   618         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
   619             ch = *pSrc;
   620             if(ch <= 0x7f){
   621                 *pDest++=(UChar)ch;
   622                 ++pSrc;
   623             } else {
   624                 if(ch > 0xe0) {
   625                     if( /* handle U+1000..U+CFFF inline */
   626                         ch <= 0xec &&
   627                         ((pSrcLimit - pSrc) >= 3) &&
   628                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   629                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   630                     ) {
   631                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   632                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   633                         pSrc += 3;
   634                         continue;
   635                     }
   636                 } else if(ch < 0xe0) {
   637                     if( /* handle U+0080..U+07FF inline */
   638                         ch >= 0xc2 &&
   639                         ((pSrcLimit - pSrc) >= 2) &&
   640                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   641                     ) {
   642                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   643                         pSrc += 2;
   644                         continue;
   645                     }
   646                 }
   648                 /* function call for "complicated" and error cases */
   649                 ++pSrc; /* continue after the lead byte */
   650                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   651                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
   652                     *pErrorCode = U_INVALID_CHAR_FOUND;
   653                     return NULL;
   654                 }else if(ch<=0xFFFF){
   655                     *(pDest++)=(UChar)ch;
   656                 }else{
   657                     *(pDest++)=U16_LEAD(ch);
   658                     if(pDest<pDestLimit){
   659                         *(pDest++)=U16_TRAIL(ch);
   660                     }else{
   661                         reqLength++;
   662                         break;
   663                     }
   664                 }
   665             }
   666         }
   667         /* do not fill the dest buffer just count the UChars needed */
   668         while(pSrc < pSrcLimit){
   669             ch = *pSrc;
   670             if(ch <= 0x7f){
   671                 reqLength++;
   672                 ++pSrc;
   673             } else {
   674                 if(ch > 0xe0) {
   675                     if( /* handle U+1000..U+CFFF inline */
   676                         ch <= 0xec &&
   677                         ((pSrcLimit - pSrc) >= 3) &&
   678                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   679                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   680                     ) {
   681                         reqLength++;
   682                         pSrc += 3;
   683                         continue;
   684                     }
   685                 } else if(ch < 0xe0) {
   686                     if( /* handle U+0080..U+07FF inline */
   687                         ch >= 0xc2 &&
   688                         ((pSrcLimit - pSrc) >= 2) &&
   689                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   690                     ) {
   691                         reqLength++;
   692                         pSrc += 2;
   693                         continue;
   694                     }
   695                 }
   697                 /* function call for "complicated" and error cases */
   698                 ++pSrc; /* continue after the lead byte */
   699                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   700                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
   701                     *pErrorCode = U_INVALID_CHAR_FOUND;
   702                     return NULL;
   703                 }
   704                 reqLength+=U16_LENGTH(ch);
   705             }
   706         }
   707     }
   709     reqLength+=(int32_t)(pDest - dest);
   711     if(pNumSubstitutions!=NULL) {
   712         *pNumSubstitutions=numSubstitutions;
   713     }
   715     if(pDestLength){
   716         *pDestLength = reqLength;
   717     }
   719     /* Terminate the buffer */
   720     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
   722     return dest;
   723 }
   725 U_CAPI UChar* U_EXPORT2
   726 u_strFromUTF8(UChar *dest,
   727               int32_t destCapacity,
   728               int32_t *pDestLength,
   729               const char* src,
   730               int32_t srcLength,
   731               UErrorCode *pErrorCode){
   732     return u_strFromUTF8WithSub(
   733             dest, destCapacity, pDestLength,
   734             src, srcLength,
   735             U_SENTINEL, NULL,
   736             pErrorCode);
   737 }
   739 U_CAPI UChar * U_EXPORT2
   740 u_strFromUTF8Lenient(UChar *dest,
   741                      int32_t destCapacity,
   742                      int32_t *pDestLength,
   743                      const char *src,
   744                      int32_t srcLength,
   745                      UErrorCode *pErrorCode) {
   746     UChar *pDest = dest;
   747     UChar32 ch;
   748     int32_t reqLength = 0;
   749     uint8_t* pSrc = (uint8_t*) src;
   751     /* args check */
   752     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
   753         return NULL;
   754     }
   756     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   757         (destCapacity<0) || (dest == NULL && destCapacity > 0)
   758     ) {
   759         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   760         return NULL;
   761     }
   763     if(srcLength < 0) {
   764         /* Transform a NUL-terminated string. */
   765         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
   766         uint8_t t1, t2, t3; /* trail bytes */
   768         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
   769             if(ch < 0xc0) {
   770                 /*
   771                  * ASCII, or a trail byte in lead position which is treated like
   772                  * a single-byte sequence for better character boundary
   773                  * resynchronization after illegal sequences.
   774                  */
   775                 *pDest++=(UChar)ch;
   776                 ++pSrc;
   777                 continue;
   778             } else if(ch < 0xe0) { /* U+0080..U+07FF */
   779                 if((t1 = pSrc[1]) != 0) {
   780                     /* 0x3080 = (0xc0 << 6) + 0x80 */
   781                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
   782                     pSrc += 2;
   783                     continue;
   784                 }
   785             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
   786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
   787                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   788                     /* 0x2080 = (0x80 << 6) + 0x80 */
   789                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
   790                     pSrc += 3;
   791                     continue;
   792                 }
   793             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
   794                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
   795                     pSrc += 4;
   796                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
   797                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
   798                     *(pDest++) = U16_LEAD(ch);
   799                     if(pDest < pDestLimit) {
   800                         *(pDest++) = U16_TRAIL(ch);
   801                     } else {
   802                         reqLength = 1;
   803                         break;
   804                     }
   805                     continue;
   806                 }
   807             }
   809             /* truncated character at the end */
   810             *pDest++ = 0xfffd;
   811             while(*++pSrc != 0) {}
   812             break;
   813         }
   815         /* Pre-flight the rest of the string. */
   816         while((ch = *pSrc) != 0) {
   817             if(ch < 0xc0) {
   818                 /*
   819                  * ASCII, or a trail byte in lead position which is treated like
   820                  * a single-byte sequence for better character boundary
   821                  * resynchronization after illegal sequences.
   822                  */
   823                 ++reqLength;
   824                 ++pSrc;
   825                 continue;
   826             } else if(ch < 0xe0) { /* U+0080..U+07FF */
   827                 if(pSrc[1] != 0) {
   828                     ++reqLength;
   829                     pSrc += 2;
   830                     continue;
   831                 }
   832             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
   833                 if(pSrc[1] != 0 && pSrc[2] != 0) {
   834                     ++reqLength;
   835                     pSrc += 3;
   836                     continue;
   837                 }
   838             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
   839                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
   840                     reqLength += 2;
   841                     pSrc += 4;
   842                     continue;
   843                 }
   844             }
   846             /* truncated character at the end */
   847             ++reqLength;
   848             break;
   849         }
   850     } else /* srcLength >= 0 */ {
   851       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
   853         /*
   854          * This function requires that if srcLength is given, then it must be
   855          * destCapatity >= srcLength so that we need not check for
   856          * destination buffer overflow in the loop.
   857          */
   858         if(destCapacity < srcLength) {
   859             if(pDestLength != NULL) {
   860                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
   861             }
   862             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
   863             return NULL;
   864         }
   866         if((pSrcLimit - pSrc) >= 4) {
   867             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
   869             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
   870             do {
   871                 ch = *pSrc++;
   872                 if(ch < 0xc0) {
   873                     /*
   874                      * ASCII, or a trail byte in lead position which is treated like
   875                      * a single-byte sequence for better character boundary
   876                      * resynchronization after illegal sequences.
   877                      */
   878                     *pDest++=(UChar)ch;
   879                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
   880                     /* 0x3080 = (0xc0 << 6) + 0x80 */
   881                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
   882                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
   883                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   884                     /* 0x2080 = (0x80 << 6) + 0x80 */
   885                     ch = (ch << 12) + (*pSrc++ << 6);
   886                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
   887                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
   888                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
   889                     ch = (ch << 18) + (*pSrc++ << 12);
   890                     ch += *pSrc++ << 6;
   891                     ch += *pSrc++ - 0x3c82080;
   892                     *(pDest++) = U16_LEAD(ch);
   893                     *(pDest++) = U16_TRAIL(ch);
   894                 }
   895             } while(pSrc < pSrcLimit);
   897             pSrcLimit += 3; /* restore original pSrcLimit */
   898         }
   900         while(pSrc < pSrcLimit) {
   901             ch = *pSrc++;
   902             if(ch < 0xc0) {
   903                 /*
   904                  * ASCII, or a trail byte in lead position which is treated like
   905                  * a single-byte sequence for better character boundary
   906                  * resynchronization after illegal sequences.
   907                  */
   908                 *pDest++=(UChar)ch;
   909                 continue;
   910             } else if(ch < 0xe0) { /* U+0080..U+07FF */
   911                 if(pSrc < pSrcLimit) {
   912                     /* 0x3080 = (0xc0 << 6) + 0x80 */
   913                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
   914                     continue;
   915                 }
   916             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
   917                 if((pSrcLimit - pSrc) >= 2) {
   918                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   919                     /* 0x2080 = (0x80 << 6) + 0x80 */
   920                     ch = (ch << 12) + (*pSrc++ << 6);
   921                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
   922                     pSrc += 3;
   923                     continue;
   924                 }
   925             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
   926                 if((pSrcLimit - pSrc) >= 3) {
   927                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
   928                     ch = (ch << 18) + (*pSrc++ << 12);
   929                     ch += *pSrc++ << 6;
   930                     ch += *pSrc++ - 0x3c82080;
   931                     *(pDest++) = U16_LEAD(ch);
   932                     *(pDest++) = U16_TRAIL(ch);
   933                     pSrc += 4;
   934                     continue;
   935                 }
   936             }
   938             /* truncated character at the end */
   939             *pDest++ = 0xfffd;
   940             break;
   941         }
   942     }
   944     reqLength+=(int32_t)(pDest - dest);
   946     if(pDestLength){
   947         *pDestLength = reqLength;
   948     }
   950     /* Terminate the buffer */
   951     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
   953     return dest;
   954 }
   956 static inline uint8_t *
   957 _appendUTF8(uint8_t *pDest, UChar32 c) {
   958     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
   959     if((c)<=0x7f) {
   960         *pDest++=(uint8_t)c;
   961     } else if(c<=0x7ff) {
   962         *pDest++=(uint8_t)((c>>6)|0xc0);
   963         *pDest++=(uint8_t)((c&0x3f)|0x80);
   964     } else if(c<=0xffff) {
   965         *pDest++=(uint8_t)((c>>12)|0xe0);
   966         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
   967         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
   968     } else /* if((uint32_t)(c)<=0x10ffff) */ {
   969         *pDest++=(uint8_t)(((c)>>18)|0xf0);
   970         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
   971         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
   972         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
   973     }
   974     return pDest;
   975 }
   978 U_CAPI char* U_EXPORT2 
   979 u_strToUTF8WithSub(char *dest,
   980             int32_t destCapacity,
   981             int32_t *pDestLength,
   982             const UChar *pSrc,
   983             int32_t srcLength,
   984             UChar32 subchar, int32_t *pNumSubstitutions,
   985             UErrorCode *pErrorCode){
   986     int32_t reqLength=0;
   987     uint32_t ch=0,ch2=0;
   988     uint8_t *pDest = (uint8_t *)dest;
   989     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
   990     int32_t numSubstitutions;
   992     /* args check */
   993     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
   994         return NULL;
   995     }
   997     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
   998         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
   999         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  1000     ) {
  1001         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1002         return NULL;
  1005     if(pNumSubstitutions!=NULL) {
  1006         *pNumSubstitutions=0;
  1008     numSubstitutions=0;
  1010     if(srcLength==-1) {
  1011         while((ch=*pSrc)!=0) {
  1012             ++pSrc;
  1013             if(ch <= 0x7f) {
  1014                 if(pDest<pDestLimit) {
  1015                     *pDest++ = (uint8_t)ch;
  1016                 } else {
  1017                     reqLength = 1;
  1018                     break;
  1020             } else if(ch <= 0x7ff) {
  1021                 if((pDestLimit - pDest) >= 2) {
  1022                     *pDest++=(uint8_t)((ch>>6)|0xc0);
  1023                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1024                 } else {
  1025                     reqLength = 2;
  1026                     break;
  1028             } else if(ch <= 0xd7ff || ch >= 0xe000) {
  1029                 if((pDestLimit - pDest) >= 3) {
  1030                     *pDest++=(uint8_t)((ch>>12)|0xe0);
  1031                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1032                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1033                 } else {
  1034                     reqLength = 3;
  1035                     break;
  1037             } else /* ch is a surrogate */ {
  1038                 int32_t length;
  1040                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
  1041                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
  1042                     ++pSrc;
  1043                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  1044                 } else if(subchar>=0) {
  1045                     ch=subchar;
  1046                     ++numSubstitutions;
  1047                 } else {
  1048                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  1049                     *pErrorCode = U_INVALID_CHAR_FOUND;
  1050                     return NULL;
  1053                 length = U8_LENGTH(ch);
  1054                 if((pDestLimit - pDest) >= length) {
  1055                     /* convert and append*/
  1056                     pDest=_appendUTF8(pDest, ch);
  1057                 } else {
  1058                     reqLength = length;
  1059                     break;
  1063         while((ch=*pSrc++)!=0) {
  1064             if(ch<=0x7f) {
  1065                 ++reqLength;
  1066             } else if(ch<=0x7ff) {
  1067                 reqLength+=2;
  1068             } else if(!U16_IS_SURROGATE(ch)) {
  1069                 reqLength+=3;
  1070             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
  1071                 ++pSrc;
  1072                 reqLength+=4;
  1073             } else if(subchar>=0) {
  1074                 reqLength+=U8_LENGTH(subchar);
  1075                 ++numSubstitutions;
  1076             } else {
  1077                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  1078                 *pErrorCode = U_INVALID_CHAR_FOUND;
  1079                 return NULL;
  1082     } else {
  1083         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
  1084         int32_t count;
  1086         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  1087         for(;;) {
  1088             /*
  1089              * Each iteration of the inner loop progresses by at most 3 UTF-8
  1090              * bytes and one UChar, for most characters.
  1091              * For supplementary code points (4 & 2), which are rare,
  1092              * there is an additional adjustment.
  1093              */
  1094             count = (int32_t)((pDestLimit - pDest) / 3);
  1095             srcLength = (int32_t)(pSrcLimit - pSrc);
  1096             if(count > srcLength) {
  1097                 count = srcLength; /* min(remaining dest/3, remaining src) */
  1099             if(count < 3) {
  1100                 /*
  1101                  * Too much overhead if we get near the end of the string,
  1102                  * continue with the next loop.
  1103                  */
  1104                 break;
  1106             do {
  1107                 ch=*pSrc++;
  1108                 if(ch <= 0x7f) {
  1109                     *pDest++ = (uint8_t)ch;
  1110                 } else if(ch <= 0x7ff) {
  1111                     *pDest++=(uint8_t)((ch>>6)|0xc0);
  1112                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1113                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
  1114                     *pDest++=(uint8_t)((ch>>12)|0xe0);
  1115                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1116                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1117                 } else /* ch is a surrogate */ {
  1118                     /*
  1119                      * We will read two UChars and probably output four bytes,
  1120                      * which we didn't account for with computing count,
  1121                      * so we adjust it here.
  1122                      */
  1123                     if(--count == 0) {
  1124                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
  1125                         break;  /* recompute count */
  1128                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 
  1129                         ++pSrc;
  1130                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  1132                         /* writing 4 bytes per 2 UChars is ok */
  1133                         *pDest++=(uint8_t)((ch>>18)|0xf0);
  1134                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
  1135                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1136                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1137                     } else  {
  1138                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  1139                         if(subchar>=0) {
  1140                             ch=subchar;
  1141                             ++numSubstitutions;
  1142                         } else {
  1143                             *pErrorCode = U_INVALID_CHAR_FOUND;
  1144                             return NULL;
  1147                         /* convert and append*/
  1148                         pDest=_appendUTF8(pDest, ch);
  1151             } while(--count > 0);
  1154         while(pSrc<pSrcLimit) {
  1155             ch=*pSrc++;
  1156             if(ch <= 0x7f) {
  1157                 if(pDest<pDestLimit) {
  1158                     *pDest++ = (uint8_t)ch;
  1159                 } else {
  1160                     reqLength = 1;
  1161                     break;
  1163             } else if(ch <= 0x7ff) {
  1164                 if((pDestLimit - pDest) >= 2) {
  1165                     *pDest++=(uint8_t)((ch>>6)|0xc0);
  1166                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1167                 } else {
  1168                     reqLength = 2;
  1169                     break;
  1171             } else if(ch <= 0xd7ff || ch >= 0xe000) {
  1172                 if((pDestLimit - pDest) >= 3) {
  1173                     *pDest++=(uint8_t)((ch>>12)|0xe0);
  1174                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1175                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1176                 } else {
  1177                     reqLength = 3;
  1178                     break;
  1180             } else /* ch is a surrogate */ {
  1181                 int32_t length;
  1183                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 
  1184                     ++pSrc;
  1185                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
  1186                 } else if(subchar>=0) {
  1187                     ch=subchar;
  1188                     ++numSubstitutions;
  1189                 } else {
  1190                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  1191                     *pErrorCode = U_INVALID_CHAR_FOUND;
  1192                     return NULL;
  1195                 length = U8_LENGTH(ch);
  1196                 if((pDestLimit - pDest) >= length) {
  1197                     /* convert and append*/
  1198                     pDest=_appendUTF8(pDest, ch);
  1199                 } else {
  1200                     reqLength = length;
  1201                     break;
  1205         while(pSrc<pSrcLimit) {
  1206             ch=*pSrc++;
  1207             if(ch<=0x7f) {
  1208                 ++reqLength;
  1209             } else if(ch<=0x7ff) {
  1210                 reqLength+=2;
  1211             } else if(!U16_IS_SURROGATE(ch)) {
  1212                 reqLength+=3;
  1213             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
  1214                 ++pSrc;
  1215                 reqLength+=4;
  1216             } else if(subchar>=0) {
  1217                 reqLength+=U8_LENGTH(subchar);
  1218                 ++numSubstitutions;
  1219             } else {
  1220                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
  1221                 *pErrorCode = U_INVALID_CHAR_FOUND;
  1222                 return NULL;
  1227     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
  1229     if(pNumSubstitutions!=NULL) {
  1230         *pNumSubstitutions=numSubstitutions;
  1233     if(pDestLength){
  1234         *pDestLength = reqLength;
  1237     /* Terminate the buffer */
  1238     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  1239     return dest;
  1242 U_CAPI char* U_EXPORT2 
  1243 u_strToUTF8(char *dest,
  1244             int32_t destCapacity,
  1245             int32_t *pDestLength,
  1246             const UChar *pSrc,
  1247             int32_t srcLength,
  1248             UErrorCode *pErrorCode){
  1249     return u_strToUTF8WithSub(
  1250             dest, destCapacity, pDestLength,
  1251             pSrc, srcLength,
  1252             U_SENTINEL, NULL,
  1253             pErrorCode);
  1256 U_CAPI UChar* U_EXPORT2
  1257 u_strFromJavaModifiedUTF8WithSub(
  1258         UChar *dest,
  1259         int32_t destCapacity,
  1260         int32_t *pDestLength,
  1261         const char *src,
  1262         int32_t srcLength,
  1263         UChar32 subchar, int32_t *pNumSubstitutions,
  1264         UErrorCode *pErrorCode) {
  1265     UChar *pDest = dest;
  1266     UChar *pDestLimit = dest+destCapacity;
  1267     UChar32 ch;
  1268     int32_t reqLength = 0;
  1269     const uint8_t* pSrc = (const uint8_t*) src;
  1270     const uint8_t *pSrcLimit;
  1271     int32_t count;
  1272     uint8_t t1, t2; /* trail bytes */
  1273     int32_t numSubstitutions;
  1275     /* args check */
  1276     if(U_FAILURE(*pErrorCode)){
  1277         return NULL;
  1279     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  1280         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
  1281         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
  1282     ) {
  1283         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1284         return NULL;
  1287     if(pNumSubstitutions!=NULL) {
  1288         *pNumSubstitutions=0;
  1290     numSubstitutions=0;
  1292     if(srcLength < 0) {
  1293         /*
  1294          * Transform a NUL-terminated ASCII string.
  1295          * Handle non-ASCII strings with slower code.
  1296          */
  1297         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
  1298             *pDest++=(UChar)ch;
  1299             ++pSrc;
  1301         if(ch == 0) {
  1302             reqLength=(int32_t)(pDest - dest);
  1303             if(pDestLength) {
  1304                 *pDestLength = reqLength;
  1307             /* Terminate the buffer */
  1308             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
  1309             return dest;
  1311         srcLength = uprv_strlen((const char *)pSrc);
  1314     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  1315     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
  1316     for(;;) {
  1317         count = (int32_t)(pDestLimit - pDest);
  1318         srcLength = (int32_t)(pSrcLimit - pSrc);
  1319         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
  1320             /* fast ASCII loop */
  1321             const uint8_t *prevSrc = pSrc;
  1322             int32_t delta;
  1323             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
  1324                 *pDest++=(UChar)ch;
  1325                 ++pSrc;
  1327             delta = (int32_t)(pSrc - prevSrc);
  1328             count -= delta;
  1329             srcLength -= delta;
  1331         /*
  1332          * Each iteration of the inner loop progresses by at most 3 UTF-8
  1333          * bytes and one UChar.
  1334          */
  1335         srcLength /= 3;
  1336         if(count > srcLength) {
  1337             count = srcLength; /* min(remaining dest, remaining src/3) */
  1339         if(count < 3) {
  1340             /*
  1341              * Too much overhead if we get near the end of the string,
  1342              * continue with the next loop.
  1343              */
  1344             break;
  1346         do {
  1347             ch = *pSrc;
  1348             if(ch <= 0x7f){
  1349                 *pDest++=(UChar)ch;
  1350                 ++pSrc;
  1351             } else {
  1352                 if(ch >= 0xe0) {
  1353                     if( /* handle U+0000..U+FFFF inline */
  1354                         ch <= 0xef &&
  1355                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
  1356                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
  1357                     ) {
  1358                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
  1359                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
  1360                         pSrc += 3;
  1361                         continue;
  1363                 } else {
  1364                     if( /* handle U+0000..U+07FF inline */
  1365                         ch >= 0xc0 &&
  1366                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
  1367                     ) {
  1368                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
  1369                         pSrc += 2;
  1370                         continue;
  1374                 if(subchar < 0) {
  1375                     *pErrorCode = U_INVALID_CHAR_FOUND;
  1376                     return NULL;
  1377                 } else if(subchar > 0xffff && --count == 0) {
  1378                     /*
  1379                      * We need to write two UChars, adjusted count for that,
  1380                      * and ran out of space.
  1381                      */
  1382                     break;
  1383                 } else {
  1384                     /* function call for error cases */
  1385                     ++pSrc; /* continue after the lead byte */
  1386                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
  1387                     ++numSubstitutions;
  1388                     if(subchar<=0xFFFF) {
  1389                         *(pDest++)=(UChar)subchar;
  1390                     } else {
  1391                         *(pDest++)=U16_LEAD(subchar);
  1392                         *(pDest++)=U16_TRAIL(subchar);
  1396         } while(--count > 0);
  1399     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
  1400         ch = *pSrc;
  1401         if(ch <= 0x7f){
  1402             *pDest++=(UChar)ch;
  1403             ++pSrc;
  1404         } else {
  1405             if(ch >= 0xe0) {
  1406                 if( /* handle U+0000..U+FFFF inline */
  1407                     ch <= 0xef &&
  1408                     ((pSrcLimit - pSrc) >= 3) &&
  1409                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
  1410                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
  1411                 ) {
  1412                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
  1413                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
  1414                     pSrc += 3;
  1415                     continue;
  1417             } else {
  1418                 if( /* handle U+0000..U+07FF inline */
  1419                     ch >= 0xc0 &&
  1420                     ((pSrcLimit - pSrc) >= 2) &&
  1421                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
  1422                 ) {
  1423                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
  1424                     pSrc += 2;
  1425                     continue;
  1429             if(subchar < 0) {
  1430                 *pErrorCode = U_INVALID_CHAR_FOUND;
  1431                 return NULL;
  1432             } else {
  1433                 /* function call for error cases */
  1434                 ++pSrc; /* continue after the lead byte */
  1435                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
  1436                 ++numSubstitutions;
  1437                 if(subchar<=0xFFFF) {
  1438                     *(pDest++)=(UChar)subchar;
  1439                 } else {
  1440                     *(pDest++)=U16_LEAD(subchar);
  1441                     if(pDest<pDestLimit) {
  1442                         *(pDest++)=U16_TRAIL(subchar);
  1443                     } else {
  1444                         reqLength++;
  1445                         break;
  1452     /* do not fill the dest buffer just count the UChars needed */
  1453     while(pSrc < pSrcLimit){
  1454         ch = *pSrc;
  1455         if(ch <= 0x7f) {
  1456             reqLength++;
  1457             ++pSrc;
  1458         } else {
  1459             if(ch >= 0xe0) {
  1460                 if( /* handle U+0000..U+FFFF inline */
  1461                     ch <= 0xef &&
  1462                     ((pSrcLimit - pSrc) >= 3) &&
  1463                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
  1464                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
  1465                 ) {
  1466                     reqLength++;
  1467                     pSrc += 3;
  1468                     continue;
  1470             } else {
  1471                 if( /* handle U+0000..U+07FF inline */
  1472                     ch >= 0xc0 &&
  1473                     ((pSrcLimit - pSrc) >= 2) &&
  1474                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
  1475                 ) {
  1476                     reqLength++;
  1477                     pSrc += 2;
  1478                     continue;
  1482             if(subchar < 0) {
  1483                 *pErrorCode = U_INVALID_CHAR_FOUND;
  1484                 return NULL;
  1485             } else {
  1486                 /* function call for error cases */
  1487                 ++pSrc; /* continue after the lead byte */
  1488                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
  1489                 ++numSubstitutions;
  1490                 reqLength+=U16_LENGTH(ch);
  1495     if(pNumSubstitutions!=NULL) {
  1496         *pNumSubstitutions=numSubstitutions;
  1499     reqLength+=(int32_t)(pDest - dest);
  1500     if(pDestLength) {
  1501         *pDestLength = reqLength;
  1504     /* Terminate the buffer */
  1505     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
  1506     return dest;
  1509 U_CAPI char* U_EXPORT2 
  1510 u_strToJavaModifiedUTF8(
  1511         char *dest,
  1512         int32_t destCapacity,
  1513         int32_t *pDestLength,
  1514         const UChar *src, 
  1515         int32_t srcLength,
  1516         UErrorCode *pErrorCode) {
  1517     int32_t reqLength=0;
  1518     uint32_t ch=0;
  1519     uint8_t *pDest = (uint8_t *)dest;
  1520     uint8_t *pDestLimit = pDest + destCapacity;
  1521     const UChar *pSrcLimit;
  1522     int32_t count;
  1524     /* args check */
  1525     if(U_FAILURE(*pErrorCode)){
  1526         return NULL;
  1528     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
  1529         (dest==NULL && destCapacity!=0) || destCapacity<0
  1530     ) {
  1531         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
  1532         return NULL;
  1535     if(srcLength==-1) {
  1536         /* Convert NUL-terminated ASCII, then find the string length. */
  1537         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
  1538             *pDest++ = (uint8_t)ch;
  1539             ++src;
  1541         if(ch == 0) {
  1542             reqLength=(int32_t)(pDest - (uint8_t *)dest);
  1543             if(pDestLength) {
  1544                 *pDestLength = reqLength;
  1547             /* Terminate the buffer */
  1548             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  1549             return dest;
  1551         srcLength = u_strlen(src);
  1554     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
  1555     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
  1556     for(;;) {
  1557         count = (int32_t)(pDestLimit - pDest);
  1558         srcLength = (int32_t)(pSrcLimit - src);
  1559         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
  1560             /* fast ASCII loop */
  1561             const UChar *prevSrc = src;
  1562             int32_t delta;
  1563             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
  1564                 *pDest++=(uint8_t)ch;
  1565                 ++src;
  1567             delta = (int32_t)(src - prevSrc);
  1568             count -= delta;
  1569             srcLength -= delta;
  1571         /*
  1572          * Each iteration of the inner loop progresses by at most 3 UTF-8
  1573          * bytes and one UChar.
  1574          */
  1575         count /= 3;
  1576         if(count > srcLength) {
  1577             count = srcLength; /* min(remaining dest/3, remaining src) */
  1579         if(count < 3) {
  1580             /*
  1581              * Too much overhead if we get near the end of the string,
  1582              * continue with the next loop.
  1583              */
  1584             break;
  1586         do {
  1587             ch=*src++;
  1588             if(ch <= 0x7f && ch != 0) {
  1589                 *pDest++ = (uint8_t)ch;
  1590             } else if(ch <= 0x7ff) {
  1591                 *pDest++=(uint8_t)((ch>>6)|0xc0);
  1592                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1593             } else {
  1594                 *pDest++=(uint8_t)((ch>>12)|0xe0);
  1595                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1598         } while(--count > 0);
  1601     while(src<pSrcLimit) {
  1602         ch=*src++;
  1603         if(ch <= 0x7f && ch != 0) {
  1604             if(pDest<pDestLimit) {
  1605                 *pDest++ = (uint8_t)ch;
  1606             } else {
  1607                 reqLength = 1;
  1608                 break;
  1610         } else if(ch <= 0x7ff) {
  1611             if((pDestLimit - pDest) >= 2) {
  1612                 *pDest++=(uint8_t)((ch>>6)|0xc0);
  1613                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1614             } else {
  1615                 reqLength = 2;
  1616                 break;
  1618         } else {
  1619             if((pDestLimit - pDest) >= 3) {
  1620                 *pDest++=(uint8_t)((ch>>12)|0xe0);
  1621                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
  1622                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
  1623             } else {
  1624                 reqLength = 3;
  1625                 break;
  1629     while(src<pSrcLimit) {
  1630         ch=*src++;
  1631         if(ch <= 0x7f && ch != 0) {
  1632             ++reqLength;
  1633         } else if(ch<=0x7ff) {
  1634             reqLength+=2;
  1635         } else {
  1636             reqLength+=3;
  1640     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
  1641     if(pDestLength){
  1642         *pDestLength = reqLength;
  1645     /* Terminate the buffer */
  1646     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
  1647     return dest;

mercurial