michael@0: /* michael@0: ****************************************************************************** michael@0: * michael@0: * Copyright (C) 2001-2013, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: * michael@0: ****************************************************************************** michael@0: * michael@0: * File ustrtrns.cpp michael@0: * michael@0: * Modification History: michael@0: * michael@0: * Date Name Description michael@0: * 9/10/2001 Ram Creation. michael@0: ****************************************************************************** michael@0: */ michael@0: michael@0: /******************************************************************************* michael@0: * michael@0: * u_strTo* and u_strFrom* APIs michael@0: * WCS functions moved to ustr_wcs.c for better modularization michael@0: * michael@0: ******************************************************************************* michael@0: */ michael@0: michael@0: michael@0: #include "unicode/putil.h" michael@0: #include "unicode/ustring.h" michael@0: #include "unicode/utf.h" michael@0: #include "unicode/utf8.h" michael@0: #include "unicode/utf16.h" michael@0: #include "cstring.h" michael@0: #include "cmemory.h" michael@0: #include "ustr_imp.h" michael@0: #include "uassert.h" michael@0: michael@0: #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strFromUTF32WithSub(UChar *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar32 *src, michael@0: int32_t srcLength, michael@0: UChar32 subchar, int32_t *pNumSubstitutions, michael@0: UErrorCode *pErrorCode) { michael@0: const UChar32 *srcLimit; michael@0: UChar32 ch; michael@0: UChar *destLimit; michael@0: UChar *pDest; michael@0: int32_t reqLength; michael@0: int32_t numSubstitutions; michael@0: michael@0: /* args check */ michael@0: if(U_FAILURE(*pErrorCode)){ michael@0: return 
NULL; michael@0: } michael@0: if( (src==NULL && srcLength!=0) || srcLength < -1 || michael@0: (destCapacity<0) || (dest == NULL && destCapacity > 0) || michael@0: subchar > 0x10ffff || U_IS_SURROGATE(subchar) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(pNumSubstitutions != NULL) { michael@0: *pNumSubstitutions = 0; michael@0: } michael@0: michael@0: pDest = dest; michael@0: destLimit = (dest!=NULL)?(dest + destCapacity):NULL; michael@0: reqLength = 0; michael@0: numSubstitutions = 0; michael@0: michael@0: if(srcLength < 0) { michael@0: /* simple loop for conversion of a NUL-terminated BMP string */ michael@0: while((ch=*src) != 0 && michael@0: ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { michael@0: ++src; michael@0: if(pDest < destLimit) { michael@0: *pDest++ = (UChar)ch; michael@0: } else { michael@0: ++reqLength; michael@0: } michael@0: } michael@0: srcLimit = src; michael@0: if(ch != 0) { michael@0: /* "complicated" case, find the end of the remaining string */ michael@0: while(*++srcLimit != 0) {} michael@0: } michael@0: } else { michael@0: srcLimit = (src!=NULL)?(src + srcLength):NULL; michael@0: } michael@0: michael@0: /* convert with length */ michael@0: while(src < srcLimit) { michael@0: ch = *src++; michael@0: do { michael@0: /* usually "loops" once; twice only for writing subchar */ michael@0: if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { michael@0: if(pDest < destLimit) { michael@0: *pDest++ = (UChar)ch; michael@0: } else { michael@0: ++reqLength; michael@0: } michael@0: break; michael@0: } else if(0x10000 <= ch && ch <= 0x10ffff) { michael@0: if(pDest!=NULL && ((pDest + 2) <= destLimit)) { michael@0: *pDest++ = U16_LEAD(ch); michael@0: *pDest++ = U16_TRAIL(ch); michael@0: } else { michael@0: reqLength += 2; michael@0: } michael@0: break; michael@0: } else if((ch = subchar) < 0) { michael@0: /* surrogate code point, or not a Unicode code 
point at all */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else { michael@0: ++numSubstitutions; michael@0: } michael@0: } while(TRUE); michael@0: } michael@0: michael@0: reqLength += (int32_t)(pDest - dest); michael@0: if(pDestLength) { michael@0: *pDestLength = reqLength; michael@0: } michael@0: if(pNumSubstitutions != NULL) { michael@0: *pNumSubstitutions = numSubstitutions; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); michael@0: michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strFromUTF32(UChar *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar32 *src, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: return u_strFromUTF32WithSub( michael@0: dest, destCapacity, pDestLength, michael@0: src, srcLength, michael@0: U_SENTINEL, NULL, michael@0: pErrorCode); michael@0: } michael@0: michael@0: U_CAPI UChar32* U_EXPORT2 michael@0: u_strToUTF32WithSub(UChar32 *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar *src, michael@0: int32_t srcLength, michael@0: UChar32 subchar, int32_t *pNumSubstitutions, michael@0: UErrorCode *pErrorCode) { michael@0: const UChar *srcLimit; michael@0: UChar32 ch; michael@0: UChar ch2; michael@0: UChar32 *destLimit; michael@0: UChar32 *pDest; michael@0: int32_t reqLength; michael@0: int32_t numSubstitutions; michael@0: michael@0: /* args check */ michael@0: if(U_FAILURE(*pErrorCode)){ michael@0: return NULL; michael@0: } michael@0: if( (src==NULL && srcLength!=0) || srcLength < -1 || michael@0: (destCapacity<0) || (dest == NULL && destCapacity > 0) || michael@0: subchar > 0x10ffff || U_IS_SURROGATE(subchar) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(pNumSubstitutions 
!= NULL) { michael@0: *pNumSubstitutions = 0; michael@0: } michael@0: michael@0: pDest = dest; michael@0: destLimit = (dest!=NULL)?(dest + destCapacity):NULL; michael@0: reqLength = 0; michael@0: numSubstitutions = 0; michael@0: michael@0: if(srcLength < 0) { michael@0: /* simple loop for conversion of a NUL-terminated BMP string */ michael@0: while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { michael@0: ++src; michael@0: if(pDest < destLimit) { michael@0: *pDest++ = ch; michael@0: } else { michael@0: ++reqLength; michael@0: } michael@0: } michael@0: srcLimit = src; michael@0: if(ch != 0) { michael@0: /* "complicated" case, find the end of the remaining string */ michael@0: while(*++srcLimit != 0) {} michael@0: } michael@0: } else { michael@0: srcLimit = (src!=NULL)?(src + srcLength):NULL; michael@0: } michael@0: michael@0: /* convert with length */ michael@0: while(src < srcLimit) { michael@0: ch = *src++; michael@0: if(!U16_IS_SURROGATE(ch)) { michael@0: /* write or count ch below */ michael@0: } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { michael@0: ++src; michael@0: ch = U16_GET_SUPPLEMENTARY(ch, ch2); michael@0: } else if((ch = subchar) < 0) { michael@0: /* unpaired surrogate */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else { michael@0: ++numSubstitutions; michael@0: } michael@0: if(pDest < destLimit) { michael@0: *pDest++ = ch; michael@0: } else { michael@0: ++reqLength; michael@0: } michael@0: } michael@0: michael@0: reqLength += (int32_t)(pDest - dest); michael@0: if(pDestLength) { michael@0: *pDestLength = reqLength; michael@0: } michael@0: if(pNumSubstitutions != NULL) { michael@0: *pNumSubstitutions = numSubstitutions; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); michael@0: michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI UChar32* U_EXPORT2 michael@0: 
u_strToUTF32(UChar32 *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar *src, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: return u_strToUTF32WithSub( michael@0: dest, destCapacity, pDestLength, michael@0: src, srcLength, michael@0: U_SENTINEL, NULL, michael@0: pErrorCode); michael@0: } michael@0: michael@0: /* for utf8_nextCharSafeBodyTerminated() */ michael@0: static const UChar32 michael@0: utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; michael@0: michael@0: /* michael@0: * Version of utf8_nextCharSafeBody() with the following differences: michael@0: * - checks for NUL termination instead of length michael@0: * - works with pointers instead of indexes michael@0: * - always strict (strict==-1) michael@0: * michael@0: * *ps points to after the lead byte and will be moved to after the last trail byte. michael@0: * c is the lead byte. michael@0: * @return the code point, or U_SENTINEL michael@0: */ michael@0: static UChar32 michael@0: utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { michael@0: const uint8_t *s=*ps; michael@0: uint8_t trail, illegal=0; michael@0: uint8_t count=U8_COUNT_TRAIL_BYTES(c); michael@0: U_ASSERT(count<6); michael@0: U8_MASK_LEAD_BYTE((c), count); michael@0: /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ michael@0: switch(count) { michael@0: /* each branch falls through to the next one */ michael@0: case 5: michael@0: case 4: michael@0: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ michael@0: illegal=1; michael@0: break; michael@0: case 3: michael@0: trail=(uint8_t)(*s++ - 0x80); michael@0: c=(c<<6)|trail; michael@0: if(trail>0x3f || c>=0x110) { michael@0: /* not a trail byte, or code point>0x10ffff (outside Unicode) */ michael@0: illegal=1; michael@0: break; michael@0: } michael@0: case 2: /*fall through*/ michael@0: trail=(uint8_t)(*s++ - 0x80); michael@0: if(trail>0x3f) { 
michael@0: /* not a trail byte */ michael@0: illegal=1; michael@0: break; michael@0: } michael@0: c=(c<<6)|trail; michael@0: case 1: /*fall through*/ michael@0: trail=(uint8_t)(*s++ - 0x80); michael@0: if(trail>0x3f) { michael@0: /* not a trail byte */ michael@0: illegal=1; michael@0: } michael@0: c=(c<<6)|trail; michael@0: break; michael@0: case 0: michael@0: return U_SENTINEL; michael@0: /* no default branch to optimize switch() - all values are covered */ michael@0: } michael@0: michael@0: /* correct sequence - all trail bytes have (b7..b6)==(10)? */ michael@0: /* illegal is also set if count>=4 */ michael@0: if(illegal || c0 && U8_IS_TRAIL(*s)) { michael@0: ++s; michael@0: --count; michael@0: } michael@0: c=U_SENTINEL; michael@0: } michael@0: *ps=s; michael@0: return c; michael@0: } michael@0: michael@0: /* michael@0: * Version of utf8_nextCharSafeBody() with the following differences: michael@0: * - works with pointers instead of indexes michael@0: * - always strict (strict==-1) michael@0: * michael@0: * *ps points to after the lead byte and will be moved to after the last trail byte. michael@0: * c is the lead byte. 
michael@0: * @return the code point, or U_SENTINEL michael@0: */ michael@0: static UChar32 michael@0: utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { michael@0: const uint8_t *s=*ps; michael@0: uint8_t trail, illegal=0; michael@0: uint8_t count=U8_COUNT_TRAIL_BYTES(c); michael@0: if((limit-s)>=count) { michael@0: U8_MASK_LEAD_BYTE((c), count); michael@0: /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ michael@0: switch(count) { michael@0: /* each branch falls through to the next one */ michael@0: case 5: michael@0: case 4: michael@0: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ michael@0: illegal=1; michael@0: break; michael@0: case 3: michael@0: trail=*s++; michael@0: c=(c<<6)|(trail&0x3f); michael@0: if(c<0x110) { michael@0: illegal|=(trail&0xc0)^0x80; michael@0: } else { michael@0: /* code point>0x10ffff, outside Unicode */ michael@0: illegal=1; michael@0: break; michael@0: } michael@0: case 2: /*fall through*/ michael@0: trail=*s++; michael@0: c=(c<<6)|(trail&0x3f); michael@0: illegal|=(trail&0xc0)^0x80; michael@0: case 1: /*fall through*/ michael@0: trail=*s++; michael@0: c=(c<<6)|(trail&0x3f); michael@0: illegal|=(trail&0xc0)^0x80; michael@0: break; michael@0: case 0: michael@0: return U_SENTINEL; michael@0: /* no default branch to optimize switch() - all values are covered */ michael@0: } michael@0: } else { michael@0: illegal=1; /* too few bytes left */ michael@0: } michael@0: michael@0: /* correct sequence - all trail bytes have (b7..b6)==(10)? 
*/ michael@0: /* illegal is also set if count>=4 */ michael@0: U_ASSERT(illegal || count0 && s 0) || michael@0: subchar > 0x10ffff || U_IS_SURROGATE(subchar) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=0; michael@0: } michael@0: numSubstitutions=0; michael@0: michael@0: /* michael@0: * Inline processing of UTF-8 byte sequences: michael@0: * michael@0: * Byte sequences for the most common characters are handled inline in michael@0: * the conversion loops. In order to reduce the path lengths for those michael@0: * characters, the tests are arranged in a kind of binary search. michael@0: * ASCII (<=0x7f) is checked first, followed by the dividing point michael@0: * between 2- and 3-byte sequences (0xe0). michael@0: * The 3-byte branch is tested first to speed up CJK text. michael@0: * The compiler should combine the subtractions for the two tests for 0xe0. michael@0: * Each branch then tests for the other end of its range. michael@0: */ michael@0: michael@0: if(srcLength < 0){ michael@0: /* michael@0: * Transform a NUL-terminated string. michael@0: * The code explicitly checks for NULs only in the lead byte position. michael@0: * A NUL byte in the trail byte position fails the trail byte range check anyway. 
michael@0: */ michael@0: while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { michael@0: if(ch <= 0x7f){ michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: } else { michael@0: if(ch > 0xe0) { michael@0: if( /* handle U+1000..U+CFFF inline */ michael@0: ch <= 0xec && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && michael@0: (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f michael@0: ) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xe0) { michael@0: if( /* handle U+0080..U+07FF inline */ michael@0: ch >= 0xc2 && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f michael@0: ) { michael@0: *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* function call for "complicated" and error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); michael@0: if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else if(ch<=0xFFFF) { michael@0: *(pDest++)=(UChar)ch; michael@0: } else { michael@0: *(pDest++)=U16_LEAD(ch); michael@0: if(pDest 0xe0) { michael@0: if( /* handle U+1000..U+CFFF inline */ michael@0: ch <= 0xec && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f && michael@0: (uint8_t)(pSrc[2] - 0x80) <= 0x3f michael@0: ) { michael@0: ++reqLength; michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xe0) { michael@0: if( /* handle U+0080..U+07FF inline */ michael@0: ch >= 0xc2 && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f michael@0: ) { michael@0: ++reqLength; michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* function call for "complicated" and error cases 
*/ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); michael@0: if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: reqLength += U16_LENGTH(ch); michael@0: } michael@0: } michael@0: } else /* srcLength >= 0 */ { michael@0: const uint8_t *pSrcLimit = pSrc + srcLength; michael@0: int32_t count; michael@0: michael@0: /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ michael@0: for(;;) { michael@0: /* michael@0: * Each iteration of the inner loop progresses by at most 3 UTF-8 michael@0: * bytes and one UChar, for most characters. michael@0: * For supplementary code points (4 & 2), which are rare, michael@0: * there is an additional adjustment. michael@0: */ michael@0: count = (int32_t)(pDestLimit - pDest); michael@0: srcLength = (int32_t)((pSrcLimit - pSrc) / 3); michael@0: if(count > srcLength) { michael@0: count = srcLength; /* min(remaining dest, remaining src/3) */ michael@0: } michael@0: if(count < 3) { michael@0: /* michael@0: * Too much overhead if we get near the end of the string, michael@0: * continue with the next loop. 
michael@0: */ michael@0: break; michael@0: } michael@0: michael@0: do { michael@0: ch = *pSrc; michael@0: if(ch <= 0x7f){ michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: } else { michael@0: if(ch > 0xe0) { michael@0: if( /* handle U+1000..U+CFFF inline */ michael@0: ch <= 0xec && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && michael@0: (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f michael@0: ) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xe0) { michael@0: if( /* handle U+0080..U+07FF inline */ michael@0: ch >= 0xc2 && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f michael@0: ) { michael@0: *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: if(ch >= 0xf0 || subchar > 0xffff) { michael@0: /* michael@0: * We may read up to six bytes and write up to two UChars, michael@0: * which we didn't account for with computing count, michael@0: * so we adjust it here. 
michael@0: */ michael@0: if(--count == 0) { michael@0: break; michael@0: } michael@0: } michael@0: michael@0: /* function call for "complicated" and error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: }else if(ch<=0xFFFF){ michael@0: *(pDest++)=(UChar)ch; michael@0: }else{ michael@0: *(pDest++)=U16_LEAD(ch); michael@0: *(pDest++)=U16_TRAIL(ch); michael@0: } michael@0: } michael@0: } while(--count > 0); michael@0: } michael@0: michael@0: while((pSrc 0xe0) { michael@0: if( /* handle U+1000..U+CFFF inline */ michael@0: ch <= 0xec && michael@0: ((pSrcLimit - pSrc) >= 3) && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && michael@0: (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f michael@0: ) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xe0) { michael@0: if( /* handle U+0080..U+07FF inline */ michael@0: ch >= 0xc2 && michael@0: ((pSrcLimit - pSrc) >= 2) && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f michael@0: ) { michael@0: *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* function call for "complicated" and error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: }else if(ch<=0xFFFF){ michael@0: *(pDest++)=(UChar)ch; michael@0: }else{ michael@0: *(pDest++)=U16_LEAD(ch); michael@0: if(pDest 0xe0) { michael@0: if( /* handle 
U+1000..U+CFFF inline */ michael@0: ch <= 0xec && michael@0: ((pSrcLimit - pSrc) >= 3) && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f && michael@0: (uint8_t)(pSrc[2] - 0x80) <= 0x3f michael@0: ) { michael@0: reqLength++; michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xe0) { michael@0: if( /* handle U+0080..U+07FF inline */ michael@0: ch >= 0xc2 && michael@0: ((pSrcLimit - pSrc) >= 2) && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f michael@0: ) { michael@0: reqLength++; michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* function call for "complicated" and error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: reqLength+=U16_LENGTH(ch); michael@0: } michael@0: } michael@0: } michael@0: michael@0: reqLength+=(int32_t)(pDest - dest); michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=numSubstitutions; michael@0: } michael@0: michael@0: if(pDestLength){ michael@0: *pDestLength = reqLength; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); michael@0: michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI UChar* U_EXPORT2 michael@0: u_strFromUTF8(UChar *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const char* src, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode){ michael@0: return u_strFromUTF8WithSub( michael@0: dest, destCapacity, pDestLength, michael@0: src, srcLength, michael@0: U_SENTINEL, NULL, michael@0: pErrorCode); michael@0: } michael@0: michael@0: U_CAPI UChar * U_EXPORT2 michael@0: u_strFromUTF8Lenient(UChar *dest, michael@0: int32_t destCapacity, michael@0: int32_t 
*pDestLength, michael@0: const char *src, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: UChar *pDest = dest; michael@0: UChar32 ch; michael@0: int32_t reqLength = 0; michael@0: uint8_t* pSrc = (uint8_t*) src; michael@0: michael@0: /* args check */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ michael@0: return NULL; michael@0: } michael@0: michael@0: if( (src==NULL && srcLength!=0) || srcLength < -1 || michael@0: (destCapacity<0) || (dest == NULL && destCapacity > 0) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(srcLength < 0) { michael@0: /* Transform a NUL-terminated string. */ michael@0: UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; michael@0: uint8_t t1, t2, t3; /* trail bytes */ michael@0: michael@0: while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { michael@0: if(ch < 0xc0) { michael@0: /* michael@0: * ASCII, or a trail byte in lead position which is treated like michael@0: * a single-byte sequence for better character boundary michael@0: * resynchronization after illegal sequences. 
michael@0: */ michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: continue; michael@0: } else if(ch < 0xe0) { /* U+0080..U+07FF */ michael@0: if((t1 = pSrc[1]) != 0) { michael@0: /* 0x3080 = (0xc0 << 6) + 0x80 */ michael@0: *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xf0) { /* U+0800..U+FFFF */ michael@0: if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: /* 0x2080 = (0x80 << 6) + 0x80 */ michael@0: *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else /* f0..f4 */ { /* U+10000..U+10FFFF */ michael@0: if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { michael@0: pSrc += 4; michael@0: /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ michael@0: ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; michael@0: *(pDest++) = U16_LEAD(ch); michael@0: if(pDest < pDestLimit) { michael@0: *(pDest++) = U16_TRAIL(ch); michael@0: } else { michael@0: reqLength = 1; michael@0: break; michael@0: } michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* truncated character at the end */ michael@0: *pDest++ = 0xfffd; michael@0: while(*++pSrc != 0) {} michael@0: break; michael@0: } michael@0: michael@0: /* Pre-flight the rest of the string. */ michael@0: while((ch = *pSrc) != 0) { michael@0: if(ch < 0xc0) { michael@0: /* michael@0: * ASCII, or a trail byte in lead position which is treated like michael@0: * a single-byte sequence for better character boundary michael@0: * resynchronization after illegal sequences. 
michael@0: */ michael@0: ++reqLength; michael@0: ++pSrc; michael@0: continue; michael@0: } else if(ch < 0xe0) { /* U+0080..U+07FF */ michael@0: if(pSrc[1] != 0) { michael@0: ++reqLength; michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } else if(ch < 0xf0) { /* U+0800..U+FFFF */ michael@0: if(pSrc[1] != 0 && pSrc[2] != 0) { michael@0: ++reqLength; michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else /* f0..f4 */ { /* U+10000..U+10FFFF */ michael@0: if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { michael@0: reqLength += 2; michael@0: pSrc += 4; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* truncated character at the end */ michael@0: ++reqLength; michael@0: break; michael@0: } michael@0: } else /* srcLength >= 0 */ { michael@0: const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; michael@0: michael@0: /* michael@0: * This function requires that if srcLength is given, then it must be michael@0: * destCapatity >= srcLength so that we need not check for michael@0: * destination buffer overflow in the loop. michael@0: */ michael@0: if(destCapacity < srcLength) { michael@0: if(pDestLength != NULL) { michael@0: *pDestLength = srcLength; /* this likely overestimates the true destLength! */ michael@0: } michael@0: *pErrorCode = U_BUFFER_OVERFLOW_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if((pSrcLimit - pSrc) >= 4) { michael@0: pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ michael@0: michael@0: /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ michael@0: do { michael@0: ch = *pSrc++; michael@0: if(ch < 0xc0) { michael@0: /* michael@0: * ASCII, or a trail byte in lead position which is treated like michael@0: * a single-byte sequence for better character boundary michael@0: * resynchronization after illegal sequences. 
michael@0: */ michael@0: *pDest++=(UChar)ch; michael@0: } else if(ch < 0xe0) { /* U+0080..U+07FF */ michael@0: /* 0x3080 = (0xc0 << 6) + 0x80 */ michael@0: *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); michael@0: } else if(ch < 0xf0) { /* U+0800..U+FFFF */ michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: /* 0x2080 = (0x80 << 6) + 0x80 */ michael@0: ch = (ch << 12) + (*pSrc++ << 6); michael@0: *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); michael@0: } else /* f0..f4 */ { /* U+10000..U+10FFFF */ michael@0: /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ michael@0: ch = (ch << 18) + (*pSrc++ << 12); michael@0: ch += *pSrc++ << 6; michael@0: ch += *pSrc++ - 0x3c82080; michael@0: *(pDest++) = U16_LEAD(ch); michael@0: *(pDest++) = U16_TRAIL(ch); michael@0: } michael@0: } while(pSrc < pSrcLimit); michael@0: michael@0: pSrcLimit += 3; /* restore original pSrcLimit */ michael@0: } michael@0: michael@0: while(pSrc < pSrcLimit) { michael@0: ch = *pSrc++; michael@0: if(ch < 0xc0) { michael@0: /* michael@0: * ASCII, or a trail byte in lead position which is treated like michael@0: * a single-byte sequence for better character boundary michael@0: * resynchronization after illegal sequences. 
michael@0: */ michael@0: *pDest++=(UChar)ch; michael@0: continue; michael@0: } else if(ch < 0xe0) { /* U+0080..U+07FF */ michael@0: if(pSrc < pSrcLimit) { michael@0: /* 0x3080 = (0xc0 << 6) + 0x80 */ michael@0: *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); michael@0: continue; michael@0: } michael@0: } else if(ch < 0xf0) { /* U+0800..U+FFFF */ michael@0: if((pSrcLimit - pSrc) >= 2) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: /* 0x2080 = (0x80 << 6) + 0x80 */ michael@0: ch = (ch << 12) + (*pSrc++ << 6); michael@0: *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else /* f0..f4 */ { /* U+10000..U+10FFFF */ michael@0: if((pSrcLimit - pSrc) >= 3) { michael@0: /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ michael@0: ch = (ch << 18) + (*pSrc++ << 12); michael@0: ch += *pSrc++ << 6; michael@0: ch += *pSrc++ - 0x3c82080; michael@0: *(pDest++) = U16_LEAD(ch); michael@0: *(pDest++) = U16_TRAIL(ch); michael@0: pSrc += 4; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: /* truncated character at the end */ michael@0: *pDest++ = 0xfffd; michael@0: break; michael@0: } michael@0: } michael@0: michael@0: reqLength+=(int32_t)(pDest - dest); michael@0: michael@0: if(pDestLength){ michael@0: *pDestLength = reqLength; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); michael@0: michael@0: return dest; michael@0: } michael@0: michael@0: static inline uint8_t * michael@0: _appendUTF8(uint8_t *pDest, UChar32 c) { michael@0: /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ michael@0: if((c)<=0x7f) { michael@0: *pDest++=(uint8_t)c; michael@0: } else if(c<=0x7ff) { michael@0: *pDest++=(uint8_t)((c>>6)|0xc0); michael@0: *pDest++=(uint8_t)((c&0x3f)|0x80); michael@0: } else if(c<=0xffff) { 
michael@0: *pDest++=(uint8_t)((c>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)(((c)&0x3f)|0x80); michael@0: } else /* if((uint32_t)(c)<=0x10ffff) */ { michael@0: *pDest++=(uint8_t)(((c)>>18)|0xf0); michael@0: *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)(((c)&0x3f)|0x80); michael@0: } michael@0: return pDest; michael@0: } michael@0: michael@0: michael@0: U_CAPI char* U_EXPORT2 michael@0: u_strToUTF8WithSub(char *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar *pSrc, michael@0: int32_t srcLength, michael@0: UChar32 subchar, int32_t *pNumSubstitutions, michael@0: UErrorCode *pErrorCode){ michael@0: int32_t reqLength=0; michael@0: uint32_t ch=0,ch2=0; michael@0: uint8_t *pDest = (uint8_t *)dest; michael@0: uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; michael@0: int32_t numSubstitutions; michael@0: michael@0: /* args check */ michael@0: if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ michael@0: return NULL; michael@0: } michael@0: michael@0: if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || michael@0: (destCapacity<0) || (dest == NULL && destCapacity > 0) || michael@0: subchar > 0x10ffff || U_IS_SURROGATE(subchar) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=0; michael@0: } michael@0: numSubstitutions=0; michael@0: michael@0: if(srcLength==-1) { michael@0: while((ch=*pSrc)!=0) { michael@0: ++pSrc; michael@0: if(ch <= 0x7f) { michael@0: if(pDest= 2) { michael@0: *pDest++=(uint8_t)((ch>>6)|0xc0); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 2; michael@0: break; michael@0: } michael@0: } else if(ch <= 0xd7ff || ch >= 0xe000) { michael@0: if((pDestLimit - pDest) >= 3) { michael@0: 
*pDest++=(uint8_t)((ch>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 3; michael@0: break; michael@0: } michael@0: } else /* ch is a surrogate */ { michael@0: int32_t length; michael@0: michael@0: /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ michael@0: if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { michael@0: ++pSrc; michael@0: ch=U16_GET_SUPPLEMENTARY(ch, ch2); michael@0: } else if(subchar>=0) { michael@0: ch=subchar; michael@0: ++numSubstitutions; michael@0: } else { michael@0: /* Unicode 3.2 forbids surrogate code points in UTF-8 */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: michael@0: length = U8_LENGTH(ch); michael@0: if((pDestLimit - pDest) >= length) { michael@0: /* convert and append*/ michael@0: pDest=_appendUTF8(pDest, ch); michael@0: } else { michael@0: reqLength = length; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: /* Counting pass: from here on nothing is written; each remaining UChar only adds its UTF-8 length to reqLength. */ while((ch=*pSrc++)!=0) { michael@0: if(ch<=0x7f) { michael@0: ++reqLength; michael@0: } else if(ch<=0x7ff) { michael@0: reqLength+=2; michael@0: } else if(!U16_IS_SURROGATE(ch)) { michael@0: reqLength+=3; michael@0: } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { michael@0: ++pSrc; michael@0: reqLength+=4; michael@0: } else if(subchar>=0) { michael@0: reqLength+=U8_LENGTH(subchar); michael@0: ++numSubstitutions; michael@0: } else { michael@0: /* Unicode 3.2 forbids surrogate code points in UTF-8 */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: } michael@0: } else { michael@0: const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; michael@0: int32_t count; michael@0: michael@0: /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. 
*/ michael@0: for(;;) { michael@0: /* michael@0: * Each iteration of the inner loop progresses by at most 3 UTF-8 michael@0: * bytes and one UChar, for most characters. michael@0: * For supplementary code points (4 & 2), which are rare, michael@0: * there is an additional adjustment. michael@0: */ michael@0: count = (int32_t)((pDestLimit - pDest) / 3); michael@0: srcLength = (int32_t)(pSrcLimit - pSrc); michael@0: if(count > srcLength) { michael@0: count = srcLength; /* min(remaining dest/3, remaining src) */ michael@0: } michael@0: if(count < 3) { michael@0: /* michael@0: * Too much overhead if we get near the end of the string, michael@0: * continue with the next loop. michael@0: */ michael@0: break; michael@0: } michael@0: do { michael@0: ch=*pSrc++; michael@0: if(ch <= 0x7f) { michael@0: *pDest++ = (uint8_t)ch; michael@0: } else if(ch <= 0x7ff) { michael@0: *pDest++=(uint8_t)((ch>>6)|0xc0); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else if(ch <= 0xd7ff || ch >= 0xe000) { michael@0: *pDest++=(uint8_t)((ch>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else /* ch is a surrogate */ { michael@0: /* michael@0: * We will read two UChars and probably output four bytes, michael@0: * which we didn't account for with computing count, michael@0: * so we adjust it here. 
michael@0: */ michael@0: if(--count == 0) { michael@0: --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ michael@0: break; /* recompute count */ michael@0: } michael@0: michael@0: if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { michael@0: ++pSrc; michael@0: ch=U16_GET_SUPPLEMENTARY(ch, ch2); michael@0: michael@0: /* writing 4 bytes per 2 UChars is ok */ michael@0: *pDest++=(uint8_t)((ch>>18)|0xf0); michael@0: *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: /* Unicode 3.2 forbids surrogate code points in UTF-8 */ michael@0: if(subchar>=0) { michael@0: ch=subchar; michael@0: ++numSubstitutions; michael@0: } else { michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: michael@0: /* convert and append*/ michael@0: pDest=_appendUTF8(pDest, ch); michael@0: } michael@0: } michael@0: } while(--count > 0); michael@0: } michael@0: michael@0: /* NOTE(review): "while(pSrc= 2)" is not valid code; the loop condition and whatever preceded the 2-byte write below appear to have been dropped from this copy - restore from the upstream file. */ while(pSrc= 2) { michael@0: *pDest++=(uint8_t)((ch>>6)|0xc0); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 2; michael@0: break; michael@0: } michael@0: } else if(ch <= 0xd7ff || ch >= 0xe000) { michael@0: if((pDestLimit - pDest) >= 3) { michael@0: *pDest++=(uint8_t)((ch>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 3; michael@0: break; michael@0: } michael@0: } else /* ch is a surrogate */ { michael@0: int32_t length; michael@0: michael@0: /* NOTE(review): the condition below is truncated ("pSrc=0"); the rest of the surrogate pairing test and the start of the subchar branch appear to be missing from this copy. */ if(U16_IS_SURROGATE_LEAD(ch) && pSrc=0) { michael@0: ch=subchar; michael@0: ++numSubstitutions; michael@0: } else { michael@0: /* Unicode 3.2 forbids surrogate code points in UTF-8 */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: michael@0: length = U8_LENGTH(ch); michael@0: if((pDestLimit - pDest) >= length) { michael@0: /* 
convert and append*/ michael@0: pDest=_appendUTF8(pDest, ch); michael@0: } else { michael@0: reqLength = length; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: /* NOTE(review): "while(pSrc=0)" is truncated; judging by the two statements that follow, only the subchar branch of a counting loop survives here - restore from the upstream file. */ while(pSrc=0) { michael@0: reqLength+=U8_LENGTH(subchar); michael@0: ++numSubstitutions; michael@0: } else { michael@0: /* Unicode 3.2 forbids surrogate code points in UTF-8 */ michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } michael@0: } michael@0: } michael@0: michael@0: reqLength+=(int32_t)(pDest - (uint8_t *)dest); michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=numSubstitutions; michael@0: } michael@0: michael@0: if(pDestLength){ michael@0: *pDestLength = reqLength; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateChars(dest, destCapacity, reqLength, pErrorCode); michael@0: return dest; michael@0: } michael@0: michael@0: /* Convenience wrapper: forwards every argument to u_strToUTF8WithSub, passing U_SENTINEL as subchar and NULL for pNumSubstitutions. NOTE(review): U_SENTINEL is presumably negative, in which case an unpaired surrogate is reported as U_INVALID_CHAR_FOUND instead of being replaced - confirm against its definition. */ U_CAPI char* U_EXPORT2 michael@0: u_strToUTF8(char *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar *pSrc, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode){ michael@0: return u_strToUTF8WithSub( michael@0: dest, destCapacity, pDestLength, michael@0: pSrc, srcLength, michael@0: U_SENTINEL, NULL, michael@0: pErrorCode); michael@0: } michael@0: michael@0: /* Converts the bytes at src to UTF-16 in dest. Bytes up to 0x7f are copied as they are; a lead byte 0xc0..0xdf with one valid trail byte and a lead byte 0xe0..0xef with two valid trail bytes are each decoded inline into a single UChar. Any other sequence is an error when subchar<0 (U_INVALID_CHAR_FOUND is set and NULL is returned); otherwise it is skipped with utf8_nextCharSafeBodyPointer, subchar is written as one or two UChars, and the substitution is counted. srcLength<0 means src is NUL-terminated. Invalid arguments set U_ILLEGAL_ARGUMENT_ERROR and return NULL. At the end reqLength is stored through pDestLength (if non-NULL), the substitution count through pNumSubstitutions (if non-NULL), the buffer is terminated with u_terminateUChars, and dest is returned. */ U_CAPI UChar* U_EXPORT2 michael@0: u_strFromJavaModifiedUTF8WithSub( michael@0: UChar *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const char *src, michael@0: int32_t srcLength, michael@0: UChar32 subchar, int32_t *pNumSubstitutions, michael@0: UErrorCode *pErrorCode) { michael@0: UChar *pDest = dest; michael@0: UChar *pDestLimit = dest+destCapacity; michael@0: UChar32 ch; michael@0: int32_t reqLength = 0; michael@0: const uint8_t* pSrc = (const uint8_t*) src; michael@0: const uint8_t *pSrcLimit; michael@0: int32_t count; michael@0: uint8_t t1, t2; /* trail bytes */ michael@0: int32_t numSubstitutions; michael@0: 
michael@0: /* args check */ michael@0: /* NOTE(review): pErrorCode is dereferenced here without the NULL test that u_strToUTF8WithSub performs - confirm that callers never pass NULL. */ if(U_FAILURE(*pErrorCode)){ michael@0: return NULL; michael@0: } michael@0: if( (src==NULL && srcLength!=0) || srcLength < -1 || michael@0: (dest==NULL && destCapacity!=0) || destCapacity<0 || michael@0: subchar > 0x10ffff || U_IS_SURROGATE(subchar) michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=0; michael@0: } michael@0: numSubstitutions=0; michael@0: michael@0: if(srcLength < 0) { michael@0: /* michael@0: * Transform a NUL-terminated ASCII string. michael@0: * Handle non-ASCII strings with slower code. michael@0: */ michael@0: while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: } michael@0: /* The loop above stopped on the terminator, so the whole input was ASCII and has been written: finish here. */ if(ch == 0) { michael@0: reqLength=(int32_t)(pDest - dest); michael@0: if(pDestLength) { michael@0: *pDestLength = reqLength; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); michael@0: return dest; michael@0: } michael@0: /* Only the part the ASCII loop did not consume is measured; pSrc already points past the converted prefix. */ srcLength = uprv_strlen((const char *)pSrc); michael@0: } michael@0: michael@0: /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ michael@0: pSrcLimit = (pSrc == NULL) ? 
NULL : pSrc + srcLength; michael@0: for(;;) { michael@0: count = (int32_t)(pDestLimit - pDest); michael@0: srcLength = (int32_t)(pSrcLimit - pSrc); michael@0: if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { michael@0: /* fast ASCII loop */ michael@0: const uint8_t *prevSrc = pSrc; michael@0: int32_t delta; michael@0: while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: } michael@0: delta = (int32_t)(pSrc - prevSrc); michael@0: count -= delta; michael@0: srcLength -= delta; michael@0: } michael@0: /* michael@0: * Each iteration of the inner loop progresses by at most 3 UTF-8 michael@0: * bytes and one UChar. michael@0: */ michael@0: srcLength /= 3; michael@0: if(count > srcLength) { michael@0: count = srcLength; /* min(remaining dest, remaining src/3) */ michael@0: } michael@0: if(count < 3) { michael@0: /* michael@0: * Too much overhead if we get near the end of the string, michael@0: * continue with the next loop. michael@0: */ michael@0: break; michael@0: } michael@0: do { michael@0: ch = *pSrc; michael@0: if(ch <= 0x7f){ michael@0: *pDest++=(UChar)ch; michael@0: ++pSrc; michael@0: } else { michael@0: if(ch >= 0xe0) { michael@0: if( /* handle U+0000..U+FFFF inline */ michael@0: ch <= 0xef && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && michael@0: (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f michael@0: ) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); michael@0: pSrc += 3; michael@0: /* continue in a do-while jumps to the loop condition, so count is still decremented for this character */ continue; michael@0: } michael@0: } else { michael@0: if( /* handle U+0000..U+07FF inline */ michael@0: ch >= 0xc0 && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f michael@0: ) { michael@0: *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: if(subchar < 0) { michael@0: *pErrorCode = 
U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else if(subchar > 0xffff && --count == 0) { michael@0: /* michael@0: * We need to write two UChars, adjusted count for that, michael@0: * and ran out of space. michael@0: */ michael@0: break; michael@0: } else { michael@0: /* function call for error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: ++numSubstitutions; michael@0: if(subchar<=0xFFFF) { michael@0: *(pDest++)=(UChar)subchar; michael@0: } else { michael@0: *(pDest++)=U16_LEAD(subchar); michael@0: *(pDest++)=U16_TRAIL(subchar); michael@0: } michael@0: } michael@0: } michael@0: } while(--count > 0); michael@0: } michael@0: michael@0: /* NOTE(review): "while((pSrc= 0xe0)" is truncated; the loop condition and the single-byte case of this bounds-checked loop appear to have been dropped from this copy - restore from the upstream file. */ while((pSrc= 0xe0) { michael@0: if( /* handle U+0000..U+FFFF inline */ michael@0: ch <= 0xef && michael@0: ((pSrcLimit - pSrc) >= 3) && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && michael@0: (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f michael@0: ) { michael@0: /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ michael@0: *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else { michael@0: if( /* handle U+0000..U+07FF inline */ michael@0: ch >= 0xc0 && michael@0: ((pSrcLimit - pSrc) >= 2) && michael@0: (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f michael@0: ) { michael@0: *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: if(subchar < 0) { michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else { michael@0: /* function call for error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: ++numSubstitutions; michael@0: if(subchar<=0xFFFF) { michael@0: *(pDest++)=(UChar)subchar; michael@0: } else { michael@0: 
*(pDest++)=U16_LEAD(subchar); michael@0: /* NOTE(review): "if(pDest= 0xe0)" is truncated; the code after it only adds to reqLength, so the write of the trail surrogate and the head of the counting loop appear to be missing from this copy - restore from the upstream file. */ if(pDest= 0xe0) { michael@0: if( /* handle U+0000..U+FFFF inline */ michael@0: ch <= 0xef && michael@0: ((pSrcLimit - pSrc) >= 3) && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f && michael@0: (uint8_t)(pSrc[2] - 0x80) <= 0x3f michael@0: ) { michael@0: reqLength++; michael@0: pSrc += 3; michael@0: continue; michael@0: } michael@0: } else { michael@0: if( /* handle U+0000..U+07FF inline */ michael@0: ch >= 0xc0 && michael@0: ((pSrcLimit - pSrc) >= 2) && michael@0: (uint8_t)(pSrc[1] - 0x80) <= 0x3f michael@0: ) { michael@0: reqLength++; michael@0: pSrc += 2; michael@0: continue; michael@0: } michael@0: } michael@0: michael@0: if(subchar < 0) { michael@0: *pErrorCode = U_INVALID_CHAR_FOUND; michael@0: return NULL; michael@0: } else { michael@0: /* function call for error cases */ michael@0: ++pSrc; /* continue after the lead byte */ michael@0: utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); michael@0: ++numSubstitutions; michael@0: /* NOTE(review): the writing paths store subchar (one UChar, or two when subchar>0xFFFF) for an ill-formed sequence, while this count is taken from ch; U16_LENGTH(subchar) looks like what was meant - confirm against upstream before changing. */ reqLength+=U16_LENGTH(ch); michael@0: } michael@0: } michael@0: } michael@0: michael@0: if(pNumSubstitutions!=NULL) { michael@0: *pNumSubstitutions=numSubstitutions; michael@0: } michael@0: michael@0: reqLength+=(int32_t)(pDest - dest); michael@0: if(pDestLength) { michael@0: *pDestLength = reqLength; michael@0: } michael@0: michael@0: /* Terminate the buffer */ michael@0: u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); michael@0: return dest; michael@0: } michael@0: michael@0: U_CAPI char* U_EXPORT2 michael@0: u_strToJavaModifiedUTF8( michael@0: char *dest, michael@0: int32_t destCapacity, michael@0: int32_t *pDestLength, michael@0: const UChar *src, michael@0: int32_t srcLength, michael@0: UErrorCode *pErrorCode) { michael@0: int32_t reqLength=0; michael@0: uint32_t ch=0; michael@0: uint8_t *pDest = (uint8_t *)dest; michael@0: uint8_t *pDestLimit = pDest + destCapacity; michael@0: const UChar *pSrcLimit; michael@0: int32_t count; michael@0: michael@0: /* args check */ michael@0: 
if(U_FAILURE(*pErrorCode)){ michael@0: return NULL; michael@0: } michael@0: if( (src==NULL && srcLength!=0) || srcLength < -1 || michael@0: (dest==NULL && destCapacity!=0) || destCapacity<0 michael@0: ) { michael@0: *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; michael@0: return NULL; michael@0: } michael@0: michael@0: if(srcLength==-1) { michael@0: /* Convert NUL-terminated ASCII, then find the string length. */ michael@0: while((ch=*src)<=0x7f && ch != 0 && pDest= srcLength && srcLength > 0 && *src <= 0x7f) { michael@0: /* fast ASCII loop */ michael@0: const UChar *prevSrc = src; michael@0: int32_t delta; michael@0: while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { michael@0: *pDest++=(uint8_t)ch; michael@0: ++src; michael@0: } michael@0: delta = (int32_t)(src - prevSrc); michael@0: count -= delta; michael@0: srcLength -= delta; michael@0: } michael@0: /* michael@0: * Each iteration of the inner loop progresses by at most 3 UTF-8 michael@0: * bytes and one UChar. michael@0: */ michael@0: count /= 3; michael@0: if(count > srcLength) { michael@0: count = srcLength; /* min(remaining dest/3, remaining src) */ michael@0: } michael@0: if(count < 3) { michael@0: /* michael@0: * Too much overhead if we get near the end of the string, michael@0: * continue with the next loop. 
michael@0: */ michael@0: break; michael@0: } michael@0: do { michael@0: ch=*src++; michael@0: if(ch <= 0x7f && ch != 0) { michael@0: *pDest++ = (uint8_t)ch; michael@0: } else if(ch <= 0x7ff) { michael@0: *pDest++=(uint8_t)((ch>>6)|0xc0); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: *pDest++=(uint8_t)((ch>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } michael@0: } while(--count > 0); michael@0: } michael@0: michael@0: while(src= 2) { michael@0: *pDest++=(uint8_t)((ch>>6)|0xc0); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 2; michael@0: break; michael@0: } michael@0: } else { michael@0: if((pDestLimit - pDest) >= 3) { michael@0: *pDest++=(uint8_t)((ch>>12)|0xe0); michael@0: *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); michael@0: *pDest++=(uint8_t)((ch&0x3f)|0x80); michael@0: } else { michael@0: reqLength = 3; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: while(src