1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ustrtrns.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1648 @@ 1.4 +/* 1.5 +****************************************************************************** 1.6 +* 1.7 +* Copyright (C) 2001-2013, International Business Machines 1.8 +* Corporation and others. All Rights Reserved. 1.9 +* 1.10 +****************************************************************************** 1.11 +* 1.12 +* File ustrtrns.cpp 1.13 +* 1.14 +* Modification History: 1.15 +* 1.16 +* Date Name Description 1.17 +* 9/10/2001 Ram Creation. 1.18 +****************************************************************************** 1.19 +*/ 1.20 + 1.21 +/******************************************************************************* 1.22 + * 1.23 + * u_strTo* and u_strFrom* APIs 1.24 + * WCS functions moved to ustr_wcs.c for better modularization 1.25 + * 1.26 + ******************************************************************************* 1.27 + */ 1.28 + 1.29 + 1.30 +#include "unicode/putil.h" 1.31 +#include "unicode/ustring.h" 1.32 +#include "unicode/utf.h" 1.33 +#include "unicode/utf8.h" 1.34 +#include "unicode/utf16.h" 1.35 +#include "cstring.h" 1.36 +#include "cmemory.h" 1.37 +#include "ustr_imp.h" 1.38 +#include "uassert.h" 1.39 + 1.40 +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 1.41 + 1.42 +U_CAPI UChar* U_EXPORT2 1.43 +u_strFromUTF32WithSub(UChar *dest, 1.44 + int32_t destCapacity, 1.45 + int32_t *pDestLength, 1.46 + const UChar32 *src, 1.47 + int32_t srcLength, 1.48 + UChar32 subchar, int32_t *pNumSubstitutions, 1.49 + UErrorCode *pErrorCode) { 1.50 + const UChar32 *srcLimit; 1.51 + UChar32 ch; 1.52 + UChar *destLimit; 1.53 + UChar *pDest; 1.54 + int32_t reqLength; 1.55 + int32_t numSubstitutions; 1.56 + 1.57 + /* args check */ 1.58 + if(U_FAILURE(*pErrorCode)){ 1.59 + return NULL; 1.60 + } 1.61 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.62 + (destCapacity<0) || (dest == NULL && destCapacity > 0) || 1.63 + subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1.64 + ) { 1.65 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.66 + return NULL; 1.67 + } 1.68 + 1.69 + if(pNumSubstitutions != NULL) { 1.70 + *pNumSubstitutions = 0; 1.71 + } 1.72 + 1.73 + pDest = dest; 1.74 + destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 1.75 + reqLength = 0; 1.76 + numSubstitutions = 0; 1.77 + 1.78 + if(srcLength < 0) { 1.79 + /* simple loop for conversion of a NUL-terminated BMP string */ 1.80 + while((ch=*src) != 0 && 1.81 + ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 1.82 + ++src; 1.83 + if(pDest < destLimit) { 1.84 + *pDest++ = (UChar)ch; 1.85 + } else { 1.86 + ++reqLength; 1.87 + } 1.88 + } 1.89 + srcLimit = src; 1.90 + if(ch != 0) { 1.91 + /* "complicated" case, find the end of the remaining string */ 1.92 + while(*++srcLimit != 0) {} 1.93 + } 1.94 + } else { 1.95 + srcLimit = (src!=NULL)?(src + srcLength):NULL; 1.96 + } 1.97 + 1.98 + /* convert with length */ 1.99 + while(src < srcLimit) { 1.100 + ch = *src++; 1.101 + do { 1.102 + /* usually "loops" once; twice only for writing subchar */ 1.103 + if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 1.104 + if(pDest < destLimit) { 1.105 + *pDest++ = (UChar)ch; 1.106 + } else { 1.107 + ++reqLength; 1.108 + } 1.109 + break; 1.110 + } else if(0x10000 <= ch && ch <= 0x10ffff) { 1.111 + if(pDest!=NULL && ((pDest + 2) <= destLimit)) { 1.112 + *pDest++ = U16_LEAD(ch); 1.113 + *pDest++ = U16_TRAIL(ch); 1.114 + } else { 1.115 + reqLength += 2; 1.116 + } 1.117 + break; 1.118 + } else if((ch = subchar) < 0) { 1.119 + /* surrogate code point, or not a Unicode code point at all */ 1.120 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.121 + return NULL; 1.122 + } else { 1.123 + ++numSubstitutions; 1.124 + } 1.125 + } while(TRUE); 1.126 + } 1.127 + 1.128 + reqLength += (int32_t)(pDest - dest); 1.129 + if(pDestLength) { 1.130 + *pDestLength = reqLength; 1.131 + } 1.132 + if(pNumSubstitutions != NULL) { 1.133 + *pNumSubstitutions = numSubstitutions; 1.134 + } 1.135 + 1.136 + /* Terminate the buffer */ 1.137 + u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1.138 + 1.139 + return dest; 1.140 +} 1.141 + 1.142 +U_CAPI UChar* U_EXPORT2 1.143 +u_strFromUTF32(UChar *dest, 1.144 + int32_t destCapacity, 1.145 + int32_t *pDestLength, 1.146 + const UChar32 *src, 1.147 + int32_t srcLength, 1.148 + UErrorCode *pErrorCode) { 1.149 + return u_strFromUTF32WithSub( 1.150 + dest, destCapacity, pDestLength, 1.151 + src, srcLength, 1.152 + U_SENTINEL, NULL, 1.153 + pErrorCode); 1.154 +} 1.155 + 1.156 +U_CAPI UChar32* U_EXPORT2 1.157 +u_strToUTF32WithSub(UChar32 *dest, 1.158 + int32_t destCapacity, 1.159 + int32_t *pDestLength, 1.160 + const UChar *src, 1.161 + int32_t srcLength, 1.162 + UChar32 subchar, int32_t *pNumSubstitutions, 1.163 + UErrorCode *pErrorCode) { 1.164 + const UChar *srcLimit; 1.165 + UChar32 ch; 1.166 + UChar ch2; 1.167 + UChar32 *destLimit; 1.168 + UChar32 *pDest; 1.169 + int32_t reqLength; 1.170 + int32_t numSubstitutions; 1.171 + 1.172 + /* args check */ 1.173 + if(U_FAILURE(*pErrorCode)){ 1.174 + return NULL; 1.175 + } 1.176 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.177 + (destCapacity<0) || (dest == NULL && destCapacity > 0) || 1.178 + subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1.179 + ) { 1.180 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.181 + return NULL; 1.182 + } 1.183 + 1.184 + if(pNumSubstitutions != NULL) { 1.185 + *pNumSubstitutions = 0; 1.186 + } 1.187 + 1.188 + pDest = dest; 1.189 + destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 1.190 + reqLength = 0; 1.191 + numSubstitutions = 0; 1.192 + 1.193 + if(srcLength < 0) { 1.194 + /* simple loop for conversion of a NUL-terminated BMP string */ 1.195 + while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 1.196 + ++src; 1.197 + if(pDest < destLimit) { 1.198 + *pDest++ = ch; 1.199 + } else { 1.200 + ++reqLength; 1.201 + } 1.202 + } 1.203 + srcLimit = src; 1.204 + if(ch != 0) { 1.205 + /* "complicated" case, find the end of the remaining string */ 1.206 + while(*++srcLimit != 0) {} 1.207 + } 1.208 + } else { 1.209 + srcLimit = (src!=NULL)?(src + srcLength):NULL; 1.210 + } 1.211 + 1.212 + /* convert with length */ 1.213 + while(src < srcLimit) { 1.214 + ch = *src++; 1.215 + if(!U16_IS_SURROGATE(ch)) { 1.216 + /* write or count ch below */ 1.217 + } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 1.218 + ++src; 1.219 + ch = U16_GET_SUPPLEMENTARY(ch, ch2); 1.220 + } else if((ch = subchar) < 0) { 1.221 + /* unpaired surrogate */ 1.222 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.223 + return NULL; 1.224 + } else { 1.225 + ++numSubstitutions; 1.226 + } 1.227 + if(pDest < destLimit) { 1.228 + *pDest++ = ch; 1.229 + } else { 1.230 + ++reqLength; 1.231 + } 1.232 + } 1.233 + 1.234 + reqLength += (int32_t)(pDest - dest); 1.235 + if(pDestLength) { 1.236 + *pDestLength = reqLength; 1.237 + } 1.238 + if(pNumSubstitutions != NULL) { 1.239 + *pNumSubstitutions = numSubstitutions; 1.240 + } 1.241 + 1.242 + /* Terminate the buffer */ 1.243 + u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 1.244 + 1.245 + return dest; 1.246 +} 1.247 + 1.248 +U_CAPI UChar32* U_EXPORT2 1.249 +u_strToUTF32(UChar32 *dest, 1.250 + int32_t destCapacity, 1.251 + int32_t *pDestLength, 1.252 + const UChar *src, 1.253 + int32_t srcLength, 1.254 + UErrorCode *pErrorCode) { 1.255 + return u_strToUTF32WithSub( 1.256 + dest, destCapacity, pDestLength, 1.257 + src, srcLength, 1.258 + U_SENTINEL, NULL, 1.259 + pErrorCode); 1.260 +} 1.261 + 1.262 +/* for utf8_nextCharSafeBodyTerminated() */ 1.263 +static const UChar32 1.264 +utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 1.265 + 1.266 +/* 1.267 + * Version of utf8_nextCharSafeBody() with the following differences: 1.268 + * - checks for NUL termination instead of length 1.269 + * - works with pointers instead of indexes 1.270 + * - always strict (strict==-1) 1.271 + * 1.272 + * *ps points to after the lead byte and will be moved to after the last trail byte. 1.273 + * c is the lead byte. 1.274 + * @return the code point, or U_SENTINEL 1.275 + */ 1.276 +static UChar32 1.277 +utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 1.278 + const uint8_t *s=*ps; 1.279 + uint8_t trail, illegal=0; 1.280 + uint8_t count=U8_COUNT_TRAIL_BYTES(c); 1.281 + U_ASSERT(count<6); 1.282 + U8_MASK_LEAD_BYTE((c), count); 1.283 + /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 1.284 + switch(count) { 1.285 + /* each branch falls through to the next one */ 1.286 + case 5: 1.287 + case 4: 1.288 + /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 1.289 + illegal=1; 1.290 + break; 1.291 + case 3: 1.292 + trail=(uint8_t)(*s++ - 0x80); 1.293 + c=(c<<6)|trail; 1.294 + if(trail>0x3f || c>=0x110) { 1.295 + /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 1.296 + illegal=1; 1.297 + break; 1.298 + } 1.299 + case 2: /*fall through*/ 1.300 + trail=(uint8_t)(*s++ - 0x80); 1.301 + if(trail>0x3f) { 1.302 + /* not a trail byte */ 1.303 + illegal=1; 1.304 + break; 1.305 + } 1.306 + c=(c<<6)|trail; 1.307 + case 1: /*fall through*/ 1.308 + trail=(uint8_t)(*s++ - 0x80); 1.309 + if(trail>0x3f) { 1.310 + /* not a trail byte */ 1.311 + illegal=1; 1.312 + } 1.313 + c=(c<<6)|trail; 1.314 + break; 1.315 + case 0: 1.316 + return U_SENTINEL; 1.317 + /* no default branch to optimize switch() - all values are covered */ 1.318 + } 1.319 + 1.320 + /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 1.321 + /* illegal is also set if count>=4 */ 1.322 + if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 1.323 + /* error handling */ 1.324 + /* don't go beyond this sequence */ 1.325 + s=*ps; 1.326 + while(count>0 && U8_IS_TRAIL(*s)) { 1.327 + ++s; 1.328 + --count; 1.329 + } 1.330 + c=U_SENTINEL; 1.331 + } 1.332 + *ps=s; 1.333 + return c; 1.334 +} 1.335 + 1.336 +/* 1.337 + * Version of utf8_nextCharSafeBody() with the following differences: 1.338 + * - works with pointers instead of indexes 1.339 + * - always strict (strict==-1) 1.340 + * 1.341 + * *ps points to after the lead byte and will be moved to after the last trail byte. 1.342 + * c is the lead byte. 1.343 + * @return the code point, or U_SENTINEL 1.344 + */ 1.345 +static UChar32 1.346 +utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 1.347 + const uint8_t *s=*ps; 1.348 + uint8_t trail, illegal=0; 1.349 + uint8_t count=U8_COUNT_TRAIL_BYTES(c); 1.350 + if((limit-s)>=count) { 1.351 + U8_MASK_LEAD_BYTE((c), count); 1.352 + /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 1.353 + switch(count) { 1.354 + /* each branch falls through to the next one */ 1.355 + case 5: 1.356 + case 4: 1.357 + /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 1.358 + illegal=1; 1.359 + break; 1.360 + case 3: 1.361 + trail=*s++; 1.362 + c=(c<<6)|(trail&0x3f); 1.363 + if(c<0x110) { 1.364 + illegal|=(trail&0xc0)^0x80; 1.365 + } else { 1.366 + /* code point>0x10ffff, outside Unicode */ 1.367 + illegal=1; 1.368 + break; 1.369 + } 1.370 + case 2: /*fall through*/ 1.371 + trail=*s++; 1.372 + c=(c<<6)|(trail&0x3f); 1.373 + illegal|=(trail&0xc0)^0x80; 1.374 + case 1: /*fall through*/ 1.375 + trail=*s++; 1.376 + c=(c<<6)|(trail&0x3f); 1.377 + illegal|=(trail&0xc0)^0x80; 1.378 + break; 1.379 + case 0: 1.380 + return U_SENTINEL; 1.381 + /* no default branch to optimize switch() - all values are covered */ 1.382 + } 1.383 + } else { 1.384 + illegal=1; /* too few bytes left */ 1.385 + } 1.386 + 1.387 + /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 1.388 + /* illegal is also set if count>=4 */ 1.389 + U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal)); 1.390 + if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 1.391 + /* error handling */ 1.392 + /* don't go beyond this sequence */ 1.393 + s=*ps; 1.394 + while(count>0 && s<limit && U8_IS_TRAIL(*s)) { 1.395 + ++s; 1.396 + --count; 1.397 + } 1.398 + c=U_SENTINEL; 1.399 + } 1.400 + *ps=s; 1.401 + return c; 1.402 +} 1.403 + 1.404 +U_CAPI UChar* U_EXPORT2 1.405 +u_strFromUTF8WithSub(UChar *dest, 1.406 + int32_t destCapacity, 1.407 + int32_t *pDestLength, 1.408 + const char* src, 1.409 + int32_t srcLength, 1.410 + UChar32 subchar, int32_t *pNumSubstitutions, 1.411 + UErrorCode *pErrorCode){ 1.412 + UChar *pDest = dest; 1.413 + UChar *pDestLimit = dest+destCapacity; 1.414 + UChar32 ch; 1.415 + int32_t reqLength = 0; 1.416 + const uint8_t* pSrc = (const uint8_t*) src; 1.417 + uint8_t t1, t2; /* trail bytes */ 1.418 + int32_t numSubstitutions; 1.419 + 1.420 + /* args check */ 1.421 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 1.422 + return NULL; 1.423 + } 1.424 + 1.425 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.426 + (destCapacity<0) || (dest == NULL && destCapacity > 0) || 1.427 + subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1.428 + ) { 1.429 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.430 + return NULL; 1.431 + } 1.432 + 1.433 + if(pNumSubstitutions!=NULL) { 1.434 + *pNumSubstitutions=0; 1.435 + } 1.436 + numSubstitutions=0; 1.437 + 1.438 + /* 1.439 + * Inline processing of UTF-8 byte sequences: 1.440 + * 1.441 + * Byte sequences for the most common characters are handled inline in 1.442 + * the conversion loops. In order to reduce the path lengths for those 1.443 + * characters, the tests are arranged in a kind of binary search. 1.444 + * ASCII (<=0x7f) is checked first, followed by the dividing point 1.445 + * between 2- and 3-byte sequences (0xe0). 1.446 + * The 3-byte branch is tested first to speed up CJK text. 1.447 + * The compiler should combine the subtractions for the two tests for 0xe0. 1.448 + * Each branch then tests for the other end of its range. 1.449 + */ 1.450 + 1.451 + if(srcLength < 0){ 1.452 + /* 1.453 + * Transform a NUL-terminated string. 1.454 + * The code explicitly checks for NULs only in the lead byte position. 1.455 + * A NUL byte in the trail byte position fails the trail byte range check anyway. 1.456 + */ 1.457 + while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 1.458 + if(ch <= 0x7f){ 1.459 + *pDest++=(UChar)ch; 1.460 + ++pSrc; 1.461 + } else { 1.462 + if(ch > 0xe0) { 1.463 + if( /* handle U+1000..U+CFFF inline */ 1.464 + ch <= 0xec && 1.465 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1.466 + (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1.467 + ) { 1.468 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.469 + *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1.470 + pSrc += 3; 1.471 + continue; 1.472 + } 1.473 + } else if(ch < 0xe0) { 1.474 + if( /* handle U+0080..U+07FF inline */ 1.475 + ch >= 0xc2 && 1.476 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1.477 + ) { 1.478 + *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1.479 + pSrc += 2; 1.480 + continue; 1.481 + } 1.482 + } 1.483 + 1.484 + /* function call for "complicated" and error cases */ 1.485 + ++pSrc; /* continue after the lead byte */ 1.486 + ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 1.487 + if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 1.488 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.489 + return NULL; 1.490 + } else if(ch<=0xFFFF) { 1.491 + *(pDest++)=(UChar)ch; 1.492 + } else { 1.493 + *(pDest++)=U16_LEAD(ch); 1.494 + if(pDest<pDestLimit) { 1.495 + *(pDest++)=U16_TRAIL(ch); 1.496 + } else { 1.497 + reqLength++; 1.498 + break; 1.499 + } 1.500 + } 1.501 + } 1.502 + } 1.503 + 1.504 + /* Pre-flight the rest of the string. */ 1.505 + while((ch = *pSrc) != 0) { 1.506 + if(ch <= 0x7f){ 1.507 + ++reqLength; 1.508 + ++pSrc; 1.509 + } else { 1.510 + if(ch > 0xe0) { 1.511 + if( /* handle U+1000..U+CFFF inline */ 1.512 + ch <= 0xec && 1.513 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1.514 + (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1.515 + ) { 1.516 + ++reqLength; 1.517 + pSrc += 3; 1.518 + continue; 1.519 + } 1.520 + } else if(ch < 0xe0) { 1.521 + if( /* handle U+0080..U+07FF inline */ 1.522 + ch >= 0xc2 && 1.523 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1.524 + ) { 1.525 + ++reqLength; 1.526 + pSrc += 2; 1.527 + continue; 1.528 + } 1.529 + } 1.530 + 1.531 + /* function call for "complicated" and error cases */ 1.532 + ++pSrc; /* continue after the lead byte */ 1.533 + ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 1.534 + if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 1.535 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.536 + return NULL; 1.537 + } 1.538 + reqLength += U16_LENGTH(ch); 1.539 + } 1.540 + } 1.541 + } else /* srcLength >= 0 */ { 1.542 + const uint8_t *pSrcLimit = pSrc + srcLength; 1.543 + int32_t count; 1.544 + 1.545 + /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1.546 + for(;;) { 1.547 + /* 1.548 + * Each iteration of the inner loop progresses by at most 3 UTF-8 1.549 + * bytes and one UChar, for most characters. 1.550 + * For supplementary code points (4 & 2), which are rare, 1.551 + * there is an additional adjustment. 1.552 + */ 1.553 + count = (int32_t)(pDestLimit - pDest); 1.554 + srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 1.555 + if(count > srcLength) { 1.556 + count = srcLength; /* min(remaining dest, remaining src/3) */ 1.557 + } 1.558 + if(count < 3) { 1.559 + /* 1.560 + * Too much overhead if we get near the end of the string, 1.561 + * continue with the next loop. 1.562 + */ 1.563 + break; 1.564 + } 1.565 + 1.566 + do { 1.567 + ch = *pSrc; 1.568 + if(ch <= 0x7f){ 1.569 + *pDest++=(UChar)ch; 1.570 + ++pSrc; 1.571 + } else { 1.572 + if(ch > 0xe0) { 1.573 + if( /* handle U+1000..U+CFFF inline */ 1.574 + ch <= 0xec && 1.575 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1.576 + (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1.577 + ) { 1.578 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.579 + *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1.580 + pSrc += 3; 1.581 + continue; 1.582 + } 1.583 + } else if(ch < 0xe0) { 1.584 + if( /* handle U+0080..U+07FF inline */ 1.585 + ch >= 0xc2 && 1.586 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1.587 + ) { 1.588 + *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1.589 + pSrc += 2; 1.590 + continue; 1.591 + } 1.592 + } 1.593 + 1.594 + if(ch >= 0xf0 || subchar > 0xffff) { 1.595 + /* 1.596 + * We may read up to six bytes and write up to two UChars, 1.597 + * which we didn't account for with computing count, 1.598 + * so we adjust it here. 1.599 + */ 1.600 + if(--count == 0) { 1.601 + break; 1.602 + } 1.603 + } 1.604 + 1.605 + /* function call for "complicated" and error cases */ 1.606 + ++pSrc; /* continue after the lead byte */ 1.607 + ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.608 + if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 1.609 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.610 + return NULL; 1.611 + }else if(ch<=0xFFFF){ 1.612 + *(pDest++)=(UChar)ch; 1.613 + }else{ 1.614 + *(pDest++)=U16_LEAD(ch); 1.615 + *(pDest++)=U16_TRAIL(ch); 1.616 + } 1.617 + } 1.618 + } while(--count > 0); 1.619 + } 1.620 + 1.621 + while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1.622 + ch = *pSrc; 1.623 + if(ch <= 0x7f){ 1.624 + *pDest++=(UChar)ch; 1.625 + ++pSrc; 1.626 + } else { 1.627 + if(ch > 0xe0) { 1.628 + if( /* handle U+1000..U+CFFF inline */ 1.629 + ch <= 0xec && 1.630 + ((pSrcLimit - pSrc) >= 3) && 1.631 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1.632 + (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1.633 + ) { 1.634 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.635 + *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1.636 + pSrc += 3; 1.637 + continue; 1.638 + } 1.639 + } else if(ch < 0xe0) { 1.640 + if( /* handle U+0080..U+07FF inline */ 1.641 + ch >= 0xc2 && 1.642 + ((pSrcLimit - pSrc) >= 2) && 1.643 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1.644 + ) { 1.645 + *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1.646 + pSrc += 2; 1.647 + continue; 1.648 + } 1.649 + } 1.650 + 1.651 + /* function call for "complicated" and error cases */ 1.652 + ++pSrc; /* continue after the lead byte */ 1.653 + ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.654 + if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 1.655 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.656 + return NULL; 1.657 + }else if(ch<=0xFFFF){ 1.658 + *(pDest++)=(UChar)ch; 1.659 + }else{ 1.660 + *(pDest++)=U16_LEAD(ch); 1.661 + if(pDest<pDestLimit){ 1.662 + *(pDest++)=U16_TRAIL(ch); 1.663 + }else{ 1.664 + reqLength++; 1.665 + break; 1.666 + } 1.667 + } 1.668 + } 1.669 + } 1.670 + /* do not fill the dest buffer just count the UChars needed */ 1.671 + while(pSrc < pSrcLimit){ 1.672 + ch = *pSrc; 1.673 + if(ch <= 0x7f){ 1.674 + reqLength++; 1.675 + ++pSrc; 1.676 + } else { 1.677 + if(ch > 0xe0) { 1.678 + if( /* handle U+1000..U+CFFF inline */ 1.679 + ch <= 0xec && 1.680 + ((pSrcLimit - pSrc) >= 3) && 1.681 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1.682 + (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1.683 + ) { 1.684 + reqLength++; 1.685 + pSrc += 3; 1.686 + continue; 1.687 + } 1.688 + } else if(ch < 0xe0) { 1.689 + if( /* handle U+0080..U+07FF inline */ 1.690 + ch >= 0xc2 && 1.691 + ((pSrcLimit - pSrc) >= 2) && 1.692 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1.693 + ) { 1.694 + reqLength++; 1.695 + pSrc += 2; 1.696 + continue; 1.697 + } 1.698 + } 1.699 + 1.700 + /* function call for "complicated" and error cases */ 1.701 + ++pSrc; /* continue after the lead byte */ 1.702 + ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.703 + if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 1.704 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.705 + return NULL; 1.706 + } 1.707 + reqLength+=U16_LENGTH(ch); 1.708 + } 1.709 + } 1.710 + } 1.711 + 1.712 + reqLength+=(int32_t)(pDest - dest); 1.713 + 1.714 + if(pNumSubstitutions!=NULL) { 1.715 + *pNumSubstitutions=numSubstitutions; 1.716 + } 1.717 + 1.718 + if(pDestLength){ 1.719 + *pDestLength = reqLength; 1.720 + } 1.721 + 1.722 + /* Terminate the buffer */ 1.723 + u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 1.724 + 1.725 + return dest; 1.726 +} 1.727 + 1.728 +U_CAPI UChar* U_EXPORT2 1.729 +u_strFromUTF8(UChar *dest, 1.730 + int32_t destCapacity, 1.731 + int32_t *pDestLength, 1.732 + const char* src, 1.733 + int32_t srcLength, 1.734 + UErrorCode *pErrorCode){ 1.735 + return u_strFromUTF8WithSub( 1.736 + dest, destCapacity, pDestLength, 1.737 + src, srcLength, 1.738 + U_SENTINEL, NULL, 1.739 + pErrorCode); 1.740 +} 1.741 + 1.742 +U_CAPI UChar * U_EXPORT2 1.743 +u_strFromUTF8Lenient(UChar *dest, 1.744 + int32_t destCapacity, 1.745 + int32_t *pDestLength, 1.746 + const char *src, 1.747 + int32_t srcLength, 1.748 + UErrorCode *pErrorCode) { 1.749 + UChar *pDest = dest; 1.750 + UChar32 ch; 1.751 + int32_t reqLength = 0; 1.752 + uint8_t* pSrc = (uint8_t*) src; 1.753 + 1.754 + /* args check */ 1.755 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 1.756 + return NULL; 1.757 + } 1.758 + 1.759 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.760 + (destCapacity<0) || (dest == NULL && destCapacity > 0) 1.761 + ) { 1.762 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.763 + return NULL; 1.764 + } 1.765 + 1.766 + if(srcLength < 0) { 1.767 + /* Transform a NUL-terminated string. */ 1.768 + UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 1.769 + uint8_t t1, t2, t3; /* trail bytes */ 1.770 + 1.771 + while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 1.772 + if(ch < 0xc0) { 1.773 + /* 1.774 + * ASCII, or a trail byte in lead position which is treated like 1.775 + * a single-byte sequence for better character boundary 1.776 + * resynchronization after illegal sequences. 1.777 + */ 1.778 + *pDest++=(UChar)ch; 1.779 + ++pSrc; 1.780 + continue; 1.781 + } else if(ch < 0xe0) { /* U+0080..U+07FF */ 1.782 + if((t1 = pSrc[1]) != 0) { 1.783 + /* 0x3080 = (0xc0 << 6) + 0x80 */ 1.784 + *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 1.785 + pSrc += 2; 1.786 + continue; 1.787 + } 1.788 + } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 1.789 + if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 1.790 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.791 + /* 0x2080 = (0x80 << 6) + 0x80 */ 1.792 + *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 1.793 + pSrc += 3; 1.794 + continue; 1.795 + } 1.796 + } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 1.797 + if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 1.798 + pSrc += 4; 1.799 + /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 1.800 + ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 1.801 + *(pDest++) = U16_LEAD(ch); 1.802 + if(pDest < pDestLimit) { 1.803 + *(pDest++) = U16_TRAIL(ch); 1.804 + } else { 1.805 + reqLength = 1; 1.806 + break; 1.807 + } 1.808 + continue; 1.809 + } 1.810 + } 1.811 + 1.812 + /* truncated character at the end */ 1.813 + *pDest++ = 0xfffd; 1.814 + while(*++pSrc != 0) {} 1.815 + break; 1.816 + } 1.817 + 1.818 + /* Pre-flight the rest of the string. */ 1.819 + while((ch = *pSrc) != 0) { 1.820 + if(ch < 0xc0) { 1.821 + /* 1.822 + * ASCII, or a trail byte in lead position which is treated like 1.823 + * a single-byte sequence for better character boundary 1.824 + * resynchronization after illegal sequences. 1.825 + */ 1.826 + ++reqLength; 1.827 + ++pSrc; 1.828 + continue; 1.829 + } else if(ch < 0xe0) { /* U+0080..U+07FF */ 1.830 + if(pSrc[1] != 0) { 1.831 + ++reqLength; 1.832 + pSrc += 2; 1.833 + continue; 1.834 + } 1.835 + } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 1.836 + if(pSrc[1] != 0 && pSrc[2] != 0) { 1.837 + ++reqLength; 1.838 + pSrc += 3; 1.839 + continue; 1.840 + } 1.841 + } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 1.842 + if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 1.843 + reqLength += 2; 1.844 + pSrc += 4; 1.845 + continue; 1.846 + } 1.847 + } 1.848 + 1.849 + /* truncated character at the end */ 1.850 + ++reqLength; 1.851 + break; 1.852 + } 1.853 + } else /* srcLength >= 0 */ { 1.854 + const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; 1.855 + 1.856 + /* 1.857 + * This function requires that if srcLength is given, then it must be 1.858 + * destCapatity >= srcLength so that we need not check for 1.859 + * destination buffer overflow in the loop. 1.860 + */ 1.861 + if(destCapacity < srcLength) { 1.862 + if(pDestLength != NULL) { 1.863 + *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 1.864 + } 1.865 + *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 1.866 + return NULL; 1.867 + } 1.868 + 1.869 + if((pSrcLimit - pSrc) >= 4) { 1.870 + pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 1.871 + 1.872 + /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 1.873 + do { 1.874 + ch = *pSrc++; 1.875 + if(ch < 0xc0) { 1.876 + /* 1.877 + * ASCII, or a trail byte in lead position which is treated like 1.878 + * a single-byte sequence for better character boundary 1.879 + * resynchronization after illegal sequences. 1.880 + */ 1.881 + *pDest++=(UChar)ch; 1.882 + } else if(ch < 0xe0) { /* U+0080..U+07FF */ 1.883 + /* 0x3080 = (0xc0 << 6) + 0x80 */ 1.884 + *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 1.885 + } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 1.886 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.887 + /* 0x2080 = (0x80 << 6) + 0x80 */ 1.888 + ch = (ch << 12) + (*pSrc++ << 6); 1.889 + *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 1.890 + } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 1.891 + /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 1.892 + ch = (ch << 18) + (*pSrc++ << 12); 1.893 + ch += *pSrc++ << 6; 1.894 + ch += *pSrc++ - 0x3c82080; 1.895 + *(pDest++) = U16_LEAD(ch); 1.896 + *(pDest++) = U16_TRAIL(ch); 1.897 + } 1.898 + } while(pSrc < pSrcLimit); 1.899 + 1.900 + pSrcLimit += 3; /* restore original pSrcLimit */ 1.901 + } 1.902 + 1.903 + while(pSrc < pSrcLimit) { 1.904 + ch = *pSrc++; 1.905 + if(ch < 0xc0) { 1.906 + /* 1.907 + * ASCII, or a trail byte in lead position which is treated like 1.908 + * a single-byte sequence for better character boundary 1.909 + * resynchronization after illegal sequences. 1.910 + */ 1.911 + *pDest++=(UChar)ch; 1.912 + continue; 1.913 + } else if(ch < 0xe0) { /* U+0080..U+07FF */ 1.914 + if(pSrc < pSrcLimit) { 1.915 + /* 0x3080 = (0xc0 << 6) + 0x80 */ 1.916 + *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 1.917 + continue; 1.918 + } 1.919 + } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 1.920 + if((pSrcLimit - pSrc) >= 2) { 1.921 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.922 + /* 0x2080 = (0x80 << 6) + 0x80 */ 1.923 + ch = (ch << 12) + (*pSrc++ << 6); 1.924 + *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 1.925 + pSrc += 3; 1.926 + continue; 1.927 + } 1.928 + } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 1.929 + if((pSrcLimit - pSrc) >= 3) { 1.930 + /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 1.931 + ch = (ch << 18) + (*pSrc++ << 12); 1.932 + ch += *pSrc++ << 6; 1.933 + ch += *pSrc++ - 0x3c82080; 1.934 + *(pDest++) = U16_LEAD(ch); 1.935 + *(pDest++) = U16_TRAIL(ch); 1.936 + pSrc += 4; 1.937 + continue; 1.938 + } 1.939 + } 1.940 + 1.941 + /* truncated character at the end */ 1.942 + *pDest++ = 0xfffd; 1.943 + break; 1.944 + } 1.945 + } 1.946 + 1.947 + reqLength+=(int32_t)(pDest - dest); 1.948 + 1.949 + if(pDestLength){ 1.950 + *pDestLength = reqLength; 1.951 + } 1.952 + 1.953 + /* Terminate the buffer */ 1.954 + u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 1.955 + 1.956 + return dest; 1.957 +} 1.958 + 1.959 +static inline uint8_t * 1.960 +_appendUTF8(uint8_t *pDest, UChar32 c) { 1.961 + /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 1.962 + if((c)<=0x7f) { 1.963 + *pDest++=(uint8_t)c; 1.964 + } else if(c<=0x7ff) { 1.965 + *pDest++=(uint8_t)((c>>6)|0xc0); 1.966 + *pDest++=(uint8_t)((c&0x3f)|0x80); 1.967 + } else if(c<=0xffff) { 1.968 + *pDest++=(uint8_t)((c>>12)|0xe0); 1.969 + *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 1.970 + *pDest++=(uint8_t)(((c)&0x3f)|0x80); 1.971 + } else /* if((uint32_t)(c)<=0x10ffff) */ { 1.972 + *pDest++=(uint8_t)(((c)>>18)|0xf0); 1.973 + *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 1.974 + *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 1.975 + *pDest++=(uint8_t)(((c)&0x3f)|0x80); 1.976 + } 1.977 + return pDest; 1.978 +} 1.979 + 1.980 + 1.981 +U_CAPI char* U_EXPORT2 1.982 +u_strToUTF8WithSub(char *dest, 1.983 + int32_t destCapacity, 1.984 + int32_t *pDestLength, 1.985 + const UChar *pSrc, 1.986 + int32_t srcLength, 1.987 + UChar32 subchar, int32_t *pNumSubstitutions, 1.988 + UErrorCode *pErrorCode){ 1.989 + int32_t reqLength=0; 1.990 + uint32_t ch=0,ch2=0; 1.991 + uint8_t *pDest = (uint8_t *)dest; 1.992 + uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; 1.993 + int32_t numSubstitutions; 1.994 + 1.995 + /* args check */ 1.996 + if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 1.997 + return NULL; 1.998 + } 1.999 + 1.1000 + if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 1.1001 + (destCapacity<0) || (dest == NULL && destCapacity > 0) || 1.1002 + subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1.1003 + ) { 1.1004 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.1005 + return NULL; 1.1006 + } 1.1007 + 1.1008 + if(pNumSubstitutions!=NULL) { 1.1009 + *pNumSubstitutions=0; 1.1010 + } 1.1011 + numSubstitutions=0; 1.1012 + 1.1013 + if(srcLength==-1) { 1.1014 + while((ch=*pSrc)!=0) { 1.1015 + ++pSrc; 1.1016 + if(ch <= 0x7f) { 1.1017 + if(pDest<pDestLimit) { 1.1018 + *pDest++ = (uint8_t)ch; 1.1019 + } else { 1.1020 + reqLength = 1; 1.1021 + break; 1.1022 + } 1.1023 + } else if(ch <= 0x7ff) { 1.1024 + if((pDestLimit - pDest) >= 2) { 1.1025 + *pDest++=(uint8_t)((ch>>6)|0xc0); 1.1026 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1027 + } else { 1.1028 + reqLength = 2; 1.1029 + break; 1.1030 + } 1.1031 + } else if(ch <= 0xd7ff || ch >= 0xe000) { 1.1032 + if((pDestLimit - pDest) >= 3) { 1.1033 + *pDest++=(uint8_t)((ch>>12)|0xe0); 1.1034 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1035 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1036 + } else { 1.1037 + reqLength = 3; 1.1038 + break; 1.1039 + } 1.1040 + } else /* ch is a surrogate */ { 1.1041 + int32_t length; 1.1042 + 1.1043 + /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ 1.1044 + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1.1045 + ++pSrc; 1.1046 + ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1.1047 + } else if(subchar>=0) { 1.1048 + ch=subchar; 1.1049 + ++numSubstitutions; 1.1050 + } else { 1.1051 + /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1.1052 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1053 + return NULL; 1.1054 + } 1.1055 + 1.1056 + length = U8_LENGTH(ch); 1.1057 + if((pDestLimit - pDest) >= length) { 1.1058 + /* convert and append*/ 1.1059 + pDest=_appendUTF8(pDest, ch); 1.1060 + } else { 1.1061 + reqLength = length; 1.1062 + break; 1.1063 + } 1.1064 + } 1.1065 + } 1.1066 + while((ch=*pSrc++)!=0) { 1.1067 + if(ch<=0x7f) { 1.1068 + ++reqLength; 1.1069 + } else if(ch<=0x7ff) { 1.1070 + reqLength+=2; 1.1071 + } else if(!U16_IS_SURROGATE(ch)) { 1.1072 + reqLength+=3; 1.1073 + } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1.1074 + ++pSrc; 1.1075 + reqLength+=4; 1.1076 + } else if(subchar>=0) { 1.1077 + reqLength+=U8_LENGTH(subchar); 1.1078 + ++numSubstitutions; 1.1079 + } else { 1.1080 + /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1.1081 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1082 + return NULL; 1.1083 + } 1.1084 + } 1.1085 + } else { 1.1086 + const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; 1.1087 + int32_t count; 1.1088 + 1.1089 + /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1.1090 + for(;;) { 1.1091 + /* 1.1092 + * Each iteration of the inner loop progresses by at most 3 UTF-8 1.1093 + * bytes and one UChar, for most characters. 1.1094 + * For supplementary code points (4 & 2), which are rare, 1.1095 + * there is an additional adjustment. 1.1096 + */ 1.1097 + count = (int32_t)((pDestLimit - pDest) / 3); 1.1098 + srcLength = (int32_t)(pSrcLimit - pSrc); 1.1099 + if(count > srcLength) { 1.1100 + count = srcLength; /* min(remaining dest/3, remaining src) */ 1.1101 + } 1.1102 + if(count < 3) { 1.1103 + /* 1.1104 + * Too much overhead if we get near the end of the string, 1.1105 + * continue with the next loop. 1.1106 + */ 1.1107 + break; 1.1108 + } 1.1109 + do { 1.1110 + ch=*pSrc++; 1.1111 + if(ch <= 0x7f) { 1.1112 + *pDest++ = (uint8_t)ch; 1.1113 + } else if(ch <= 0x7ff) { 1.1114 + *pDest++=(uint8_t)((ch>>6)|0xc0); 1.1115 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1116 + } else if(ch <= 0xd7ff || ch >= 0xe000) { 1.1117 + *pDest++=(uint8_t)((ch>>12)|0xe0); 1.1118 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1119 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1120 + } else /* ch is a surrogate */ { 1.1121 + /* 1.1122 + * We will read two UChars and probably output four bytes, 1.1123 + * which we didn't account for with computing count, 1.1124 + * so we adjust it here. 1.1125 + */ 1.1126 + if(--count == 0) { 1.1127 + --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1.1128 + break; /* recompute count */ 1.1129 + } 1.1130 + 1.1131 + if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1.1132 + ++pSrc; 1.1133 + ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1.1134 + 1.1135 + /* writing 4 bytes per 2 UChars is ok */ 1.1136 + *pDest++=(uint8_t)((ch>>18)|0xf0); 1.1137 + *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1.1138 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1139 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1140 + } else { 1.1141 + /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1.1142 + if(subchar>=0) { 1.1143 + ch=subchar; 1.1144 + ++numSubstitutions; 1.1145 + } else { 1.1146 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1147 + return NULL; 1.1148 + } 1.1149 + 1.1150 + /* convert and append*/ 1.1151 + pDest=_appendUTF8(pDest, ch); 1.1152 + } 1.1153 + } 1.1154 + } while(--count > 0); 1.1155 + } 1.1156 + 1.1157 + while(pSrc<pSrcLimit) { 1.1158 + ch=*pSrc++; 1.1159 + if(ch <= 0x7f) { 1.1160 + if(pDest<pDestLimit) { 1.1161 + *pDest++ = (uint8_t)ch; 1.1162 + } else { 1.1163 + reqLength = 1; 1.1164 + break; 1.1165 + } 1.1166 + } else if(ch <= 0x7ff) { 1.1167 + if((pDestLimit - pDest) >= 2) { 1.1168 + *pDest++=(uint8_t)((ch>>6)|0xc0); 1.1169 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1170 + } else { 1.1171 + reqLength = 2; 1.1172 + break; 1.1173 + } 1.1174 + } else if(ch <= 0xd7ff || ch >= 0xe000) { 1.1175 + if((pDestLimit - pDest) >= 3) { 1.1176 + *pDest++=(uint8_t)((ch>>12)|0xe0); 1.1177 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1178 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1179 + } else { 1.1180 + reqLength = 3; 1.1181 + break; 1.1182 + } 1.1183 + } else /* ch is a surrogate */ { 1.1184 + int32_t length; 1.1185 + 1.1186 + if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1.1187 + ++pSrc; 1.1188 + ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1.1189 + } else if(subchar>=0) { 1.1190 + ch=subchar; 1.1191 + ++numSubstitutions; 1.1192 + } else { 1.1193 + /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1.1194 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1195 + return NULL; 1.1196 + } 1.1197 + 1.1198 + length = U8_LENGTH(ch); 1.1199 + if((pDestLimit - pDest) >= length) { 1.1200 + /* convert and append*/ 1.1201 + pDest=_appendUTF8(pDest, ch); 1.1202 + } else { 1.1203 + reqLength = length; 1.1204 + break; 1.1205 + } 1.1206 + } 1.1207 + } 1.1208 + while(pSrc<pSrcLimit) { 1.1209 + ch=*pSrc++; 1.1210 + if(ch<=0x7f) { 1.1211 + ++reqLength; 1.1212 + } else if(ch<=0x7ff) { 1.1213 + reqLength+=2; 1.1214 + } else if(!U16_IS_SURROGATE(ch)) { 1.1215 + reqLength+=3; 1.1216 + } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1.1217 + ++pSrc; 1.1218 + reqLength+=4; 1.1219 + } else if(subchar>=0) { 1.1220 + reqLength+=U8_LENGTH(subchar); 1.1221 + ++numSubstitutions; 1.1222 + } else { 1.1223 + /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1.1224 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1225 + return NULL; 1.1226 + } 1.1227 + } 1.1228 + } 1.1229 + 1.1230 + reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1.1231 + 1.1232 + if(pNumSubstitutions!=NULL) { 1.1233 + *pNumSubstitutions=numSubstitutions; 1.1234 + } 1.1235 + 1.1236 + if(pDestLength){ 1.1237 + *pDestLength = reqLength; 1.1238 + } 1.1239 + 1.1240 + /* Terminate the buffer */ 1.1241 + u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1.1242 + return dest; 1.1243 +} 1.1244 + 1.1245 +U_CAPI char* U_EXPORT2 1.1246 +u_strToUTF8(char *dest, 1.1247 + int32_t destCapacity, 1.1248 + int32_t *pDestLength, 1.1249 + const UChar *pSrc, 1.1250 + int32_t srcLength, 1.1251 + UErrorCode *pErrorCode){ 1.1252 + return u_strToUTF8WithSub( 1.1253 + dest, destCapacity, pDestLength, 1.1254 + pSrc, srcLength, 1.1255 + U_SENTINEL, NULL, 1.1256 + pErrorCode); 1.1257 +} 1.1258 + 1.1259 +U_CAPI UChar* U_EXPORT2 1.1260 +u_strFromJavaModifiedUTF8WithSub( 1.1261 + UChar *dest, 1.1262 + int32_t destCapacity, 1.1263 + int32_t *pDestLength, 1.1264 + const char *src, 1.1265 + int32_t srcLength, 1.1266 + UChar32 subchar, int32_t *pNumSubstitutions, 1.1267 + UErrorCode *pErrorCode) { 1.1268 + UChar *pDest = dest; 1.1269 + UChar *pDestLimit = dest+destCapacity; 1.1270 + UChar32 ch; 1.1271 + int32_t reqLength = 0; 1.1272 + const uint8_t* pSrc = (const uint8_t*) src; 1.1273 + const uint8_t *pSrcLimit; 1.1274 + int32_t count; 1.1275 + uint8_t t1, t2; /* trail bytes */ 1.1276 + int32_t numSubstitutions; 1.1277 + 1.1278 + /* args check */ 1.1279 + if(U_FAILURE(*pErrorCode)){ 1.1280 + return NULL; 1.1281 + } 1.1282 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.1283 + (dest==NULL && destCapacity!=0) || destCapacity<0 || 1.1284 + subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1.1285 + ) { 1.1286 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.1287 + return NULL; 1.1288 + } 1.1289 + 1.1290 + if(pNumSubstitutions!=NULL) { 1.1291 + *pNumSubstitutions=0; 1.1292 + } 1.1293 + numSubstitutions=0; 1.1294 + 1.1295 + if(srcLength < 0) { 1.1296 + /* 1.1297 + * Transform a NUL-terminated ASCII string. 1.1298 + * Handle non-ASCII strings with slower code. 1.1299 + */ 1.1300 + while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { 1.1301 + *pDest++=(UChar)ch; 1.1302 + ++pSrc; 1.1303 + } 1.1304 + if(ch == 0) { 1.1305 + reqLength=(int32_t)(pDest - dest); 1.1306 + if(pDestLength) { 1.1307 + *pDestLength = reqLength; 1.1308 + } 1.1309 + 1.1310 + /* Terminate the buffer */ 1.1311 + u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1.1312 + return dest; 1.1313 + } 1.1314 + srcLength = uprv_strlen((const char *)pSrc); 1.1315 + } 1.1316 + 1.1317 + /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1.1318 + pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; 1.1319 + for(;;) { 1.1320 + count = (int32_t)(pDestLimit - pDest); 1.1321 + srcLength = (int32_t)(pSrcLimit - pSrc); 1.1322 + if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { 1.1323 + /* fast ASCII loop */ 1.1324 + const uint8_t *prevSrc = pSrc; 1.1325 + int32_t delta; 1.1326 + while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { 1.1327 + *pDest++=(UChar)ch; 1.1328 + ++pSrc; 1.1329 + } 1.1330 + delta = (int32_t)(pSrc - prevSrc); 1.1331 + count -= delta; 1.1332 + srcLength -= delta; 1.1333 + } 1.1334 + /* 1.1335 + * Each iteration of the inner loop progresses by at most 3 UTF-8 1.1336 + * bytes and one UChar. 1.1337 + */ 1.1338 + srcLength /= 3; 1.1339 + if(count > srcLength) { 1.1340 + count = srcLength; /* min(remaining dest, remaining src/3) */ 1.1341 + } 1.1342 + if(count < 3) { 1.1343 + /* 1.1344 + * Too much overhead if we get near the end of the string, 1.1345 + * continue with the next loop. 1.1346 + */ 1.1347 + break; 1.1348 + } 1.1349 + do { 1.1350 + ch = *pSrc; 1.1351 + if(ch <= 0x7f){ 1.1352 + *pDest++=(UChar)ch; 1.1353 + ++pSrc; 1.1354 + } else { 1.1355 + if(ch >= 0xe0) { 1.1356 + if( /* handle U+0000..U+FFFF inline */ 1.1357 + ch <= 0xef && 1.1358 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1.1359 + (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1.1360 + ) { 1.1361 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.1362 + *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1.1363 + pSrc += 3; 1.1364 + continue; 1.1365 + } 1.1366 + } else { 1.1367 + if( /* handle U+0000..U+07FF inline */ 1.1368 + ch >= 0xc0 && 1.1369 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1.1370 + ) { 1.1371 + *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1.1372 + pSrc += 2; 1.1373 + continue; 1.1374 + } 1.1375 + } 1.1376 + 1.1377 + if(subchar < 0) { 1.1378 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1379 + return NULL; 1.1380 + } else if(subchar > 0xffff && --count == 0) { 1.1381 + /* 1.1382 + * We need to write two UChars, adjusted count for that, 1.1383 + * and ran out of space. 1.1384 + */ 1.1385 + break; 1.1386 + } else { 1.1387 + /* function call for error cases */ 1.1388 + ++pSrc; /* continue after the lead byte */ 1.1389 + utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.1390 + ++numSubstitutions; 1.1391 + if(subchar<=0xFFFF) { 1.1392 + *(pDest++)=(UChar)subchar; 1.1393 + } else { 1.1394 + *(pDest++)=U16_LEAD(subchar); 1.1395 + *(pDest++)=U16_TRAIL(subchar); 1.1396 + } 1.1397 + } 1.1398 + } 1.1399 + } while(--count > 0); 1.1400 + } 1.1401 + 1.1402 + while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1.1403 + ch = *pSrc; 1.1404 + if(ch <= 0x7f){ 1.1405 + *pDest++=(UChar)ch; 1.1406 + ++pSrc; 1.1407 + } else { 1.1408 + if(ch >= 0xe0) { 1.1409 + if( /* handle U+0000..U+FFFF inline */ 1.1410 + ch <= 0xef && 1.1411 + ((pSrcLimit - pSrc) >= 3) && 1.1412 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1.1413 + (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1.1414 + ) { 1.1415 + /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1.1416 + *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1.1417 + pSrc += 3; 1.1418 + continue; 1.1419 + } 1.1420 + } else { 1.1421 + if( /* handle U+0000..U+07FF inline */ 1.1422 + ch >= 0xc0 && 1.1423 + ((pSrcLimit - pSrc) >= 2) && 1.1424 + (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1.1425 + ) { 1.1426 + *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1.1427 + pSrc += 2; 1.1428 + continue; 1.1429 + } 1.1430 + } 1.1431 + 1.1432 + if(subchar < 0) { 1.1433 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1434 + return NULL; 1.1435 + } else { 1.1436 + /* function call for error cases */ 1.1437 + ++pSrc; /* continue after the lead byte */ 1.1438 + utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.1439 + ++numSubstitutions; 1.1440 + if(subchar<=0xFFFF) { 1.1441 + *(pDest++)=(UChar)subchar; 1.1442 + } else { 1.1443 + *(pDest++)=U16_LEAD(subchar); 1.1444 + if(pDest<pDestLimit) { 1.1445 + *(pDest++)=U16_TRAIL(subchar); 1.1446 + } else { 1.1447 + reqLength++; 1.1448 + break; 1.1449 + } 1.1450 + } 1.1451 + } 1.1452 + } 1.1453 + } 1.1454 + 1.1455 + /* do not fill the dest buffer just count the UChars needed */ 1.1456 + while(pSrc < pSrcLimit){ 1.1457 + ch = *pSrc; 1.1458 + if(ch <= 0x7f) { 1.1459 + reqLength++; 1.1460 + ++pSrc; 1.1461 + } else { 1.1462 + if(ch >= 0xe0) { 1.1463 + if( /* handle U+0000..U+FFFF inline */ 1.1464 + ch <= 0xef && 1.1465 + ((pSrcLimit - pSrc) >= 3) && 1.1466 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1.1467 + (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1.1468 + ) { 1.1469 + reqLength++; 1.1470 + pSrc += 3; 1.1471 + continue; 1.1472 + } 1.1473 + } else { 1.1474 + if( /* handle U+0000..U+07FF inline */ 1.1475 + ch >= 0xc0 && 1.1476 + ((pSrcLimit - pSrc) >= 2) && 1.1477 + (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1.1478 + ) { 1.1479 + reqLength++; 1.1480 + pSrc += 2; 1.1481 + continue; 1.1482 + } 1.1483 + } 1.1484 + 1.1485 + if(subchar < 0) { 1.1486 + *pErrorCode = U_INVALID_CHAR_FOUND; 1.1487 + return NULL; 1.1488 + } else { 1.1489 + /* function call for error cases */ 1.1490 + ++pSrc; /* continue after the lead byte */ 1.1491 + utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1.1492 + ++numSubstitutions; 1.1493 + reqLength+=U16_LENGTH(ch); 1.1494 + } 1.1495 + } 1.1496 + } 1.1497 + 1.1498 + if(pNumSubstitutions!=NULL) { 1.1499 + *pNumSubstitutions=numSubstitutions; 1.1500 + } 1.1501 + 1.1502 + reqLength+=(int32_t)(pDest - dest); 1.1503 + if(pDestLength) { 1.1504 + *pDestLength = reqLength; 1.1505 + } 1.1506 + 1.1507 + /* Terminate the buffer */ 1.1508 + u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1.1509 + return dest; 1.1510 +} 1.1511 + 1.1512 +U_CAPI char* U_EXPORT2 1.1513 +u_strToJavaModifiedUTF8( 1.1514 + char *dest, 1.1515 + int32_t destCapacity, 1.1516 + int32_t *pDestLength, 1.1517 + const UChar *src, 1.1518 + int32_t srcLength, 1.1519 + UErrorCode *pErrorCode) { 1.1520 + int32_t reqLength=0; 1.1521 + uint32_t ch=0; 1.1522 + uint8_t *pDest = (uint8_t *)dest; 1.1523 + uint8_t *pDestLimit = pDest + destCapacity; 1.1524 + const UChar *pSrcLimit; 1.1525 + int32_t count; 1.1526 + 1.1527 + /* args check */ 1.1528 + if(U_FAILURE(*pErrorCode)){ 1.1529 + return NULL; 1.1530 + } 1.1531 + if( (src==NULL && srcLength!=0) || srcLength < -1 || 1.1532 + (dest==NULL && destCapacity!=0) || destCapacity<0 1.1533 + ) { 1.1534 + *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1.1535 + return NULL; 1.1536 + } 1.1537 + 1.1538 + if(srcLength==-1) { 1.1539 + /* Convert NUL-terminated ASCII, then find the string length. */ 1.1540 + while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1.1541 + *pDest++ = (uint8_t)ch; 1.1542 + ++src; 1.1543 + } 1.1544 + if(ch == 0) { 1.1545 + reqLength=(int32_t)(pDest - (uint8_t *)dest); 1.1546 + if(pDestLength) { 1.1547 + *pDestLength = reqLength; 1.1548 + } 1.1549 + 1.1550 + /* Terminate the buffer */ 1.1551 + u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1.1552 + return dest; 1.1553 + } 1.1554 + srcLength = u_strlen(src); 1.1555 + } 1.1556 + 1.1557 + /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1.1558 + pSrcLimit = (src!=NULL)?(src+srcLength):NULL; 1.1559 + for(;;) { 1.1560 + count = (int32_t)(pDestLimit - pDest); 1.1561 + srcLength = (int32_t)(pSrcLimit - src); 1.1562 + if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1.1563 + /* fast ASCII loop */ 1.1564 + const UChar *prevSrc = src; 1.1565 + int32_t delta; 1.1566 + while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1.1567 + *pDest++=(uint8_t)ch; 1.1568 + ++src; 1.1569 + } 1.1570 + delta = (int32_t)(src - prevSrc); 1.1571 + count -= delta; 1.1572 + srcLength -= delta; 1.1573 + } 1.1574 + /* 1.1575 + * Each iteration of the inner loop progresses by at most 3 UTF-8 1.1576 + * bytes and one UChar. 1.1577 + */ 1.1578 + count /= 3; 1.1579 + if(count > srcLength) { 1.1580 + count = srcLength; /* min(remaining dest/3, remaining src) */ 1.1581 + } 1.1582 + if(count < 3) { 1.1583 + /* 1.1584 + * Too much overhead if we get near the end of the string, 1.1585 + * continue with the next loop. 1.1586 + */ 1.1587 + break; 1.1588 + } 1.1589 + do { 1.1590 + ch=*src++; 1.1591 + if(ch <= 0x7f && ch != 0) { 1.1592 + *pDest++ = (uint8_t)ch; 1.1593 + } else if(ch <= 0x7ff) { 1.1594 + *pDest++=(uint8_t)((ch>>6)|0xc0); 1.1595 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1596 + } else { 1.1597 + *pDest++=(uint8_t)((ch>>12)|0xe0); 1.1598 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1599 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1600 + } 1.1601 + } while(--count > 0); 1.1602 + } 1.1603 + 1.1604 + while(src<pSrcLimit) { 1.1605 + ch=*src++; 1.1606 + if(ch <= 0x7f && ch != 0) { 1.1607 + if(pDest<pDestLimit) { 1.1608 + *pDest++ = (uint8_t)ch; 1.1609 + } else { 1.1610 + reqLength = 1; 1.1611 + break; 1.1612 + } 1.1613 + } else if(ch <= 0x7ff) { 1.1614 + if((pDestLimit - pDest) >= 2) { 1.1615 + *pDest++=(uint8_t)((ch>>6)|0xc0); 1.1616 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1617 + } else { 1.1618 + reqLength = 2; 1.1619 + break; 1.1620 + } 1.1621 + } else { 1.1622 + if((pDestLimit - pDest) >= 3) { 1.1623 + *pDest++=(uint8_t)((ch>>12)|0xe0); 1.1624 + *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1.1625 + *pDest++=(uint8_t)((ch&0x3f)|0x80); 1.1626 + } else { 1.1627 + reqLength = 3; 1.1628 + break; 1.1629 + } 1.1630 + } 1.1631 + } 1.1632 + while(src<pSrcLimit) { 1.1633 + ch=*src++; 1.1634 + if(ch <= 0x7f && ch != 0) { 1.1635 + ++reqLength; 1.1636 + } else if(ch<=0x7ff) { 1.1637 + reqLength+=2; 1.1638 + } else { 1.1639 + reqLength+=3; 1.1640 + } 1.1641 + } 1.1642 + 1.1643 + reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1.1644 + if(pDestLength){ 1.1645 + *pDestLength = reqLength; 1.1646 + } 1.1647 + 1.1648 + /* Terminate the buffer */ 1.1649 + u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1.1650 + return dest; 1.1651 +}