intl/icu/source/common/ustrtrns.cpp

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

michael@0 1 /*
michael@0 2 ******************************************************************************
michael@0 3 *
michael@0 4 * Copyright (C) 2001-2013, International Business Machines
michael@0 5 * Corporation and others. All Rights Reserved.
michael@0 6 *
michael@0 7 ******************************************************************************
michael@0 8 *
michael@0 9 * File ustrtrns.cpp
michael@0 10 *
michael@0 11 * Modification History:
michael@0 12 *
michael@0 13 * Date Name Description
michael@0 14 * 9/10/2001 Ram Creation.
michael@0 15 ******************************************************************************
michael@0 16 */
michael@0 17
michael@0 18 /*******************************************************************************
michael@0 19 *
michael@0 20 * u_strTo* and u_strFrom* APIs
michael@0 21 * WCS functions moved to ustr_wcs.c for better modularization
michael@0 22 *
michael@0 23 *******************************************************************************
michael@0 24 */
michael@0 25
michael@0 26
michael@0 27 #include "unicode/putil.h"
michael@0 28 #include "unicode/ustring.h"
michael@0 29 #include "unicode/utf.h"
michael@0 30 #include "unicode/utf8.h"
michael@0 31 #include "unicode/utf16.h"
michael@0 32 #include "cstring.h"
michael@0 33 #include "cmemory.h"
michael@0 34 #include "ustr_imp.h"
michael@0 35 #include "uassert.h"
michael@0 36
/*
 * Number of elements in a true array (not a pointer), as an int32_t.
 * The whole expansion is parenthesized so that the macro behaves like a
 * single primary expression in every context (for example as the operand
 * of sizeof or of a postfix operator).
 */
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
michael@0 38
michael@0 39 U_CAPI UChar* U_EXPORT2
michael@0 40 u_strFromUTF32WithSub(UChar *dest,
michael@0 41 int32_t destCapacity,
michael@0 42 int32_t *pDestLength,
michael@0 43 const UChar32 *src,
michael@0 44 int32_t srcLength,
michael@0 45 UChar32 subchar, int32_t *pNumSubstitutions,
michael@0 46 UErrorCode *pErrorCode) {
michael@0 47 const UChar32 *srcLimit;
michael@0 48 UChar32 ch;
michael@0 49 UChar *destLimit;
michael@0 50 UChar *pDest;
michael@0 51 int32_t reqLength;
michael@0 52 int32_t numSubstitutions;
michael@0 53
michael@0 54 /* args check */
michael@0 55 if(U_FAILURE(*pErrorCode)){
michael@0 56 return NULL;
michael@0 57 }
michael@0 58 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 59 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
michael@0 60 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
michael@0 61 ) {
michael@0 62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 63 return NULL;
michael@0 64 }
michael@0 65
michael@0 66 if(pNumSubstitutions != NULL) {
michael@0 67 *pNumSubstitutions = 0;
michael@0 68 }
michael@0 69
michael@0 70 pDest = dest;
michael@0 71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
michael@0 72 reqLength = 0;
michael@0 73 numSubstitutions = 0;
michael@0 74
michael@0 75 if(srcLength < 0) {
michael@0 76 /* simple loop for conversion of a NUL-terminated BMP string */
michael@0 77 while((ch=*src) != 0 &&
michael@0 78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
michael@0 79 ++src;
michael@0 80 if(pDest < destLimit) {
michael@0 81 *pDest++ = (UChar)ch;
michael@0 82 } else {
michael@0 83 ++reqLength;
michael@0 84 }
michael@0 85 }
michael@0 86 srcLimit = src;
michael@0 87 if(ch != 0) {
michael@0 88 /* "complicated" case, find the end of the remaining string */
michael@0 89 while(*++srcLimit != 0) {}
michael@0 90 }
michael@0 91 } else {
michael@0 92 srcLimit = (src!=NULL)?(src + srcLength):NULL;
michael@0 93 }
michael@0 94
michael@0 95 /* convert with length */
michael@0 96 while(src < srcLimit) {
michael@0 97 ch = *src++;
michael@0 98 do {
michael@0 99 /* usually "loops" once; twice only for writing subchar */
michael@0 100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
michael@0 101 if(pDest < destLimit) {
michael@0 102 *pDest++ = (UChar)ch;
michael@0 103 } else {
michael@0 104 ++reqLength;
michael@0 105 }
michael@0 106 break;
michael@0 107 } else if(0x10000 <= ch && ch <= 0x10ffff) {
michael@0 108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
michael@0 109 *pDest++ = U16_LEAD(ch);
michael@0 110 *pDest++ = U16_TRAIL(ch);
michael@0 111 } else {
michael@0 112 reqLength += 2;
michael@0 113 }
michael@0 114 break;
michael@0 115 } else if((ch = subchar) < 0) {
michael@0 116 /* surrogate code point, or not a Unicode code point at all */
michael@0 117 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 118 return NULL;
michael@0 119 } else {
michael@0 120 ++numSubstitutions;
michael@0 121 }
michael@0 122 } while(TRUE);
michael@0 123 }
michael@0 124
michael@0 125 reqLength += (int32_t)(pDest - dest);
michael@0 126 if(pDestLength) {
michael@0 127 *pDestLength = reqLength;
michael@0 128 }
michael@0 129 if(pNumSubstitutions != NULL) {
michael@0 130 *pNumSubstitutions = numSubstitutions;
michael@0 131 }
michael@0 132
michael@0 133 /* Terminate the buffer */
michael@0 134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 135
michael@0 136 return dest;
michael@0 137 }
michael@0 138
michael@0 139 U_CAPI UChar* U_EXPORT2
michael@0 140 u_strFromUTF32(UChar *dest,
michael@0 141 int32_t destCapacity,
michael@0 142 int32_t *pDestLength,
michael@0 143 const UChar32 *src,
michael@0 144 int32_t srcLength,
michael@0 145 UErrorCode *pErrorCode) {
michael@0 146 return u_strFromUTF32WithSub(
michael@0 147 dest, destCapacity, pDestLength,
michael@0 148 src, srcLength,
michael@0 149 U_SENTINEL, NULL,
michael@0 150 pErrorCode);
michael@0 151 }
michael@0 152
michael@0 153 U_CAPI UChar32* U_EXPORT2
michael@0 154 u_strToUTF32WithSub(UChar32 *dest,
michael@0 155 int32_t destCapacity,
michael@0 156 int32_t *pDestLength,
michael@0 157 const UChar *src,
michael@0 158 int32_t srcLength,
michael@0 159 UChar32 subchar, int32_t *pNumSubstitutions,
michael@0 160 UErrorCode *pErrorCode) {
michael@0 161 const UChar *srcLimit;
michael@0 162 UChar32 ch;
michael@0 163 UChar ch2;
michael@0 164 UChar32 *destLimit;
michael@0 165 UChar32 *pDest;
michael@0 166 int32_t reqLength;
michael@0 167 int32_t numSubstitutions;
michael@0 168
michael@0 169 /* args check */
michael@0 170 if(U_FAILURE(*pErrorCode)){
michael@0 171 return NULL;
michael@0 172 }
michael@0 173 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 174 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
michael@0 175 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
michael@0 176 ) {
michael@0 177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 178 return NULL;
michael@0 179 }
michael@0 180
michael@0 181 if(pNumSubstitutions != NULL) {
michael@0 182 *pNumSubstitutions = 0;
michael@0 183 }
michael@0 184
michael@0 185 pDest = dest;
michael@0 186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
michael@0 187 reqLength = 0;
michael@0 188 numSubstitutions = 0;
michael@0 189
michael@0 190 if(srcLength < 0) {
michael@0 191 /* simple loop for conversion of a NUL-terminated BMP string */
michael@0 192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
michael@0 193 ++src;
michael@0 194 if(pDest < destLimit) {
michael@0 195 *pDest++ = ch;
michael@0 196 } else {
michael@0 197 ++reqLength;
michael@0 198 }
michael@0 199 }
michael@0 200 srcLimit = src;
michael@0 201 if(ch != 0) {
michael@0 202 /* "complicated" case, find the end of the remaining string */
michael@0 203 while(*++srcLimit != 0) {}
michael@0 204 }
michael@0 205 } else {
michael@0 206 srcLimit = (src!=NULL)?(src + srcLength):NULL;
michael@0 207 }
michael@0 208
michael@0 209 /* convert with length */
michael@0 210 while(src < srcLimit) {
michael@0 211 ch = *src++;
michael@0 212 if(!U16_IS_SURROGATE(ch)) {
michael@0 213 /* write or count ch below */
michael@0 214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
michael@0 215 ++src;
michael@0 216 ch = U16_GET_SUPPLEMENTARY(ch, ch2);
michael@0 217 } else if((ch = subchar) < 0) {
michael@0 218 /* unpaired surrogate */
michael@0 219 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 220 return NULL;
michael@0 221 } else {
michael@0 222 ++numSubstitutions;
michael@0 223 }
michael@0 224 if(pDest < destLimit) {
michael@0 225 *pDest++ = ch;
michael@0 226 } else {
michael@0 227 ++reqLength;
michael@0 228 }
michael@0 229 }
michael@0 230
michael@0 231 reqLength += (int32_t)(pDest - dest);
michael@0 232 if(pDestLength) {
michael@0 233 *pDestLength = reqLength;
michael@0 234 }
michael@0 235 if(pNumSubstitutions != NULL) {
michael@0 236 *pNumSubstitutions = numSubstitutions;
michael@0 237 }
michael@0 238
michael@0 239 /* Terminate the buffer */
michael@0 240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
michael@0 241
michael@0 242 return dest;
michael@0 243 }
michael@0 244
michael@0 245 U_CAPI UChar32* U_EXPORT2
michael@0 246 u_strToUTF32(UChar32 *dest,
michael@0 247 int32_t destCapacity,
michael@0 248 int32_t *pDestLength,
michael@0 249 const UChar *src,
michael@0 250 int32_t srcLength,
michael@0 251 UErrorCode *pErrorCode) {
michael@0 252 return u_strToUTF32WithSub(
michael@0 253 dest, destCapacity, pDestLength,
michael@0 254 src, srcLength,
michael@0 255 U_SENTINEL, NULL,
michael@0 256 pErrorCode);
michael@0 257 }
michael@0 258
michael@0 259 /* for utf8_nextCharSafeBodyTerminated() */
michael@0 260 static const UChar32
michael@0 261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
michael@0 262
michael@0 263 /*
michael@0 264 * Version of utf8_nextCharSafeBody() with the following differences:
michael@0 265 * - checks for NUL termination instead of length
michael@0 266 * - works with pointers instead of indexes
michael@0 267 * - always strict (strict==-1)
michael@0 268 *
michael@0 269 * *ps points to after the lead byte and will be moved to after the last trail byte.
michael@0 270 * c is the lead byte.
michael@0 271 * @return the code point, or U_SENTINEL
michael@0 272 */
/*
 * Additional notes:
 * - No explicit length is needed: a NUL byte minus 0x80 yields 0x80, which
 *   fails the "<= 0x3f" trail byte test, so decoding stops at the string
 *   terminator and never reads beyond it.
 * - On error (bad trail byte, value below the minimum for its length,
 *   surrogate, or more than 3 trail bytes) *ps is advanced past at most
 *   "count" bytes that look like trail bytes and U_SENTINEL is returned.
 * - For count==0 the function returns U_SENTINEL without modifying *ps.
 */
michael@0 273 static UChar32
michael@0 274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
michael@0 275 const uint8_t *s=*ps;
michael@0 276 uint8_t trail, illegal=0;
/* number of trail bytes announced by the lead byte */
michael@0 277 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
michael@0 278 U_ASSERT(count<6);
/* presumably strips the length marker bits from the lead byte, leaving only its payload bits - see utf8.h */
michael@0 279 U8_MASK_LEAD_BYTE((c), count);
michael@0 280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
michael@0 281 switch(count) {
michael@0 282 /* each branch falls through to the next one */
michael@0 283 case 5:
michael@0 284 case 4:
michael@0 285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
michael@0 286 illegal=1;
michael@0 287 break;
michael@0 288 case 3:
/* subtracting 0x80 maps valid trail bytes 0x80..0xbf to 0..0x3f; every other byte becomes >0x3f */
michael@0 289 trail=(uint8_t)(*s++ - 0x80);
michael@0 290 c=(c<<6)|trail;
/* two more 6-bit groups follow, so c>=0x110 here would end up above 0x10ffff */
michael@0 291 if(trail>0x3f || c>=0x110) {
michael@0 292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */
michael@0 293 illegal=1;
michael@0 294 break;
michael@0 295 }
michael@0 296 case 2: /*fall through*/
michael@0 297 trail=(uint8_t)(*s++ - 0x80);
michael@0 298 if(trail>0x3f) {
michael@0 299 /* not a trail byte */
michael@0 300 illegal=1;
michael@0 301 break;
michael@0 302 }
michael@0 303 c=(c<<6)|trail;
michael@0 304 case 1: /*fall through*/
michael@0 305 trail=(uint8_t)(*s++ - 0x80);
michael@0 306 if(trail>0x3f) {
michael@0 307 /* not a trail byte */
michael@0 308 illegal=1;
michael@0 309 }
/* c is meaningless if illegal was just set; it is replaced with U_SENTINEL below */
michael@0 310 c=(c<<6)|trail;
michael@0 311 break;
michael@0 312 case 0:
michael@0 313 return U_SENTINEL;
michael@0 314 /* no default branch to optimize switch() - all values are covered */
michael@0 315 }
michael@0 316
michael@0 317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
michael@0 318 /* illegal is also set if count>=4 */
/* "illegal" is tested first, so utf8_minLegal[] is never indexed with count>=4 */
michael@0 319 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
michael@0 320 /* error handling */
michael@0 321 /* don't go beyond this sequence */
/* rescan from just after the lead byte and skip at most count trail bytes */
michael@0 322 s=*ps;
michael@0 323 while(count>0 && U8_IS_TRAIL(*s)) {
michael@0 324 ++s;
michael@0 325 --count;
michael@0 326 }
michael@0 327 c=U_SENTINEL;
michael@0 328 }
michael@0 329 *ps=s;
michael@0 330 return c;
michael@0 331 }
michael@0 332
michael@0 333 /*
michael@0 334 * Version of utf8_nextCharSafeBody() with the following differences:
michael@0 335 * - works with pointers instead of indexes
michael@0 336 * - always strict (strict==-1)
michael@0 337 *
michael@0 338 * *ps points to after the lead byte and will be moved to after the last trail byte.
michael@0 339 * c is the lead byte.
michael@0 340 * @return the code point, or U_SENTINEL
michael@0 341 */
/*
 * Additional notes:
 * - limit points just past the last readable input byte; if fewer than
 *   "count" bytes remain before it, the sequence is treated as illegal
 *   without reading any trail byte.
 * - On error *ps is advanced past at most "count" bytes that look like
 *   trail bytes (never beyond limit) and U_SENTINEL is returned.
 * - For count==0 (with enough input left) the function returns U_SENTINEL
 *   without modifying *ps.
 */
michael@0 342 static UChar32
michael@0 343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
michael@0 344 const uint8_t *s=*ps;
michael@0 345 uint8_t trail, illegal=0;
/* number of trail bytes announced by the lead byte */
michael@0 346 uint8_t count=U8_COUNT_TRAIL_BYTES(c);
/* only decode if all announced trail bytes are available before limit */
michael@0 347 if((limit-s)>=count) {
michael@0 348 U8_MASK_LEAD_BYTE((c), count);
michael@0 349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
michael@0 350 switch(count) {
michael@0 351 /* each branch falls through to the next one */
michael@0 352 case 5:
michael@0 353 case 4:
michael@0 354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
michael@0 355 illegal=1;
michael@0 356 break;
michael@0 357 case 3:
michael@0 358 trail=*s++;
michael@0 359 c=(c<<6)|(trail&0x3f);
/* two more 6-bit groups follow, so c>=0x110 here would end up above 0x10ffff */
michael@0 360 if(c<0x110) {
/* (trail&0xc0)^0x80 is zero only when the top two bits of trail are 10 */
michael@0 361 illegal|=(trail&0xc0)^0x80;
michael@0 362 } else {
michael@0 363 /* code point>0x10ffff, outside Unicode */
michael@0 364 illegal=1;
michael@0 365 break;
michael@0 366 }
michael@0 367 case 2: /*fall through*/
michael@0 368 trail=*s++;
michael@0 369 c=(c<<6)|(trail&0x3f);
michael@0 370 illegal|=(trail&0xc0)^0x80;
michael@0 371 case 1: /*fall through*/
michael@0 372 trail=*s++;
michael@0 373 c=(c<<6)|(trail&0x3f);
michael@0 374 illegal|=(trail&0xc0)^0x80;
michael@0 375 break;
michael@0 376 case 0:
michael@0 377 return U_SENTINEL;
michael@0 378 /* no default branch to optimize switch() - all values are covered */
michael@0 379 }
michael@0 380 } else {
michael@0 381 illegal=1; /* too few bytes left */
michael@0 382 }
michael@0 383
michael@0 384 /* correct sequence - all trail bytes have (b7..b6)==(10)? */
michael@0 385 /* illegal is also set if count>=4 */
michael@0 386 U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
/* "illegal" is tested first, so utf8_minLegal[] is only indexed when the assertion above holds */
michael@0 387 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
michael@0 388 /* error handling */
michael@0 389 /* don't go beyond this sequence */
/* rescan from just after the lead byte and skip at most count trail bytes, staying below limit */
michael@0 390 s=*ps;
michael@0 391 while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
michael@0 392 ++s;
michael@0 393 --count;
michael@0 394 }
michael@0 395 c=U_SENTINEL;
michael@0 396 }
michael@0 397 *ps=s;
michael@0 398 return c;
michael@0 399 }
michael@0 400
michael@0 401 U_CAPI UChar* U_EXPORT2
michael@0 402 u_strFromUTF8WithSub(UChar *dest,
michael@0 403 int32_t destCapacity,
michael@0 404 int32_t *pDestLength,
michael@0 405 const char* src,
michael@0 406 int32_t srcLength,
michael@0 407 UChar32 subchar, int32_t *pNumSubstitutions,
michael@0 408 UErrorCode *pErrorCode){
michael@0 409 UChar *pDest = dest;
michael@0 410 UChar *pDestLimit = dest+destCapacity;
michael@0 411 UChar32 ch;
michael@0 412 int32_t reqLength = 0;
michael@0 413 const uint8_t* pSrc = (const uint8_t*) src;
michael@0 414 uint8_t t1, t2; /* trail bytes */
michael@0 415 int32_t numSubstitutions;
michael@0 416
michael@0 417 /* args check */
michael@0 418 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
michael@0 419 return NULL;
michael@0 420 }
michael@0 421
michael@0 422 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 423 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
michael@0 424 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
michael@0 425 ) {
michael@0 426 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 427 return NULL;
michael@0 428 }
michael@0 429
michael@0 430 if(pNumSubstitutions!=NULL) {
michael@0 431 *pNumSubstitutions=0;
michael@0 432 }
michael@0 433 numSubstitutions=0;
michael@0 434
michael@0 435 /*
michael@0 436 * Inline processing of UTF-8 byte sequences:
michael@0 437 *
michael@0 438 * Byte sequences for the most common characters are handled inline in
michael@0 439 * the conversion loops. In order to reduce the path lengths for those
michael@0 440 * characters, the tests are arranged in a kind of binary search.
michael@0 441 * ASCII (<=0x7f) is checked first, followed by the dividing point
michael@0 442 * between 2- and 3-byte sequences (0xe0).
michael@0 443 * The 3-byte branch is tested first to speed up CJK text.
michael@0 444 * The compiler should combine the subtractions for the two tests for 0xe0.
michael@0 445 * Each branch then tests for the other end of its range.
michael@0 446 */
michael@0 447
michael@0 448 if(srcLength < 0){
michael@0 449 /*
michael@0 450 * Transform a NUL-terminated string.
michael@0 451 * The code explicitly checks for NULs only in the lead byte position.
michael@0 452 * A NUL byte in the trail byte position fails the trail byte range check anyway.
michael@0 453 */
michael@0 454 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
michael@0 455 if(ch <= 0x7f){
michael@0 456 *pDest++=(UChar)ch;
michael@0 457 ++pSrc;
michael@0 458 } else {
michael@0 459 if(ch > 0xe0) {
michael@0 460 if( /* handle U+1000..U+CFFF inline */
michael@0 461 ch <= 0xec &&
michael@0 462 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
michael@0 463 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
michael@0 464 ) {
michael@0 465 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 466 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
michael@0 467 pSrc += 3;
michael@0 468 continue;
michael@0 469 }
michael@0 470 } else if(ch < 0xe0) {
michael@0 471 if( /* handle U+0080..U+07FF inline */
michael@0 472 ch >= 0xc2 &&
michael@0 473 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
michael@0 474 ) {
michael@0 475 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
michael@0 476 pSrc += 2;
michael@0 477 continue;
michael@0 478 }
michael@0 479 }
michael@0 480
michael@0 481 /* function call for "complicated" and error cases */
michael@0 482 ++pSrc; /* continue after the lead byte */
michael@0 483 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
michael@0 484 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
michael@0 485 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 486 return NULL;
michael@0 487 } else if(ch<=0xFFFF) {
michael@0 488 *(pDest++)=(UChar)ch;
michael@0 489 } else {
michael@0 490 *(pDest++)=U16_LEAD(ch);
michael@0 491 if(pDest<pDestLimit) {
michael@0 492 *(pDest++)=U16_TRAIL(ch);
michael@0 493 } else {
michael@0 494 reqLength++;
michael@0 495 break;
michael@0 496 }
michael@0 497 }
michael@0 498 }
michael@0 499 }
michael@0 500
michael@0 501 /* Pre-flight the rest of the string. */
michael@0 502 while((ch = *pSrc) != 0) {
michael@0 503 if(ch <= 0x7f){
michael@0 504 ++reqLength;
michael@0 505 ++pSrc;
michael@0 506 } else {
michael@0 507 if(ch > 0xe0) {
michael@0 508 if( /* handle U+1000..U+CFFF inline */
michael@0 509 ch <= 0xec &&
michael@0 510 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
michael@0 511 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
michael@0 512 ) {
michael@0 513 ++reqLength;
michael@0 514 pSrc += 3;
michael@0 515 continue;
michael@0 516 }
michael@0 517 } else if(ch < 0xe0) {
michael@0 518 if( /* handle U+0080..U+07FF inline */
michael@0 519 ch >= 0xc2 &&
michael@0 520 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
michael@0 521 ) {
michael@0 522 ++reqLength;
michael@0 523 pSrc += 2;
michael@0 524 continue;
michael@0 525 }
michael@0 526 }
michael@0 527
michael@0 528 /* function call for "complicated" and error cases */
michael@0 529 ++pSrc; /* continue after the lead byte */
michael@0 530 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
michael@0 531 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
michael@0 532 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 533 return NULL;
michael@0 534 }
michael@0 535 reqLength += U16_LENGTH(ch);
michael@0 536 }
michael@0 537 }
michael@0 538 } else /* srcLength >= 0 */ {
michael@0 539 const uint8_t *pSrcLimit = pSrc + srcLength;
michael@0 540 int32_t count;
michael@0 541
michael@0 542 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
michael@0 543 for(;;) {
michael@0 544 /*
michael@0 545 * Each iteration of the inner loop progresses by at most 3 UTF-8
michael@0 546 * bytes and one UChar, for most characters.
michael@0 547 * For supplementary code points (4 & 2), which are rare,
michael@0 548 * there is an additional adjustment.
michael@0 549 */
michael@0 550 count = (int32_t)(pDestLimit - pDest);
michael@0 551 srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
michael@0 552 if(count > srcLength) {
michael@0 553 count = srcLength; /* min(remaining dest, remaining src/3) */
michael@0 554 }
michael@0 555 if(count < 3) {
michael@0 556 /*
michael@0 557 * Too much overhead if we get near the end of the string,
michael@0 558 * continue with the next loop.
michael@0 559 */
michael@0 560 break;
michael@0 561 }
michael@0 562
michael@0 563 do {
michael@0 564 ch = *pSrc;
michael@0 565 if(ch <= 0x7f){
michael@0 566 *pDest++=(UChar)ch;
michael@0 567 ++pSrc;
michael@0 568 } else {
michael@0 569 if(ch > 0xe0) {
michael@0 570 if( /* handle U+1000..U+CFFF inline */
michael@0 571 ch <= 0xec &&
michael@0 572 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
michael@0 573 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
michael@0 574 ) {
michael@0 575 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 576 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
michael@0 577 pSrc += 3;
michael@0 578 continue;
michael@0 579 }
michael@0 580 } else if(ch < 0xe0) {
michael@0 581 if( /* handle U+0080..U+07FF inline */
michael@0 582 ch >= 0xc2 &&
michael@0 583 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
michael@0 584 ) {
michael@0 585 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
michael@0 586 pSrc += 2;
michael@0 587 continue;
michael@0 588 }
michael@0 589 }
michael@0 590
michael@0 591 if(ch >= 0xf0 || subchar > 0xffff) {
michael@0 592 /*
michael@0 593 * We may read up to six bytes and write up to two UChars,
michael@0 594 * which we didn't account for with computing count,
michael@0 595 * so we adjust it here.
michael@0 596 */
michael@0 597 if(--count == 0) {
michael@0 598 break;
michael@0 599 }
michael@0 600 }
michael@0 601
michael@0 602 /* function call for "complicated" and error cases */
michael@0 603 ++pSrc; /* continue after the lead byte */
michael@0 604 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 605 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
michael@0 606 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 607 return NULL;
michael@0 608 }else if(ch<=0xFFFF){
michael@0 609 *(pDest++)=(UChar)ch;
michael@0 610 }else{
michael@0 611 *(pDest++)=U16_LEAD(ch);
michael@0 612 *(pDest++)=U16_TRAIL(ch);
michael@0 613 }
michael@0 614 }
michael@0 615 } while(--count > 0);
michael@0 616 }
michael@0 617
michael@0 618 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
michael@0 619 ch = *pSrc;
michael@0 620 if(ch <= 0x7f){
michael@0 621 *pDest++=(UChar)ch;
michael@0 622 ++pSrc;
michael@0 623 } else {
michael@0 624 if(ch > 0xe0) {
michael@0 625 if( /* handle U+1000..U+CFFF inline */
michael@0 626 ch <= 0xec &&
michael@0 627 ((pSrcLimit - pSrc) >= 3) &&
michael@0 628 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
michael@0 629 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
michael@0 630 ) {
michael@0 631 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 632 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
michael@0 633 pSrc += 3;
michael@0 634 continue;
michael@0 635 }
michael@0 636 } else if(ch < 0xe0) {
michael@0 637 if( /* handle U+0080..U+07FF inline */
michael@0 638 ch >= 0xc2 &&
michael@0 639 ((pSrcLimit - pSrc) >= 2) &&
michael@0 640 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
michael@0 641 ) {
michael@0 642 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
michael@0 643 pSrc += 2;
michael@0 644 continue;
michael@0 645 }
michael@0 646 }
michael@0 647
michael@0 648 /* function call for "complicated" and error cases */
michael@0 649 ++pSrc; /* continue after the lead byte */
michael@0 650 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 651 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
michael@0 652 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 653 return NULL;
michael@0 654 }else if(ch<=0xFFFF){
michael@0 655 *(pDest++)=(UChar)ch;
michael@0 656 }else{
michael@0 657 *(pDest++)=U16_LEAD(ch);
michael@0 658 if(pDest<pDestLimit){
michael@0 659 *(pDest++)=U16_TRAIL(ch);
michael@0 660 }else{
michael@0 661 reqLength++;
michael@0 662 break;
michael@0 663 }
michael@0 664 }
michael@0 665 }
michael@0 666 }
michael@0 667 /* do not fill the dest buffer just count the UChars needed */
michael@0 668 while(pSrc < pSrcLimit){
michael@0 669 ch = *pSrc;
michael@0 670 if(ch <= 0x7f){
michael@0 671 reqLength++;
michael@0 672 ++pSrc;
michael@0 673 } else {
michael@0 674 if(ch > 0xe0) {
michael@0 675 if( /* handle U+1000..U+CFFF inline */
michael@0 676 ch <= 0xec &&
michael@0 677 ((pSrcLimit - pSrc) >= 3) &&
michael@0 678 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
michael@0 679 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
michael@0 680 ) {
michael@0 681 reqLength++;
michael@0 682 pSrc += 3;
michael@0 683 continue;
michael@0 684 }
michael@0 685 } else if(ch < 0xe0) {
michael@0 686 if( /* handle U+0080..U+07FF inline */
michael@0 687 ch >= 0xc2 &&
michael@0 688 ((pSrcLimit - pSrc) >= 2) &&
michael@0 689 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
michael@0 690 ) {
michael@0 691 reqLength++;
michael@0 692 pSrc += 2;
michael@0 693 continue;
michael@0 694 }
michael@0 695 }
michael@0 696
michael@0 697 /* function call for "complicated" and error cases */
michael@0 698 ++pSrc; /* continue after the lead byte */
michael@0 699 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 700 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
michael@0 701 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 702 return NULL;
michael@0 703 }
michael@0 704 reqLength+=U16_LENGTH(ch);
michael@0 705 }
michael@0 706 }
michael@0 707 }
michael@0 708
michael@0 709 reqLength+=(int32_t)(pDest - dest);
michael@0 710
michael@0 711 if(pNumSubstitutions!=NULL) {
michael@0 712 *pNumSubstitutions=numSubstitutions;
michael@0 713 }
michael@0 714
michael@0 715 if(pDestLength){
michael@0 716 *pDestLength = reqLength;
michael@0 717 }
michael@0 718
michael@0 719 /* Terminate the buffer */
michael@0 720 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
michael@0 721
michael@0 722 return dest;
michael@0 723 }
michael@0 724
michael@0 725 U_CAPI UChar* U_EXPORT2
michael@0 726 u_strFromUTF8(UChar *dest,
michael@0 727 int32_t destCapacity,
michael@0 728 int32_t *pDestLength,
michael@0 729 const char* src,
michael@0 730 int32_t srcLength,
michael@0 731 UErrorCode *pErrorCode){
michael@0 732 return u_strFromUTF8WithSub(
michael@0 733 dest, destCapacity, pDestLength,
michael@0 734 src, srcLength,
michael@0 735 U_SENTINEL, NULL,
michael@0 736 pErrorCode);
michael@0 737 }
michael@0 738
michael@0 739 U_CAPI UChar * U_EXPORT2
michael@0 740 u_strFromUTF8Lenient(UChar *dest,
michael@0 741 int32_t destCapacity,
michael@0 742 int32_t *pDestLength,
michael@0 743 const char *src,
michael@0 744 int32_t srcLength,
michael@0 745 UErrorCode *pErrorCode) {
michael@0 746 UChar *pDest = dest;
michael@0 747 UChar32 ch;
michael@0 748 int32_t reqLength = 0;
michael@0 749 uint8_t* pSrc = (uint8_t*) src;
michael@0 750
michael@0 751 /* args check */
michael@0 752 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
michael@0 753 return NULL;
michael@0 754 }
michael@0 755
michael@0 756 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 757 (destCapacity<0) || (dest == NULL && destCapacity > 0)
michael@0 758 ) {
michael@0 759 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 760 return NULL;
michael@0 761 }
michael@0 762
michael@0 763 if(srcLength < 0) {
michael@0 764 /* Transform a NUL-terminated string. */
michael@0 765 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
michael@0 766 uint8_t t1, t2, t3; /* trail bytes */
michael@0 767
michael@0 768 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
michael@0 769 if(ch < 0xc0) {
michael@0 770 /*
michael@0 771 * ASCII, or a trail byte in lead position which is treated like
michael@0 772 * a single-byte sequence for better character boundary
michael@0 773 * resynchronization after illegal sequences.
michael@0 774 */
michael@0 775 *pDest++=(UChar)ch;
michael@0 776 ++pSrc;
michael@0 777 continue;
michael@0 778 } else if(ch < 0xe0) { /* U+0080..U+07FF */
michael@0 779 if((t1 = pSrc[1]) != 0) {
michael@0 780 /* 0x3080 = (0xc0 << 6) + 0x80 */
michael@0 781 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
michael@0 782 pSrc += 2;
michael@0 783 continue;
michael@0 784 }
michael@0 785 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
michael@0 786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
michael@0 787 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 788 /* 0x2080 = (0x80 << 6) + 0x80 */
michael@0 789 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
michael@0 790 pSrc += 3;
michael@0 791 continue;
michael@0 792 }
michael@0 793 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
michael@0 794 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
michael@0 795 pSrc += 4;
michael@0 796 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
michael@0 797 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
michael@0 798 *(pDest++) = U16_LEAD(ch);
michael@0 799 if(pDest < pDestLimit) {
michael@0 800 *(pDest++) = U16_TRAIL(ch);
michael@0 801 } else {
michael@0 802 reqLength = 1;
michael@0 803 break;
michael@0 804 }
michael@0 805 continue;
michael@0 806 }
michael@0 807 }
michael@0 808
michael@0 809 /* truncated character at the end */
michael@0 810 *pDest++ = 0xfffd;
michael@0 811 while(*++pSrc != 0) {}
michael@0 812 break;
michael@0 813 }
michael@0 814
michael@0 815 /* Pre-flight the rest of the string. */
michael@0 816 while((ch = *pSrc) != 0) {
michael@0 817 if(ch < 0xc0) {
michael@0 818 /*
michael@0 819 * ASCII, or a trail byte in lead position which is treated like
michael@0 820 * a single-byte sequence for better character boundary
michael@0 821 * resynchronization after illegal sequences.
michael@0 822 */
michael@0 823 ++reqLength;
michael@0 824 ++pSrc;
michael@0 825 continue;
michael@0 826 } else if(ch < 0xe0) { /* U+0080..U+07FF */
michael@0 827 if(pSrc[1] != 0) {
michael@0 828 ++reqLength;
michael@0 829 pSrc += 2;
michael@0 830 continue;
michael@0 831 }
michael@0 832 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
michael@0 833 if(pSrc[1] != 0 && pSrc[2] != 0) {
michael@0 834 ++reqLength;
michael@0 835 pSrc += 3;
michael@0 836 continue;
michael@0 837 }
michael@0 838 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
michael@0 839 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
michael@0 840 reqLength += 2;
michael@0 841 pSrc += 4;
michael@0 842 continue;
michael@0 843 }
michael@0 844 }
michael@0 845
michael@0 846 /* truncated character at the end */
michael@0 847 ++reqLength;
michael@0 848 break;
michael@0 849 }
michael@0 850 } else /* srcLength >= 0 */ {
michael@0 851 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
michael@0 852
michael@0 853 /*
michael@0 854 * This function requires that if srcLength is given, then it must be
michael@0 855 * destCapatity >= srcLength so that we need not check for
michael@0 856 * destination buffer overflow in the loop.
michael@0 857 */
michael@0 858 if(destCapacity < srcLength) {
michael@0 859 if(pDestLength != NULL) {
michael@0 860 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
michael@0 861 }
michael@0 862 *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
michael@0 863 return NULL;
michael@0 864 }
michael@0 865
michael@0 866 if((pSrcLimit - pSrc) >= 4) {
michael@0 867 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
michael@0 868
michael@0 869 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
michael@0 870 do {
michael@0 871 ch = *pSrc++;
michael@0 872 if(ch < 0xc0) {
michael@0 873 /*
michael@0 874 * ASCII, or a trail byte in lead position which is treated like
michael@0 875 * a single-byte sequence for better character boundary
michael@0 876 * resynchronization after illegal sequences.
michael@0 877 */
michael@0 878 *pDest++=(UChar)ch;
michael@0 879 } else if(ch < 0xe0) { /* U+0080..U+07FF */
michael@0 880 /* 0x3080 = (0xc0 << 6) + 0x80 */
michael@0 881 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
michael@0 882 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
michael@0 883 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 884 /* 0x2080 = (0x80 << 6) + 0x80 */
michael@0 885 ch = (ch << 12) + (*pSrc++ << 6);
michael@0 886 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
michael@0 887 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
michael@0 888 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
michael@0 889 ch = (ch << 18) + (*pSrc++ << 12);
michael@0 890 ch += *pSrc++ << 6;
michael@0 891 ch += *pSrc++ - 0x3c82080;
michael@0 892 *(pDest++) = U16_LEAD(ch);
michael@0 893 *(pDest++) = U16_TRAIL(ch);
michael@0 894 }
michael@0 895 } while(pSrc < pSrcLimit);
michael@0 896
michael@0 897 pSrcLimit += 3; /* restore original pSrcLimit */
michael@0 898 }
michael@0 899
michael@0 900 while(pSrc < pSrcLimit) {
michael@0 901 ch = *pSrc++;
michael@0 902 if(ch < 0xc0) {
michael@0 903 /*
michael@0 904 * ASCII, or a trail byte in lead position which is treated like
michael@0 905 * a single-byte sequence for better character boundary
michael@0 906 * resynchronization after illegal sequences.
michael@0 907 */
michael@0 908 *pDest++=(UChar)ch;
michael@0 909 continue;
michael@0 910 } else if(ch < 0xe0) { /* U+0080..U+07FF */
michael@0 911 if(pSrc < pSrcLimit) {
michael@0 912 /* 0x3080 = (0xc0 << 6) + 0x80 */
michael@0 913 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
michael@0 914 continue;
michael@0 915 }
michael@0 916 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
michael@0 917 if((pSrcLimit - pSrc) >= 2) {
michael@0 918 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 919 /* 0x2080 = (0x80 << 6) + 0x80 */
michael@0 920 ch = (ch << 12) + (*pSrc++ << 6);
michael@0 921 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
michael@0 922 pSrc += 3;
michael@0 923 continue;
michael@0 924 }
michael@0 925 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
michael@0 926 if((pSrcLimit - pSrc) >= 3) {
michael@0 927 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
michael@0 928 ch = (ch << 18) + (*pSrc++ << 12);
michael@0 929 ch += *pSrc++ << 6;
michael@0 930 ch += *pSrc++ - 0x3c82080;
michael@0 931 *(pDest++) = U16_LEAD(ch);
michael@0 932 *(pDest++) = U16_TRAIL(ch);
michael@0 933 pSrc += 4;
michael@0 934 continue;
michael@0 935 }
michael@0 936 }
michael@0 937
michael@0 938 /* truncated character at the end */
michael@0 939 *pDest++ = 0xfffd;
michael@0 940 break;
michael@0 941 }
michael@0 942 }
michael@0 943
michael@0 944 reqLength+=(int32_t)(pDest - dest);
michael@0 945
michael@0 946 if(pDestLength){
michael@0 947 *pDestLength = reqLength;
michael@0 948 }
michael@0 949
michael@0 950 /* Terminate the buffer */
michael@0 951 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
michael@0 952
michael@0 953 return dest;
michael@0 954 }
michael@0 955
michael@0 956 static inline uint8_t *
michael@0 957 _appendUTF8(uint8_t *pDest, UChar32 c) {
michael@0 958 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
michael@0 959 if((c)<=0x7f) {
michael@0 960 *pDest++=(uint8_t)c;
michael@0 961 } else if(c<=0x7ff) {
michael@0 962 *pDest++=(uint8_t)((c>>6)|0xc0);
michael@0 963 *pDest++=(uint8_t)((c&0x3f)|0x80);
michael@0 964 } else if(c<=0xffff) {
michael@0 965 *pDest++=(uint8_t)((c>>12)|0xe0);
michael@0 966 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
michael@0 967 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
michael@0 968 } else /* if((uint32_t)(c)<=0x10ffff) */ {
michael@0 969 *pDest++=(uint8_t)(((c)>>18)|0xf0);
michael@0 970 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
michael@0 971 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
michael@0 972 *pDest++=(uint8_t)(((c)&0x3f)|0x80);
michael@0 973 }
michael@0 974 return pDest;
michael@0 975 }
michael@0 976
michael@0 977
michael@0 978 U_CAPI char* U_EXPORT2
michael@0 979 u_strToUTF8WithSub(char *dest,
michael@0 980 int32_t destCapacity,
michael@0 981 int32_t *pDestLength,
michael@0 982 const UChar *pSrc,
michael@0 983 int32_t srcLength,
michael@0 984 UChar32 subchar, int32_t *pNumSubstitutions,
michael@0 985 UErrorCode *pErrorCode){
michael@0 986 int32_t reqLength=0;
michael@0 987 uint32_t ch=0,ch2=0;
michael@0 988 uint8_t *pDest = (uint8_t *)dest;
michael@0 989 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
michael@0 990 int32_t numSubstitutions;
michael@0 991
michael@0 992 /* args check */
michael@0 993 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
michael@0 994 return NULL;
michael@0 995 }
michael@0 996
michael@0 997 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 998 (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
michael@0 999 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
michael@0 1000 ) {
michael@0 1001 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1002 return NULL;
michael@0 1003 }
michael@0 1004
michael@0 1005 if(pNumSubstitutions!=NULL) {
michael@0 1006 *pNumSubstitutions=0;
michael@0 1007 }
michael@0 1008 numSubstitutions=0;
michael@0 1009
michael@0 1010 if(srcLength==-1) {
michael@0 1011 while((ch=*pSrc)!=0) {
michael@0 1012 ++pSrc;
michael@0 1013 if(ch <= 0x7f) {
michael@0 1014 if(pDest<pDestLimit) {
michael@0 1015 *pDest++ = (uint8_t)ch;
michael@0 1016 } else {
michael@0 1017 reqLength = 1;
michael@0 1018 break;
michael@0 1019 }
michael@0 1020 } else if(ch <= 0x7ff) {
michael@0 1021 if((pDestLimit - pDest) >= 2) {
michael@0 1022 *pDest++=(uint8_t)((ch>>6)|0xc0);
michael@0 1023 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1024 } else {
michael@0 1025 reqLength = 2;
michael@0 1026 break;
michael@0 1027 }
michael@0 1028 } else if(ch <= 0xd7ff || ch >= 0xe000) {
michael@0 1029 if((pDestLimit - pDest) >= 3) {
michael@0 1030 *pDest++=(uint8_t)((ch>>12)|0xe0);
michael@0 1031 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1032 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1033 } else {
michael@0 1034 reqLength = 3;
michael@0 1035 break;
michael@0 1036 }
michael@0 1037 } else /* ch is a surrogate */ {
michael@0 1038 int32_t length;
michael@0 1039
michael@0 1040 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
michael@0 1041 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
michael@0 1042 ++pSrc;
michael@0 1043 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
michael@0 1044 } else if(subchar>=0) {
michael@0 1045 ch=subchar;
michael@0 1046 ++numSubstitutions;
michael@0 1047 } else {
michael@0 1048 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
michael@0 1049 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1050 return NULL;
michael@0 1051 }
michael@0 1052
michael@0 1053 length = U8_LENGTH(ch);
michael@0 1054 if((pDestLimit - pDest) >= length) {
michael@0 1055 /* convert and append*/
michael@0 1056 pDest=_appendUTF8(pDest, ch);
michael@0 1057 } else {
michael@0 1058 reqLength = length;
michael@0 1059 break;
michael@0 1060 }
michael@0 1061 }
michael@0 1062 }
michael@0 1063 while((ch=*pSrc++)!=0) {
michael@0 1064 if(ch<=0x7f) {
michael@0 1065 ++reqLength;
michael@0 1066 } else if(ch<=0x7ff) {
michael@0 1067 reqLength+=2;
michael@0 1068 } else if(!U16_IS_SURROGATE(ch)) {
michael@0 1069 reqLength+=3;
michael@0 1070 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
michael@0 1071 ++pSrc;
michael@0 1072 reqLength+=4;
michael@0 1073 } else if(subchar>=0) {
michael@0 1074 reqLength+=U8_LENGTH(subchar);
michael@0 1075 ++numSubstitutions;
michael@0 1076 } else {
michael@0 1077 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
michael@0 1078 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1079 return NULL;
michael@0 1080 }
michael@0 1081 }
michael@0 1082 } else {
michael@0 1083 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
michael@0 1084 int32_t count;
michael@0 1085
michael@0 1086 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
michael@0 1087 for(;;) {
michael@0 1088 /*
michael@0 1089 * Each iteration of the inner loop progresses by at most 3 UTF-8
michael@0 1090 * bytes and one UChar, for most characters.
michael@0 1091 * For supplementary code points (4 & 2), which are rare,
michael@0 1092 * there is an additional adjustment.
michael@0 1093 */
michael@0 1094 count = (int32_t)((pDestLimit - pDest) / 3);
michael@0 1095 srcLength = (int32_t)(pSrcLimit - pSrc);
michael@0 1096 if(count > srcLength) {
michael@0 1097 count = srcLength; /* min(remaining dest/3, remaining src) */
michael@0 1098 }
michael@0 1099 if(count < 3) {
michael@0 1100 /*
michael@0 1101 * Too much overhead if we get near the end of the string,
michael@0 1102 * continue with the next loop.
michael@0 1103 */
michael@0 1104 break;
michael@0 1105 }
michael@0 1106 do {
michael@0 1107 ch=*pSrc++;
michael@0 1108 if(ch <= 0x7f) {
michael@0 1109 *pDest++ = (uint8_t)ch;
michael@0 1110 } else if(ch <= 0x7ff) {
michael@0 1111 *pDest++=(uint8_t)((ch>>6)|0xc0);
michael@0 1112 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1113 } else if(ch <= 0xd7ff || ch >= 0xe000) {
michael@0 1114 *pDest++=(uint8_t)((ch>>12)|0xe0);
michael@0 1115 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1116 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1117 } else /* ch is a surrogate */ {
michael@0 1118 /*
michael@0 1119 * We will read two UChars and probably output four bytes,
michael@0 1120 * which we didn't account for with computing count,
michael@0 1121 * so we adjust it here.
michael@0 1122 */
michael@0 1123 if(--count == 0) {
michael@0 1124 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
michael@0 1125 break; /* recompute count */
michael@0 1126 }
michael@0 1127
michael@0 1128 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
michael@0 1129 ++pSrc;
michael@0 1130 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
michael@0 1131
michael@0 1132 /* writing 4 bytes per 2 UChars is ok */
michael@0 1133 *pDest++=(uint8_t)((ch>>18)|0xf0);
michael@0 1134 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
michael@0 1135 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1136 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1137 } else {
michael@0 1138 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
michael@0 1139 if(subchar>=0) {
michael@0 1140 ch=subchar;
michael@0 1141 ++numSubstitutions;
michael@0 1142 } else {
michael@0 1143 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1144 return NULL;
michael@0 1145 }
michael@0 1146
michael@0 1147 /* convert and append*/
michael@0 1148 pDest=_appendUTF8(pDest, ch);
michael@0 1149 }
michael@0 1150 }
michael@0 1151 } while(--count > 0);
michael@0 1152 }
michael@0 1153
michael@0 1154 while(pSrc<pSrcLimit) {
michael@0 1155 ch=*pSrc++;
michael@0 1156 if(ch <= 0x7f) {
michael@0 1157 if(pDest<pDestLimit) {
michael@0 1158 *pDest++ = (uint8_t)ch;
michael@0 1159 } else {
michael@0 1160 reqLength = 1;
michael@0 1161 break;
michael@0 1162 }
michael@0 1163 } else if(ch <= 0x7ff) {
michael@0 1164 if((pDestLimit - pDest) >= 2) {
michael@0 1165 *pDest++=(uint8_t)((ch>>6)|0xc0);
michael@0 1166 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1167 } else {
michael@0 1168 reqLength = 2;
michael@0 1169 break;
michael@0 1170 }
michael@0 1171 } else if(ch <= 0xd7ff || ch >= 0xe000) {
michael@0 1172 if((pDestLimit - pDest) >= 3) {
michael@0 1173 *pDest++=(uint8_t)((ch>>12)|0xe0);
michael@0 1174 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1175 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1176 } else {
michael@0 1177 reqLength = 3;
michael@0 1178 break;
michael@0 1179 }
michael@0 1180 } else /* ch is a surrogate */ {
michael@0 1181 int32_t length;
michael@0 1182
michael@0 1183 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
michael@0 1184 ++pSrc;
michael@0 1185 ch=U16_GET_SUPPLEMENTARY(ch, ch2);
michael@0 1186 } else if(subchar>=0) {
michael@0 1187 ch=subchar;
michael@0 1188 ++numSubstitutions;
michael@0 1189 } else {
michael@0 1190 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
michael@0 1191 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1192 return NULL;
michael@0 1193 }
michael@0 1194
michael@0 1195 length = U8_LENGTH(ch);
michael@0 1196 if((pDestLimit - pDest) >= length) {
michael@0 1197 /* convert and append*/
michael@0 1198 pDest=_appendUTF8(pDest, ch);
michael@0 1199 } else {
michael@0 1200 reqLength = length;
michael@0 1201 break;
michael@0 1202 }
michael@0 1203 }
michael@0 1204 }
michael@0 1205 while(pSrc<pSrcLimit) {
michael@0 1206 ch=*pSrc++;
michael@0 1207 if(ch<=0x7f) {
michael@0 1208 ++reqLength;
michael@0 1209 } else if(ch<=0x7ff) {
michael@0 1210 reqLength+=2;
michael@0 1211 } else if(!U16_IS_SURROGATE(ch)) {
michael@0 1212 reqLength+=3;
michael@0 1213 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
michael@0 1214 ++pSrc;
michael@0 1215 reqLength+=4;
michael@0 1216 } else if(subchar>=0) {
michael@0 1217 reqLength+=U8_LENGTH(subchar);
michael@0 1218 ++numSubstitutions;
michael@0 1219 } else {
michael@0 1220 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
michael@0 1221 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1222 return NULL;
michael@0 1223 }
michael@0 1224 }
michael@0 1225 }
michael@0 1226
michael@0 1227 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
michael@0 1228
michael@0 1229 if(pNumSubstitutions!=NULL) {
michael@0 1230 *pNumSubstitutions=numSubstitutions;
michael@0 1231 }
michael@0 1232
michael@0 1233 if(pDestLength){
michael@0 1234 *pDestLength = reqLength;
michael@0 1235 }
michael@0 1236
michael@0 1237 /* Terminate the buffer */
michael@0 1238 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 1239 return dest;
michael@0 1240 }
michael@0 1241
michael@0 1242 U_CAPI char* U_EXPORT2
michael@0 1243 u_strToUTF8(char *dest,
michael@0 1244 int32_t destCapacity,
michael@0 1245 int32_t *pDestLength,
michael@0 1246 const UChar *pSrc,
michael@0 1247 int32_t srcLength,
michael@0 1248 UErrorCode *pErrorCode){
michael@0 1249 return u_strToUTF8WithSub(
michael@0 1250 dest, destCapacity, pDestLength,
michael@0 1251 pSrc, srcLength,
michael@0 1252 U_SENTINEL, NULL,
michael@0 1253 pErrorCode);
michael@0 1254 }
michael@0 1255
michael@0 1256 U_CAPI UChar* U_EXPORT2
michael@0 1257 u_strFromJavaModifiedUTF8WithSub(
michael@0 1258 UChar *dest,
michael@0 1259 int32_t destCapacity,
michael@0 1260 int32_t *pDestLength,
michael@0 1261 const char *src,
michael@0 1262 int32_t srcLength,
michael@0 1263 UChar32 subchar, int32_t *pNumSubstitutions,
michael@0 1264 UErrorCode *pErrorCode) {
michael@0 1265 UChar *pDest = dest;
michael@0 1266 UChar *pDestLimit = dest+destCapacity;
michael@0 1267 UChar32 ch;
michael@0 1268 int32_t reqLength = 0;
michael@0 1269 const uint8_t* pSrc = (const uint8_t*) src;
michael@0 1270 const uint8_t *pSrcLimit;
michael@0 1271 int32_t count;
michael@0 1272 uint8_t t1, t2; /* trail bytes */
michael@0 1273 int32_t numSubstitutions;
michael@0 1274
michael@0 1275 /* args check */
michael@0 1276 if(U_FAILURE(*pErrorCode)){
michael@0 1277 return NULL;
michael@0 1278 }
michael@0 1279 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 1280 (dest==NULL && destCapacity!=0) || destCapacity<0 ||
michael@0 1281 subchar > 0x10ffff || U_IS_SURROGATE(subchar)
michael@0 1282 ) {
michael@0 1283 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1284 return NULL;
michael@0 1285 }
michael@0 1286
michael@0 1287 if(pNumSubstitutions!=NULL) {
michael@0 1288 *pNumSubstitutions=0;
michael@0 1289 }
michael@0 1290 numSubstitutions=0;
michael@0 1291
michael@0 1292 if(srcLength < 0) {
michael@0 1293 /*
michael@0 1294 * Transform a NUL-terminated ASCII string.
michael@0 1295 * Handle non-ASCII strings with slower code.
michael@0 1296 */
michael@0 1297 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
michael@0 1298 *pDest++=(UChar)ch;
michael@0 1299 ++pSrc;
michael@0 1300 }
michael@0 1301 if(ch == 0) {
michael@0 1302 reqLength=(int32_t)(pDest - dest);
michael@0 1303 if(pDestLength) {
michael@0 1304 *pDestLength = reqLength;
michael@0 1305 }
michael@0 1306
michael@0 1307 /* Terminate the buffer */
michael@0 1308 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 1309 return dest;
michael@0 1310 }
michael@0 1311 srcLength = uprv_strlen((const char *)pSrc);
michael@0 1312 }
michael@0 1313
michael@0 1314 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
michael@0 1315 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
michael@0 1316 for(;;) {
michael@0 1317 count = (int32_t)(pDestLimit - pDest);
michael@0 1318 srcLength = (int32_t)(pSrcLimit - pSrc);
michael@0 1319 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
michael@0 1320 /* fast ASCII loop */
michael@0 1321 const uint8_t *prevSrc = pSrc;
michael@0 1322 int32_t delta;
michael@0 1323 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
michael@0 1324 *pDest++=(UChar)ch;
michael@0 1325 ++pSrc;
michael@0 1326 }
michael@0 1327 delta = (int32_t)(pSrc - prevSrc);
michael@0 1328 count -= delta;
michael@0 1329 srcLength -= delta;
michael@0 1330 }
michael@0 1331 /*
michael@0 1332 * Each iteration of the inner loop progresses by at most 3 UTF-8
michael@0 1333 * bytes and one UChar.
michael@0 1334 */
michael@0 1335 srcLength /= 3;
michael@0 1336 if(count > srcLength) {
michael@0 1337 count = srcLength; /* min(remaining dest, remaining src/3) */
michael@0 1338 }
michael@0 1339 if(count < 3) {
michael@0 1340 /*
michael@0 1341 * Too much overhead if we get near the end of the string,
michael@0 1342 * continue with the next loop.
michael@0 1343 */
michael@0 1344 break;
michael@0 1345 }
michael@0 1346 do {
michael@0 1347 ch = *pSrc;
michael@0 1348 if(ch <= 0x7f){
michael@0 1349 *pDest++=(UChar)ch;
michael@0 1350 ++pSrc;
michael@0 1351 } else {
michael@0 1352 if(ch >= 0xe0) {
michael@0 1353 if( /* handle U+0000..U+FFFF inline */
michael@0 1354 ch <= 0xef &&
michael@0 1355 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
michael@0 1356 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
michael@0 1357 ) {
michael@0 1358 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 1359 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
michael@0 1360 pSrc += 3;
michael@0 1361 continue;
michael@0 1362 }
michael@0 1363 } else {
michael@0 1364 if( /* handle U+0000..U+07FF inline */
michael@0 1365 ch >= 0xc0 &&
michael@0 1366 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
michael@0 1367 ) {
michael@0 1368 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
michael@0 1369 pSrc += 2;
michael@0 1370 continue;
michael@0 1371 }
michael@0 1372 }
michael@0 1373
michael@0 1374 if(subchar < 0) {
michael@0 1375 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1376 return NULL;
michael@0 1377 } else if(subchar > 0xffff && --count == 0) {
michael@0 1378 /*
michael@0 1379 * We need to write two UChars, adjusted count for that,
michael@0 1380 * and ran out of space.
michael@0 1381 */
michael@0 1382 break;
michael@0 1383 } else {
michael@0 1384 /* function call for error cases */
michael@0 1385 ++pSrc; /* continue after the lead byte */
michael@0 1386 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 1387 ++numSubstitutions;
michael@0 1388 if(subchar<=0xFFFF) {
michael@0 1389 *(pDest++)=(UChar)subchar;
michael@0 1390 } else {
michael@0 1391 *(pDest++)=U16_LEAD(subchar);
michael@0 1392 *(pDest++)=U16_TRAIL(subchar);
michael@0 1393 }
michael@0 1394 }
michael@0 1395 }
michael@0 1396 } while(--count > 0);
michael@0 1397 }
michael@0 1398
michael@0 1399 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
michael@0 1400 ch = *pSrc;
michael@0 1401 if(ch <= 0x7f){
michael@0 1402 *pDest++=(UChar)ch;
michael@0 1403 ++pSrc;
michael@0 1404 } else {
michael@0 1405 if(ch >= 0xe0) {
michael@0 1406 if( /* handle U+0000..U+FFFF inline */
michael@0 1407 ch <= 0xef &&
michael@0 1408 ((pSrcLimit - pSrc) >= 3) &&
michael@0 1409 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
michael@0 1410 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
michael@0 1411 ) {
michael@0 1412 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
michael@0 1413 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
michael@0 1414 pSrc += 3;
michael@0 1415 continue;
michael@0 1416 }
michael@0 1417 } else {
michael@0 1418 if( /* handle U+0000..U+07FF inline */
michael@0 1419 ch >= 0xc0 &&
michael@0 1420 ((pSrcLimit - pSrc) >= 2) &&
michael@0 1421 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
michael@0 1422 ) {
michael@0 1423 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
michael@0 1424 pSrc += 2;
michael@0 1425 continue;
michael@0 1426 }
michael@0 1427 }
michael@0 1428
michael@0 1429 if(subchar < 0) {
michael@0 1430 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1431 return NULL;
michael@0 1432 } else {
michael@0 1433 /* function call for error cases */
michael@0 1434 ++pSrc; /* continue after the lead byte */
michael@0 1435 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 1436 ++numSubstitutions;
michael@0 1437 if(subchar<=0xFFFF) {
michael@0 1438 *(pDest++)=(UChar)subchar;
michael@0 1439 } else {
michael@0 1440 *(pDest++)=U16_LEAD(subchar);
michael@0 1441 if(pDest<pDestLimit) {
michael@0 1442 *(pDest++)=U16_TRAIL(subchar);
michael@0 1443 } else {
michael@0 1444 reqLength++;
michael@0 1445 break;
michael@0 1446 }
michael@0 1447 }
michael@0 1448 }
michael@0 1449 }
michael@0 1450 }
michael@0 1451
michael@0 1452 /* do not fill the dest buffer just count the UChars needed */
michael@0 1453 while(pSrc < pSrcLimit){
michael@0 1454 ch = *pSrc;
michael@0 1455 if(ch <= 0x7f) {
michael@0 1456 reqLength++;
michael@0 1457 ++pSrc;
michael@0 1458 } else {
michael@0 1459 if(ch >= 0xe0) {
michael@0 1460 if( /* handle U+0000..U+FFFF inline */
michael@0 1461 ch <= 0xef &&
michael@0 1462 ((pSrcLimit - pSrc) >= 3) &&
michael@0 1463 (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
michael@0 1464 (uint8_t)(pSrc[2] - 0x80) <= 0x3f
michael@0 1465 ) {
michael@0 1466 reqLength++;
michael@0 1467 pSrc += 3;
michael@0 1468 continue;
michael@0 1469 }
michael@0 1470 } else {
michael@0 1471 if( /* handle U+0000..U+07FF inline */
michael@0 1472 ch >= 0xc0 &&
michael@0 1473 ((pSrcLimit - pSrc) >= 2) &&
michael@0 1474 (uint8_t)(pSrc[1] - 0x80) <= 0x3f
michael@0 1475 ) {
michael@0 1476 reqLength++;
michael@0 1477 pSrc += 2;
michael@0 1478 continue;
michael@0 1479 }
michael@0 1480 }
michael@0 1481
michael@0 1482 if(subchar < 0) {
michael@0 1483 *pErrorCode = U_INVALID_CHAR_FOUND;
michael@0 1484 return NULL;
michael@0 1485 } else {
michael@0 1486 /* function call for error cases */
michael@0 1487 ++pSrc; /* continue after the lead byte */
michael@0 1488 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
michael@0 1489 ++numSubstitutions;
michael@0 1490 reqLength+=U16_LENGTH(ch);
michael@0 1491 }
michael@0 1492 }
michael@0 1493 }
michael@0 1494
michael@0 1495 if(pNumSubstitutions!=NULL) {
michael@0 1496 *pNumSubstitutions=numSubstitutions;
michael@0 1497 }
michael@0 1498
michael@0 1499 reqLength+=(int32_t)(pDest - dest);
michael@0 1500 if(pDestLength) {
michael@0 1501 *pDestLength = reqLength;
michael@0 1502 }
michael@0 1503
michael@0 1504 /* Terminate the buffer */
michael@0 1505 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 1506 return dest;
michael@0 1507 }
michael@0 1508
michael@0 1509 U_CAPI char* U_EXPORT2
michael@0 1510 u_strToJavaModifiedUTF8(
michael@0 1511 char *dest,
michael@0 1512 int32_t destCapacity,
michael@0 1513 int32_t *pDestLength,
michael@0 1514 const UChar *src,
michael@0 1515 int32_t srcLength,
michael@0 1516 UErrorCode *pErrorCode) {
michael@0 1517 int32_t reqLength=0;
michael@0 1518 uint32_t ch=0;
michael@0 1519 uint8_t *pDest = (uint8_t *)dest;
michael@0 1520 uint8_t *pDestLimit = pDest + destCapacity;
michael@0 1521 const UChar *pSrcLimit;
michael@0 1522 int32_t count;
michael@0 1523
michael@0 1524 /* args check */
michael@0 1525 if(U_FAILURE(*pErrorCode)){
michael@0 1526 return NULL;
michael@0 1527 }
michael@0 1528 if( (src==NULL && srcLength!=0) || srcLength < -1 ||
michael@0 1529 (dest==NULL && destCapacity!=0) || destCapacity<0
michael@0 1530 ) {
michael@0 1531 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1532 return NULL;
michael@0 1533 }
michael@0 1534
michael@0 1535 if(srcLength==-1) {
michael@0 1536 /* Convert NUL-terminated ASCII, then find the string length. */
michael@0 1537 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
michael@0 1538 *pDest++ = (uint8_t)ch;
michael@0 1539 ++src;
michael@0 1540 }
michael@0 1541 if(ch == 0) {
michael@0 1542 reqLength=(int32_t)(pDest - (uint8_t *)dest);
michael@0 1543 if(pDestLength) {
michael@0 1544 *pDestLength = reqLength;
michael@0 1545 }
michael@0 1546
michael@0 1547 /* Terminate the buffer */
michael@0 1548 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 1549 return dest;
michael@0 1550 }
michael@0 1551 srcLength = u_strlen(src);
michael@0 1552 }
michael@0 1553
michael@0 1554 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
michael@0 1555 pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
michael@0 1556 for(;;) {
michael@0 1557 count = (int32_t)(pDestLimit - pDest);
michael@0 1558 srcLength = (int32_t)(pSrcLimit - src);
michael@0 1559 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
michael@0 1560 /* fast ASCII loop */
michael@0 1561 const UChar *prevSrc = src;
michael@0 1562 int32_t delta;
michael@0 1563 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
michael@0 1564 *pDest++=(uint8_t)ch;
michael@0 1565 ++src;
michael@0 1566 }
michael@0 1567 delta = (int32_t)(src - prevSrc);
michael@0 1568 count -= delta;
michael@0 1569 srcLength -= delta;
michael@0 1570 }
michael@0 1571 /*
michael@0 1572 * Each iteration of the inner loop progresses by at most 3 UTF-8
michael@0 1573 * bytes and one UChar.
michael@0 1574 */
michael@0 1575 count /= 3;
michael@0 1576 if(count > srcLength) {
michael@0 1577 count = srcLength; /* min(remaining dest/3, remaining src) */
michael@0 1578 }
michael@0 1579 if(count < 3) {
michael@0 1580 /*
michael@0 1581 * Too much overhead if we get near the end of the string,
michael@0 1582 * continue with the next loop.
michael@0 1583 */
michael@0 1584 break;
michael@0 1585 }
michael@0 1586 do {
michael@0 1587 ch=*src++;
michael@0 1588 if(ch <= 0x7f && ch != 0) {
michael@0 1589 *pDest++ = (uint8_t)ch;
michael@0 1590 } else if(ch <= 0x7ff) {
michael@0 1591 *pDest++=(uint8_t)((ch>>6)|0xc0);
michael@0 1592 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1593 } else {
michael@0 1594 *pDest++=(uint8_t)((ch>>12)|0xe0);
michael@0 1595 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1596 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1597 }
michael@0 1598 } while(--count > 0);
michael@0 1599 }
michael@0 1600
michael@0 1601 while(src<pSrcLimit) {
michael@0 1602 ch=*src++;
michael@0 1603 if(ch <= 0x7f && ch != 0) {
michael@0 1604 if(pDest<pDestLimit) {
michael@0 1605 *pDest++ = (uint8_t)ch;
michael@0 1606 } else {
michael@0 1607 reqLength = 1;
michael@0 1608 break;
michael@0 1609 }
michael@0 1610 } else if(ch <= 0x7ff) {
michael@0 1611 if((pDestLimit - pDest) >= 2) {
michael@0 1612 *pDest++=(uint8_t)((ch>>6)|0xc0);
michael@0 1613 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1614 } else {
michael@0 1615 reqLength = 2;
michael@0 1616 break;
michael@0 1617 }
michael@0 1618 } else {
michael@0 1619 if((pDestLimit - pDest) >= 3) {
michael@0 1620 *pDest++=(uint8_t)((ch>>12)|0xe0);
michael@0 1621 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
michael@0 1622 *pDest++=(uint8_t)((ch&0x3f)|0x80);
michael@0 1623 } else {
michael@0 1624 reqLength = 3;
michael@0 1625 break;
michael@0 1626 }
michael@0 1627 }
michael@0 1628 }
michael@0 1629 while(src<pSrcLimit) {
michael@0 1630 ch=*src++;
michael@0 1631 if(ch <= 0x7f && ch != 0) {
michael@0 1632 ++reqLength;
michael@0 1633 } else if(ch<=0x7ff) {
michael@0 1634 reqLength+=2;
michael@0 1635 } else {
michael@0 1636 reqLength+=3;
michael@0 1637 }
michael@0 1638 }
michael@0 1639
michael@0 1640 reqLength+=(int32_t)(pDest - (uint8_t *)dest);
michael@0 1641 if(pDestLength){
michael@0 1642 *pDestLength = reqLength;
michael@0 1643 }
michael@0 1644
michael@0 1645 /* Terminate the buffer */
michael@0 1646 u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
michael@0 1647 return dest;
michael@0 1648 }

mercurial