intl/icu/source/common/ucnv_u16.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /*
michael@0 2 **********************************************************************
michael@0 3 * Copyright (C) 2002-2010, International Business Machines
michael@0 4 * Corporation and others. All Rights Reserved.
michael@0 5 **********************************************************************
michael@0 6 * file name: ucnv_u16.c
michael@0 7 * encoding: US-ASCII
michael@0 8 * tab size: 8 (not used)
michael@0 9 * indentation:4
michael@0 10 *
michael@0 11 * created on: 2002jul01
michael@0 12 * created by: Markus W. Scherer
michael@0 13 *
michael@0 14 * UTF-16 converter implementation. Used to be in ucnv_utf.c.
michael@0 15 */
michael@0 16
michael@0 17 #include "unicode/utypes.h"
michael@0 18
michael@0 19 #if !UCONFIG_NO_CONVERSION
michael@0 20
michael@0 21 #include "unicode/ucnv.h"
michael@0 22 #include "ucnv_bld.h"
michael@0 23 #include "ucnv_cnv.h"
michael@0 24 #include "cmemory.h"
michael@0 25
/*
 * Value kept in UConverter.fromUnicodeStatus while a byte order mark
 * still has to be written before any converted output.
 */
enum { UCNV_NEED_TO_WRITE_BOM = 1 };
michael@0 29
michael@0 30 /*
michael@0 31 * The UTF-16 toUnicode implementation is also used for the Java-specific
michael@0 32 * "with BOM" variants of UTF-16BE and UTF-16LE.
michael@0 33 */
michael@0 34 static void
michael@0 35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 36 UErrorCode *pErrorCode);
michael@0 37
michael@0 38 /* UTF-16BE ----------------------------------------------------------------- */
michael@0 39
/*
 * Platform-endian alias: resolves to the fromUnicode routine whose byte
 * order matches U_IS_BIG_ENDIAN.
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
michael@0 45
michael@0 46
michael@0 47 static void
michael@0 48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0 49 UErrorCode *pErrorCode) {
michael@0 50 UConverter *cnv;
michael@0 51 const UChar *source;
michael@0 52 char *target;
michael@0 53 int32_t *offsets;
michael@0 54
michael@0 55 uint32_t targetCapacity, length, sourceIndex;
michael@0 56 UChar c, trail;
michael@0 57 char overflow[4];
michael@0 58
michael@0 59 source=pArgs->source;
michael@0 60 length=(int32_t)(pArgs->sourceLimit-source);
michael@0 61 if(length<=0) {
michael@0 62 /* no input, nothing to do */
michael@0 63 return;
michael@0 64 }
michael@0 65
michael@0 66 cnv=pArgs->converter;
michael@0 67
michael@0 68 /* write the BOM if necessary */
michael@0 69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 70 static const char bom[]={ (char)0xfe, (char)0xff };
michael@0 71 ucnv_fromUWriteBytes(cnv,
michael@0 72 bom, 2,
michael@0 73 &pArgs->target, pArgs->targetLimit,
michael@0 74 &pArgs->offsets, -1,
michael@0 75 pErrorCode);
michael@0 76 cnv->fromUnicodeStatus=0;
michael@0 77 }
michael@0 78
michael@0 79 target=pArgs->target;
michael@0 80 if(target >= pArgs->targetLimit) {
michael@0 81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 82 return;
michael@0 83 }
michael@0 84
michael@0 85 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
michael@0 86 offsets=pArgs->offsets;
michael@0 87 sourceIndex=0;
michael@0 88
michael@0 89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
michael@0 90
michael@0 91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
michael@0 92 /* the last buffer ended with a lead surrogate, output the surrogate pair */
michael@0 93 ++source;
michael@0 94 --length;
michael@0 95 target[0]=(uint8_t)(c>>8);
michael@0 96 target[1]=(uint8_t)c;
michael@0 97 target[2]=(uint8_t)(trail>>8);
michael@0 98 target[3]=(uint8_t)trail;
michael@0 99 target+=4;
michael@0 100 targetCapacity-=4;
michael@0 101 if(offsets!=NULL) {
michael@0 102 *offsets++=-1;
michael@0 103 *offsets++=-1;
michael@0 104 *offsets++=-1;
michael@0 105 *offsets++=-1;
michael@0 106 }
michael@0 107 sourceIndex=1;
michael@0 108 cnv->fromUChar32=c=0;
michael@0 109 }
michael@0 110
michael@0 111 if(c==0) {
michael@0 112 /* copy an even number of bytes for complete UChars */
michael@0 113 uint32_t count=2*length;
michael@0 114 if(count>targetCapacity) {
michael@0 115 count=targetCapacity&~1;
michael@0 116 }
michael@0 117 /* count is even */
michael@0 118 targetCapacity-=count;
michael@0 119 count>>=1;
michael@0 120 length-=count;
michael@0 121
michael@0 122 if(offsets==NULL) {
michael@0 123 while(count>0) {
michael@0 124 c=*source++;
michael@0 125 if(U16_IS_SINGLE(c)) {
michael@0 126 target[0]=(uint8_t)(c>>8);
michael@0 127 target[1]=(uint8_t)c;
michael@0 128 target+=2;
michael@0 129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
michael@0 130 ++source;
michael@0 131 --count;
michael@0 132 target[0]=(uint8_t)(c>>8);
michael@0 133 target[1]=(uint8_t)c;
michael@0 134 target[2]=(uint8_t)(trail>>8);
michael@0 135 target[3]=(uint8_t)trail;
michael@0 136 target+=4;
michael@0 137 } else {
michael@0 138 break;
michael@0 139 }
michael@0 140 --count;
michael@0 141 }
michael@0 142 } else {
michael@0 143 while(count>0) {
michael@0 144 c=*source++;
michael@0 145 if(U16_IS_SINGLE(c)) {
michael@0 146 target[0]=(uint8_t)(c>>8);
michael@0 147 target[1]=(uint8_t)c;
michael@0 148 target+=2;
michael@0 149 *offsets++=sourceIndex;
michael@0 150 *offsets++=sourceIndex++;
michael@0 151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
michael@0 152 ++source;
michael@0 153 --count;
michael@0 154 target[0]=(uint8_t)(c>>8);
michael@0 155 target[1]=(uint8_t)c;
michael@0 156 target[2]=(uint8_t)(trail>>8);
michael@0 157 target[3]=(uint8_t)trail;
michael@0 158 target+=4;
michael@0 159 *offsets++=sourceIndex;
michael@0 160 *offsets++=sourceIndex;
michael@0 161 *offsets++=sourceIndex;
michael@0 162 *offsets++=sourceIndex;
michael@0 163 sourceIndex+=2;
michael@0 164 } else {
michael@0 165 break;
michael@0 166 }
michael@0 167 --count;
michael@0 168 }
michael@0 169 }
michael@0 170
michael@0 171 if(count==0) {
michael@0 172 /* done with the loop for complete UChars */
michael@0 173 if(length>0 && targetCapacity>0) {
michael@0 174 /*
michael@0 175 * there is more input and some target capacity -
michael@0 176 * it must be targetCapacity==1 because otherwise
michael@0 177 * the above would have copied more;
michael@0 178 * prepare for overflow output
michael@0 179 */
michael@0 180 if(U16_IS_SINGLE(c=*source++)) {
michael@0 181 overflow[0]=(char)(c>>8);
michael@0 182 overflow[1]=(char)c;
michael@0 183 length=2; /* 2 bytes to output */
michael@0 184 c=0;
michael@0 185 /* } else { keep c for surrogate handling, length will be set there */
michael@0 186 }
michael@0 187 } else {
michael@0 188 length=0;
michael@0 189 c=0;
michael@0 190 }
michael@0 191 } else {
michael@0 192 /* keep c for surrogate handling, length will be set there */
michael@0 193 targetCapacity+=2*count;
michael@0 194 }
michael@0 195 } else {
michael@0 196 length=0; /* from here on, length counts the bytes in overflow[] */
michael@0 197 }
michael@0 198
michael@0 199 if(c!=0) {
michael@0 200 /*
michael@0 201 * c is a surrogate, and
michael@0 202 * - source or target too short
michael@0 203 * - or the surrogate is unmatched
michael@0 204 */
michael@0 205 length=0;
michael@0 206 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 207 if(source<pArgs->sourceLimit) {
michael@0 208 if(U16_IS_TRAIL(trail=*source)) {
michael@0 209 /* output the surrogate pair, will overflow (see conditions comment above) */
michael@0 210 ++source;
michael@0 211 overflow[0]=(char)(c>>8);
michael@0 212 overflow[1]=(char)c;
michael@0 213 overflow[2]=(char)(trail>>8);
michael@0 214 overflow[3]=(char)trail;
michael@0 215 length=4; /* 4 bytes to output */
michael@0 216 c=0;
michael@0 217 } else {
michael@0 218 /* unmatched lead surrogate */
michael@0 219 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 220 }
michael@0 221 } else {
michael@0 222 /* see if the trail surrogate is in the next buffer */
michael@0 223 }
michael@0 224 } else {
michael@0 225 /* unmatched trail surrogate */
michael@0 226 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 227 }
michael@0 228 cnv->fromUChar32=c;
michael@0 229 }
michael@0 230
michael@0 231 if(length>0) {
michael@0 232 /* output length bytes with overflow (length>targetCapacity>0) */
michael@0 233 ucnv_fromUWriteBytes(cnv,
michael@0 234 overflow, length,
michael@0 235 (char **)&target, pArgs->targetLimit,
michael@0 236 &offsets, sourceIndex,
michael@0 237 pErrorCode);
michael@0 238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
michael@0 239 }
michael@0 240
michael@0 241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
michael@0 242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 243 }
michael@0 244
michael@0 245 /* write back the updated pointers */
michael@0 246 pArgs->source=source;
michael@0 247 pArgs->target=(char *)target;
michael@0 248 pArgs->offsets=offsets;
michael@0 249 }
michael@0 250
michael@0 251 static void
michael@0 252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 253 UErrorCode *pErrorCode) {
michael@0 254 UConverter *cnv;
michael@0 255 const uint8_t *source;
michael@0 256 UChar *target;
michael@0 257 int32_t *offsets;
michael@0 258
michael@0 259 uint32_t targetCapacity, length, count, sourceIndex;
michael@0 260 UChar c, trail;
michael@0 261
michael@0 262 if(pArgs->converter->mode<8) {
michael@0 263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 264 return;
michael@0 265 }
michael@0 266
michael@0 267 cnv=pArgs->converter;
michael@0 268 source=(const uint8_t *)pArgs->source;
michael@0 269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
michael@0 270 if(length<=0 && cnv->toUnicodeStatus==0) {
michael@0 271 /* no input, nothing to do */
michael@0 272 return;
michael@0 273 }
michael@0 274
michael@0 275 target=pArgs->target;
michael@0 276 if(target >= pArgs->targetLimit) {
michael@0 277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 278 return;
michael@0 279 }
michael@0 280
michael@0 281 targetCapacity=(uint32_t)(pArgs->targetLimit-target);
michael@0 282 offsets=pArgs->offsets;
michael@0 283 sourceIndex=0;
michael@0 284 c=0;
michael@0 285
michael@0 286 /* complete a partial UChar or pair from the last call */
michael@0 287 if(cnv->toUnicodeStatus!=0) {
michael@0 288 /*
michael@0 289 * special case: single byte from a previous buffer,
michael@0 290 * where the byte turned out not to belong to a trail surrogate
michael@0 291 * and the preceding, unmatched lead surrogate was put into toUBytes[]
michael@0 292 * for error handling
michael@0 293 */
michael@0 294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
michael@0 295 cnv->toULength=1;
michael@0 296 cnv->toUnicodeStatus=0;
michael@0 297 }
michael@0 298 if((count=cnv->toULength)!=0) {
michael@0 299 uint8_t *p=cnv->toUBytes;
michael@0 300 do {
michael@0 301 p[count++]=*source++;
michael@0 302 ++sourceIndex;
michael@0 303 --length;
michael@0 304 if(count==2) {
michael@0 305 c=((UChar)p[0]<<8)|p[1];
michael@0 306 if(U16_IS_SINGLE(c)) {
michael@0 307 /* output the BMP code point */
michael@0 308 *target++=c;
michael@0 309 if(offsets!=NULL) {
michael@0 310 *offsets++=-1;
michael@0 311 }
michael@0 312 --targetCapacity;
michael@0 313 count=0;
michael@0 314 c=0;
michael@0 315 break;
michael@0 316 } else if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 317 /* continue collecting bytes for the trail surrogate */
michael@0 318 c=0; /* avoid unnecessary surrogate handling below */
michael@0 319 } else {
michael@0 320 /* fall through to error handling for an unmatched trail surrogate */
michael@0 321 break;
michael@0 322 }
michael@0 323 } else if(count==4) {
michael@0 324 c=((UChar)p[0]<<8)|p[1];
michael@0 325 trail=((UChar)p[2]<<8)|p[3];
michael@0 326 if(U16_IS_TRAIL(trail)) {
michael@0 327 /* output the surrogate pair */
michael@0 328 *target++=c;
michael@0 329 if(targetCapacity>=2) {
michael@0 330 *target++=trail;
michael@0 331 if(offsets!=NULL) {
michael@0 332 *offsets++=-1;
michael@0 333 *offsets++=-1;
michael@0 334 }
michael@0 335 targetCapacity-=2;
michael@0 336 } else /* targetCapacity==1 */ {
michael@0 337 targetCapacity=0;
michael@0 338 cnv->UCharErrorBuffer[0]=trail;
michael@0 339 cnv->UCharErrorBufferLength=1;
michael@0 340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 341 }
michael@0 342 count=0;
michael@0 343 c=0;
michael@0 344 break;
michael@0 345 } else {
michael@0 346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
michael@0 347 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 348
michael@0 349 /* back out reading the code unit after it */
michael@0 350 if(((const uint8_t *)pArgs->source-source)>=2) {
michael@0 351 source-=2;
michael@0 352 } else {
michael@0 353 /*
michael@0 354 * if the trail unit's first byte was in a previous buffer, then
michael@0 355 * we need to put it into a special place because toUBytes[] will be
michael@0 356 * used for the lead unit's bytes
michael@0 357 */
michael@0 358 cnv->toUnicodeStatus=0x100|p[2];
michael@0 359 --source;
michael@0 360 }
michael@0 361 cnv->toULength=2;
michael@0 362
michael@0 363 /* write back the updated pointers */
michael@0 364 pArgs->source=(const char *)source;
michael@0 365 pArgs->target=target;
michael@0 366 pArgs->offsets=offsets;
michael@0 367 return;
michael@0 368 }
michael@0 369 }
michael@0 370 } while(length>0);
michael@0 371 cnv->toULength=(int8_t)count;
michael@0 372 }
michael@0 373
michael@0 374 /* copy an even number of bytes for complete UChars */
michael@0 375 count=2*targetCapacity;
michael@0 376 if(count>length) {
michael@0 377 count=length&~1;
michael@0 378 }
michael@0 379 if(c==0 && count>0) {
michael@0 380 length-=count;
michael@0 381 count>>=1;
michael@0 382 targetCapacity-=count;
michael@0 383 if(offsets==NULL) {
michael@0 384 do {
michael@0 385 c=((UChar)source[0]<<8)|source[1];
michael@0 386 source+=2;
michael@0 387 if(U16_IS_SINGLE(c)) {
michael@0 388 *target++=c;
michael@0 389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
michael@0 390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
michael@0 391 ) {
michael@0 392 source+=2;
michael@0 393 --count;
michael@0 394 *target++=c;
michael@0 395 *target++=trail;
michael@0 396 } else {
michael@0 397 break;
michael@0 398 }
michael@0 399 } while(--count>0);
michael@0 400 } else {
michael@0 401 do {
michael@0 402 c=((UChar)source[0]<<8)|source[1];
michael@0 403 source+=2;
michael@0 404 if(U16_IS_SINGLE(c)) {
michael@0 405 *target++=c;
michael@0 406 *offsets++=sourceIndex;
michael@0 407 sourceIndex+=2;
michael@0 408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
michael@0 409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
michael@0 410 ) {
michael@0 411 source+=2;
michael@0 412 --count;
michael@0 413 *target++=c;
michael@0 414 *target++=trail;
michael@0 415 *offsets++=sourceIndex;
michael@0 416 *offsets++=sourceIndex;
michael@0 417 sourceIndex+=4;
michael@0 418 } else {
michael@0 419 break;
michael@0 420 }
michael@0 421 } while(--count>0);
michael@0 422 }
michael@0 423
michael@0 424 if(count==0) {
michael@0 425 /* done with the loop for complete UChars */
michael@0 426 c=0;
michael@0 427 } else {
michael@0 428 /* keep c for surrogate handling, trail will be set there */
michael@0 429 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
michael@0 430 targetCapacity+=count;
michael@0 431 }
michael@0 432 }
michael@0 433
michael@0 434 if(c!=0) {
michael@0 435 /*
michael@0 436 * c is a surrogate, and
michael@0 437 * - source or target too short
michael@0 438 * - or the surrogate is unmatched
michael@0 439 */
michael@0 440 cnv->toUBytes[0]=(uint8_t)(c>>8);
michael@0 441 cnv->toUBytes[1]=(uint8_t)c;
michael@0 442 cnv->toULength=2;
michael@0 443
michael@0 444 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 445 if(length>=2) {
michael@0 446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
michael@0 447 /* output the surrogate pair, will overflow (see conditions comment above) */
michael@0 448 source+=2;
michael@0 449 length-=2;
michael@0 450 *target++=c;
michael@0 451 if(offsets!=NULL) {
michael@0 452 *offsets++=sourceIndex;
michael@0 453 }
michael@0 454 cnv->UCharErrorBuffer[0]=trail;
michael@0 455 cnv->UCharErrorBufferLength=1;
michael@0 456 cnv->toULength=0;
michael@0 457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 458 } else {
michael@0 459 /* unmatched lead surrogate */
michael@0 460 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 461 }
michael@0 462 } else {
michael@0 463 /* see if the trail surrogate is in the next buffer */
michael@0 464 }
michael@0 465 } else {
michael@0 466 /* unmatched trail surrogate */
michael@0 467 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 468 }
michael@0 469 }
michael@0 470
michael@0 471 if(U_SUCCESS(*pErrorCode)) {
michael@0 472 /* check for a remaining source byte */
michael@0 473 if(length>0) {
michael@0 474 if(targetCapacity==0) {
michael@0 475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 476 } else {
michael@0 477 /* it must be length==1 because otherwise the above would have copied more */
michael@0 478 cnv->toUBytes[cnv->toULength++]=*source++;
michael@0 479 }
michael@0 480 }
michael@0 481 }
michael@0 482
michael@0 483 /* write back the updated pointers */
michael@0 484 pArgs->source=(const char *)source;
michael@0 485 pArgs->target=target;
michael@0 486 pArgs->offsets=offsets;
michael@0 487 }
michael@0 488
michael@0 489 static UChar32
michael@0 490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
michael@0 491 const uint8_t *s, *sourceLimit;
michael@0 492 UChar32 c;
michael@0 493
michael@0 494 if(pArgs->converter->mode<8) {
michael@0 495 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
michael@0 496 }
michael@0 497
michael@0 498 s=(const uint8_t *)pArgs->source;
michael@0 499 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 500
michael@0 501 if(s>=sourceLimit) {
michael@0 502 /* no input */
michael@0 503 *err=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 504 return 0xffff;
michael@0 505 }
michael@0 506
michael@0 507 if(s+2>sourceLimit) {
michael@0 508 /* only one byte: truncated UChar */
michael@0 509 pArgs->converter->toUBytes[0]=*s++;
michael@0 510 pArgs->converter->toULength=1;
michael@0 511 pArgs->source=(const char *)s;
michael@0 512 *err = U_TRUNCATED_CHAR_FOUND;
michael@0 513 return 0xffff;
michael@0 514 }
michael@0 515
michael@0 516 /* get one UChar */
michael@0 517 c=((UChar32)*s<<8)|s[1];
michael@0 518 s+=2;
michael@0 519
michael@0 520 /* check for a surrogate pair */
michael@0 521 if(U_IS_SURROGATE(c)) {
michael@0 522 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 523 if(s+2<=sourceLimit) {
michael@0 524 UChar trail;
michael@0 525
michael@0 526 /* get a second UChar and see if it is a trail surrogate */
michael@0 527 trail=((UChar)*s<<8)|s[1];
michael@0 528 if(U16_IS_TRAIL(trail)) {
michael@0 529 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 530 s+=2;
michael@0 531 } else {
michael@0 532 /* unmatched lead surrogate */
michael@0 533 c=-2;
michael@0 534 }
michael@0 535 } else {
michael@0 536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
michael@0 537 uint8_t *bytes=pArgs->converter->toUBytes;
michael@0 538 s-=2;
michael@0 539 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
michael@0 540 do {
michael@0 541 *bytes++=*s++;
michael@0 542 } while(s<sourceLimit);
michael@0 543
michael@0 544 c=0xffff;
michael@0 545 *err=U_TRUNCATED_CHAR_FOUND;
michael@0 546 }
michael@0 547 } else {
michael@0 548 /* unmatched trail surrogate */
michael@0 549 c=-2;
michael@0 550 }
michael@0 551
michael@0 552 if(c<0) {
michael@0 553 /* write the unmatched surrogate */
michael@0 554 uint8_t *bytes=pArgs->converter->toUBytes;
michael@0 555 pArgs->converter->toULength=2;
michael@0 556 *bytes=*(s-2);
michael@0 557 bytes[1]=*(s-1);
michael@0 558
michael@0 559 c=0xffff;
michael@0 560 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 561 }
michael@0 562 }
michael@0 563
michael@0 564 pArgs->source=(const char *)s;
michael@0 565 return c;
michael@0 566 }
michael@0 567
michael@0 568 static void
michael@0 569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
michael@0 570 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 571 /* reset toUnicode state */
michael@0 572 if(UCNV_GET_VERSION(cnv)==0) {
michael@0 573 cnv->mode=8; /* no BOM handling */
michael@0 574 } else {
michael@0 575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
michael@0 576 }
michael@0 577 }
michael@0 578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
michael@0 579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
michael@0 580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
michael@0 581 }
michael@0 582 }
michael@0 583
michael@0 584 static void
michael@0 585 _UTF16BEOpen(UConverter *cnv,
michael@0 586 UConverterLoadArgs *pArgs,
michael@0 587 UErrorCode *pErrorCode) {
michael@0 588 if(UCNV_GET_VERSION(cnv)<=1) {
michael@0 589 _UTF16BEReset(cnv, UCNV_RESET_BOTH);
michael@0 590 } else {
michael@0 591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 592 }
michael@0 593 }
michael@0 594
michael@0 595 static const char *
michael@0 596 _UTF16BEGetName(const UConverter *cnv) {
michael@0 597 if(UCNV_GET_VERSION(cnv)==0) {
michael@0 598 return "UTF-16BE";
michael@0 599 } else {
michael@0 600 return "UTF-16BE,version=1";
michael@0 601 }
michael@0 602 }
michael@0 603
michael@0 604 static const UConverterImpl _UTF16BEImpl={
michael@0 605 UCNV_UTF16_BigEndian,
michael@0 606
michael@0 607 NULL,
michael@0 608 NULL,
michael@0 609
michael@0 610 _UTF16BEOpen,
michael@0 611 NULL,
michael@0 612 _UTF16BEReset,
michael@0 613
michael@0 614 _UTF16BEToUnicodeWithOffsets,
michael@0 615 _UTF16BEToUnicodeWithOffsets,
michael@0 616 _UTF16BEFromUnicodeWithOffsets,
michael@0 617 _UTF16BEFromUnicodeWithOffsets,
michael@0 618 _UTF16BEGetNextUChar,
michael@0 619
michael@0 620 NULL,
michael@0 621 _UTF16BEGetName,
michael@0 622 NULL,
michael@0 623 NULL,
michael@0 624 ucnv_getNonSurrogateUnicodeSet
michael@0 625 };
michael@0 626
michael@0 627 static const UConverterStaticData _UTF16BEStaticData={
michael@0 628 sizeof(UConverterStaticData),
michael@0 629 "UTF-16BE",
michael@0 630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
michael@0 631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
michael@0 632 0,
michael@0 633 0,
michael@0 634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 635 };
michael@0 636
michael@0 637
michael@0 638 const UConverterSharedData _UTF16BEData={
michael@0 639 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
michael@0 641 0
michael@0 642 };
michael@0 643
michael@0 644 /* UTF-16LE ----------------------------------------------------------------- */
michael@0 645
michael@0 646 static void
michael@0 647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
michael@0 648 UErrorCode *pErrorCode) {
michael@0 649 UConverter *cnv;
michael@0 650 const UChar *source;
michael@0 651 char *target;
michael@0 652 int32_t *offsets;
michael@0 653
michael@0 654 uint32_t targetCapacity, length, sourceIndex;
michael@0 655 UChar c, trail;
michael@0 656 char overflow[4];
michael@0 657
michael@0 658 source=pArgs->source;
michael@0 659 length=(int32_t)(pArgs->sourceLimit-source);
michael@0 660 if(length<=0) {
michael@0 661 /* no input, nothing to do */
michael@0 662 return;
michael@0 663 }
michael@0 664
michael@0 665 cnv=pArgs->converter;
michael@0 666
michael@0 667 /* write the BOM if necessary */
michael@0 668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
michael@0 669 static const char bom[]={ (char)0xff, (char)0xfe };
michael@0 670 ucnv_fromUWriteBytes(cnv,
michael@0 671 bom, 2,
michael@0 672 &pArgs->target, pArgs->targetLimit,
michael@0 673 &pArgs->offsets, -1,
michael@0 674 pErrorCode);
michael@0 675 cnv->fromUnicodeStatus=0;
michael@0 676 }
michael@0 677
michael@0 678 target=pArgs->target;
michael@0 679 if(target >= pArgs->targetLimit) {
michael@0 680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 681 return;
michael@0 682 }
michael@0 683
michael@0 684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
michael@0 685 offsets=pArgs->offsets;
michael@0 686 sourceIndex=0;
michael@0 687
michael@0 688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */
michael@0 689
michael@0 690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
michael@0 691 /* the last buffer ended with a lead surrogate, output the surrogate pair */
michael@0 692 ++source;
michael@0 693 --length;
michael@0 694 target[0]=(uint8_t)c;
michael@0 695 target[1]=(uint8_t)(c>>8);
michael@0 696 target[2]=(uint8_t)trail;
michael@0 697 target[3]=(uint8_t)(trail>>8);
michael@0 698 target+=4;
michael@0 699 targetCapacity-=4;
michael@0 700 if(offsets!=NULL) {
michael@0 701 *offsets++=-1;
michael@0 702 *offsets++=-1;
michael@0 703 *offsets++=-1;
michael@0 704 *offsets++=-1;
michael@0 705 }
michael@0 706 sourceIndex=1;
michael@0 707 cnv->fromUChar32=c=0;
michael@0 708 }
michael@0 709
michael@0 710 if(c==0) {
michael@0 711 /* copy an even number of bytes for complete UChars */
michael@0 712 uint32_t count=2*length;
michael@0 713 if(count>targetCapacity) {
michael@0 714 count=targetCapacity&~1;
michael@0 715 }
michael@0 716 /* count is even */
michael@0 717 targetCapacity-=count;
michael@0 718 count>>=1;
michael@0 719 length-=count;
michael@0 720
michael@0 721 if(offsets==NULL) {
michael@0 722 while(count>0) {
michael@0 723 c=*source++;
michael@0 724 if(U16_IS_SINGLE(c)) {
michael@0 725 target[0]=(uint8_t)c;
michael@0 726 target[1]=(uint8_t)(c>>8);
michael@0 727 target+=2;
michael@0 728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
michael@0 729 ++source;
michael@0 730 --count;
michael@0 731 target[0]=(uint8_t)c;
michael@0 732 target[1]=(uint8_t)(c>>8);
michael@0 733 target[2]=(uint8_t)trail;
michael@0 734 target[3]=(uint8_t)(trail>>8);
michael@0 735 target+=4;
michael@0 736 } else {
michael@0 737 break;
michael@0 738 }
michael@0 739 --count;
michael@0 740 }
michael@0 741 } else {
michael@0 742 while(count>0) {
michael@0 743 c=*source++;
michael@0 744 if(U16_IS_SINGLE(c)) {
michael@0 745 target[0]=(uint8_t)c;
michael@0 746 target[1]=(uint8_t)(c>>8);
michael@0 747 target+=2;
michael@0 748 *offsets++=sourceIndex;
michael@0 749 *offsets++=sourceIndex++;
michael@0 750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
michael@0 751 ++source;
michael@0 752 --count;
michael@0 753 target[0]=(uint8_t)c;
michael@0 754 target[1]=(uint8_t)(c>>8);
michael@0 755 target[2]=(uint8_t)trail;
michael@0 756 target[3]=(uint8_t)(trail>>8);
michael@0 757 target+=4;
michael@0 758 *offsets++=sourceIndex;
michael@0 759 *offsets++=sourceIndex;
michael@0 760 *offsets++=sourceIndex;
michael@0 761 *offsets++=sourceIndex;
michael@0 762 sourceIndex+=2;
michael@0 763 } else {
michael@0 764 break;
michael@0 765 }
michael@0 766 --count;
michael@0 767 }
michael@0 768 }
michael@0 769
michael@0 770 if(count==0) {
michael@0 771 /* done with the loop for complete UChars */
michael@0 772 if(length>0 && targetCapacity>0) {
michael@0 773 /*
michael@0 774 * there is more input and some target capacity -
michael@0 775 * it must be targetCapacity==1 because otherwise
michael@0 776 * the above would have copied more;
michael@0 777 * prepare for overflow output
michael@0 778 */
michael@0 779 if(U16_IS_SINGLE(c=*source++)) {
michael@0 780 overflow[0]=(char)c;
michael@0 781 overflow[1]=(char)(c>>8);
michael@0 782 length=2; /* 2 bytes to output */
michael@0 783 c=0;
michael@0 784 /* } else { keep c for surrogate handling, length will be set there */
michael@0 785 }
michael@0 786 } else {
michael@0 787 length=0;
michael@0 788 c=0;
michael@0 789 }
michael@0 790 } else {
michael@0 791 /* keep c for surrogate handling, length will be set there */
michael@0 792 targetCapacity+=2*count;
michael@0 793 }
michael@0 794 } else {
michael@0 795 length=0; /* from here on, length counts the bytes in overflow[] */
michael@0 796 }
michael@0 797
michael@0 798 if(c!=0) {
michael@0 799 /*
michael@0 800 * c is a surrogate, and
michael@0 801 * - source or target too short
michael@0 802 * - or the surrogate is unmatched
michael@0 803 */
michael@0 804 length=0;
michael@0 805 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 806 if(source<pArgs->sourceLimit) {
michael@0 807 if(U16_IS_TRAIL(trail=*source)) {
michael@0 808 /* output the surrogate pair, will overflow (see conditions comment above) */
michael@0 809 ++source;
michael@0 810 overflow[0]=(char)c;
michael@0 811 overflow[1]=(char)(c>>8);
michael@0 812 overflow[2]=(char)trail;
michael@0 813 overflow[3]=(char)(trail>>8);
michael@0 814 length=4; /* 4 bytes to output */
michael@0 815 c=0;
michael@0 816 } else {
michael@0 817 /* unmatched lead surrogate */
michael@0 818 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 819 }
michael@0 820 } else {
michael@0 821 /* see if the trail surrogate is in the next buffer */
michael@0 822 }
michael@0 823 } else {
michael@0 824 /* unmatched trail surrogate */
michael@0 825 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 826 }
michael@0 827 cnv->fromUChar32=c;
michael@0 828 }
michael@0 829
michael@0 830 if(length>0) {
michael@0 831 /* output length bytes with overflow (length>targetCapacity>0) */
michael@0 832 ucnv_fromUWriteBytes(cnv,
michael@0 833 overflow, length,
michael@0 834 &target, pArgs->targetLimit,
michael@0 835 &offsets, sourceIndex,
michael@0 836 pErrorCode);
michael@0 837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
michael@0 838 }
michael@0 839
michael@0 840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
michael@0 841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 842 }
michael@0 843
michael@0 844 /* write back the updated pointers */
michael@0 845 pArgs->source=source;
michael@0 846 pArgs->target=target;
michael@0 847 pArgs->offsets=offsets;
michael@0 848 }
michael@0 849
michael@0 850 static void
michael@0 851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 852 UErrorCode *pErrorCode) {
michael@0 853 UConverter *cnv;
michael@0 854 const uint8_t *source;
michael@0 855 UChar *target;
michael@0 856 int32_t *offsets;
michael@0 857
michael@0 858 uint32_t targetCapacity, length, count, sourceIndex;
michael@0 859 UChar c, trail;
michael@0 860
michael@0 861 if(pArgs->converter->mode<8) {
michael@0 862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 863 return;
michael@0 864 }
michael@0 865
michael@0 866 cnv=pArgs->converter;
michael@0 867 source=(const uint8_t *)pArgs->source;
michael@0 868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
michael@0 869 if(length<=0 && cnv->toUnicodeStatus==0) {
michael@0 870 /* no input, nothing to do */
michael@0 871 return;
michael@0 872 }
michael@0 873
michael@0 874 target=pArgs->target;
michael@0 875 if(target >= pArgs->targetLimit) {
michael@0 876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 877 return;
michael@0 878 }
michael@0 879
michael@0 880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
michael@0 881 offsets=pArgs->offsets;
michael@0 882 sourceIndex=0;
michael@0 883 c=0;
michael@0 884
michael@0 885 /* complete a partial UChar or pair from the last call */
michael@0 886 if(cnv->toUnicodeStatus!=0) {
michael@0 887 /*
michael@0 888 * special case: single byte from a previous buffer,
michael@0 889 * where the byte turned out not to belong to a trail surrogate
michael@0 890 * and the preceding, unmatched lead surrogate was put into toUBytes[]
michael@0 891 * for error handling
michael@0 892 */
michael@0 893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
michael@0 894 cnv->toULength=1;
michael@0 895 cnv->toUnicodeStatus=0;
michael@0 896 }
michael@0 897 if((count=cnv->toULength)!=0) {
michael@0 898 uint8_t *p=cnv->toUBytes;
michael@0 899 do {
michael@0 900 p[count++]=*source++;
michael@0 901 ++sourceIndex;
michael@0 902 --length;
michael@0 903 if(count==2) {
michael@0 904 c=((UChar)p[1]<<8)|p[0];
michael@0 905 if(U16_IS_SINGLE(c)) {
michael@0 906 /* output the BMP code point */
michael@0 907 *target++=c;
michael@0 908 if(offsets!=NULL) {
michael@0 909 *offsets++=-1;
michael@0 910 }
michael@0 911 --targetCapacity;
michael@0 912 count=0;
michael@0 913 c=0;
michael@0 914 break;
michael@0 915 } else if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 916 /* continue collecting bytes for the trail surrogate */
michael@0 917 c=0; /* avoid unnecessary surrogate handling below */
michael@0 918 } else {
michael@0 919 /* fall through to error handling for an unmatched trail surrogate */
michael@0 920 break;
michael@0 921 }
michael@0 922 } else if(count==4) {
michael@0 923 c=((UChar)p[1]<<8)|p[0];
michael@0 924 trail=((UChar)p[3]<<8)|p[2];
michael@0 925 if(U16_IS_TRAIL(trail)) {
michael@0 926 /* output the surrogate pair */
michael@0 927 *target++=c;
michael@0 928 if(targetCapacity>=2) {
michael@0 929 *target++=trail;
michael@0 930 if(offsets!=NULL) {
michael@0 931 *offsets++=-1;
michael@0 932 *offsets++=-1;
michael@0 933 }
michael@0 934 targetCapacity-=2;
michael@0 935 } else /* targetCapacity==1 */ {
michael@0 936 targetCapacity=0;
michael@0 937 cnv->UCharErrorBuffer[0]=trail;
michael@0 938 cnv->UCharErrorBufferLength=1;
michael@0 939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 940 }
michael@0 941 count=0;
michael@0 942 c=0;
michael@0 943 break;
michael@0 944 } else {
michael@0 945 /* unmatched lead surrogate, handle here for consistent toUBytes[] */
michael@0 946 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 947
michael@0 948 /* back out reading the code unit after it */
michael@0 949 if(((const uint8_t *)pArgs->source-source)>=2) {
michael@0 950 source-=2;
michael@0 951 } else {
michael@0 952 /*
michael@0 953 * if the trail unit's first byte was in a previous buffer, then
michael@0 954 * we need to put it into a special place because toUBytes[] will be
michael@0 955 * used for the lead unit's bytes
michael@0 956 */
michael@0 957 cnv->toUnicodeStatus=0x100|p[2];
michael@0 958 --source;
michael@0 959 }
michael@0 960 cnv->toULength=2;
michael@0 961
michael@0 962 /* write back the updated pointers */
michael@0 963 pArgs->source=(const char *)source;
michael@0 964 pArgs->target=target;
michael@0 965 pArgs->offsets=offsets;
michael@0 966 return;
michael@0 967 }
michael@0 968 }
michael@0 969 } while(length>0);
michael@0 970 cnv->toULength=(int8_t)count;
michael@0 971 }
michael@0 972
michael@0 973 /* copy an even number of bytes for complete UChars */
michael@0 974 count=2*targetCapacity;
michael@0 975 if(count>length) {
michael@0 976 count=length&~1;
michael@0 977 }
michael@0 978 if(c==0 && count>0) {
michael@0 979 length-=count;
michael@0 980 count>>=1;
michael@0 981 targetCapacity-=count;
michael@0 982 if(offsets==NULL) {
michael@0 983 do {
michael@0 984 c=((UChar)source[1]<<8)|source[0];
michael@0 985 source+=2;
michael@0 986 if(U16_IS_SINGLE(c)) {
michael@0 987 *target++=c;
michael@0 988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
michael@0 989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
michael@0 990 ) {
michael@0 991 source+=2;
michael@0 992 --count;
michael@0 993 *target++=c;
michael@0 994 *target++=trail;
michael@0 995 } else {
michael@0 996 break;
michael@0 997 }
michael@0 998 } while(--count>0);
michael@0 999 } else {
michael@0 1000 do {
michael@0 1001 c=((UChar)source[1]<<8)|source[0];
michael@0 1002 source+=2;
michael@0 1003 if(U16_IS_SINGLE(c)) {
michael@0 1004 *target++=c;
michael@0 1005 *offsets++=sourceIndex;
michael@0 1006 sourceIndex+=2;
michael@0 1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
michael@0 1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
michael@0 1009 ) {
michael@0 1010 source+=2;
michael@0 1011 --count;
michael@0 1012 *target++=c;
michael@0 1013 *target++=trail;
michael@0 1014 *offsets++=sourceIndex;
michael@0 1015 *offsets++=sourceIndex;
michael@0 1016 sourceIndex+=4;
michael@0 1017 } else {
michael@0 1018 break;
michael@0 1019 }
michael@0 1020 } while(--count>0);
michael@0 1021 }
michael@0 1022
michael@0 1023 if(count==0) {
michael@0 1024 /* done with the loop for complete UChars */
michael@0 1025 c=0;
michael@0 1026 } else {
michael@0 1027 /* keep c for surrogate handling, trail will be set there */
michael@0 1028 length+=2*(count-1); /* one more byte pair was consumed than count decremented */
michael@0 1029 targetCapacity+=count;
michael@0 1030 }
michael@0 1031 }
michael@0 1032
michael@0 1033 if(c!=0) {
michael@0 1034 /*
michael@0 1035 * c is a surrogate, and
michael@0 1036 * - source or target too short
michael@0 1037 * - or the surrogate is unmatched
michael@0 1038 */
michael@0 1039 cnv->toUBytes[0]=(uint8_t)c;
michael@0 1040 cnv->toUBytes[1]=(uint8_t)(c>>8);
michael@0 1041 cnv->toULength=2;
michael@0 1042
michael@0 1043 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1044 if(length>=2) {
michael@0 1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
michael@0 1046 /* output the surrogate pair, will overflow (see conditions comment above) */
michael@0 1047 source+=2;
michael@0 1048 length-=2;
michael@0 1049 *target++=c;
michael@0 1050 if(offsets!=NULL) {
michael@0 1051 *offsets++=sourceIndex;
michael@0 1052 }
michael@0 1053 cnv->UCharErrorBuffer[0]=trail;
michael@0 1054 cnv->UCharErrorBufferLength=1;
michael@0 1055 cnv->toULength=0;
michael@0 1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1057 } else {
michael@0 1058 /* unmatched lead surrogate */
michael@0 1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1060 }
michael@0 1061 } else {
michael@0 1062 /* see if the trail surrogate is in the next buffer */
michael@0 1063 }
michael@0 1064 } else {
michael@0 1065 /* unmatched trail surrogate */
michael@0 1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
michael@0 1067 }
michael@0 1068 }
michael@0 1069
michael@0 1070 if(U_SUCCESS(*pErrorCode)) {
michael@0 1071 /* check for a remaining source byte */
michael@0 1072 if(length>0) {
michael@0 1073 if(targetCapacity==0) {
michael@0 1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
michael@0 1075 } else {
michael@0 1076 /* it must be length==1 because otherwise the above would have copied more */
michael@0 1077 cnv->toUBytes[cnv->toULength++]=*source++;
michael@0 1078 }
michael@0 1079 }
michael@0 1080 }
michael@0 1081
michael@0 1082 /* write back the updated pointers */
michael@0 1083 pArgs->source=(const char *)source;
michael@0 1084 pArgs->target=target;
michael@0 1085 pArgs->offsets=offsets;
michael@0 1086 }
michael@0 1087
michael@0 1088 static UChar32
michael@0 1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
michael@0 1090 const uint8_t *s, *sourceLimit;
michael@0 1091 UChar32 c;
michael@0 1092
michael@0 1093 if(pArgs->converter->mode<8) {
michael@0 1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
michael@0 1095 }
michael@0 1096
michael@0 1097 s=(const uint8_t *)pArgs->source;
michael@0 1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit;
michael@0 1099
michael@0 1100 if(s>=sourceLimit) {
michael@0 1101 /* no input */
michael@0 1102 *err=U_INDEX_OUTOFBOUNDS_ERROR;
michael@0 1103 return 0xffff;
michael@0 1104 }
michael@0 1105
michael@0 1106 if(s+2>sourceLimit) {
michael@0 1107 /* only one byte: truncated UChar */
michael@0 1108 pArgs->converter->toUBytes[0]=*s++;
michael@0 1109 pArgs->converter->toULength=1;
michael@0 1110 pArgs->source=(const char *)s;
michael@0 1111 *err = U_TRUNCATED_CHAR_FOUND;
michael@0 1112 return 0xffff;
michael@0 1113 }
michael@0 1114
michael@0 1115 /* get one UChar */
michael@0 1116 c=((UChar32)s[1]<<8)|*s;
michael@0 1117 s+=2;
michael@0 1118
michael@0 1119 /* check for a surrogate pair */
michael@0 1120 if(U_IS_SURROGATE(c)) {
michael@0 1121 if(U16_IS_SURROGATE_LEAD(c)) {
michael@0 1122 if(s+2<=sourceLimit) {
michael@0 1123 UChar trail;
michael@0 1124
michael@0 1125 /* get a second UChar and see if it is a trail surrogate */
michael@0 1126 trail=((UChar)s[1]<<8)|*s;
michael@0 1127 if(U16_IS_TRAIL(trail)) {
michael@0 1128 c=U16_GET_SUPPLEMENTARY(c, trail);
michael@0 1129 s+=2;
michael@0 1130 } else {
michael@0 1131 /* unmatched lead surrogate */
michael@0 1132 c=-2;
michael@0 1133 }
michael@0 1134 } else {
michael@0 1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
michael@0 1136 uint8_t *bytes=pArgs->converter->toUBytes;
michael@0 1137 s-=2;
michael@0 1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
michael@0 1139 do {
michael@0 1140 *bytes++=*s++;
michael@0 1141 } while(s<sourceLimit);
michael@0 1142
michael@0 1143 c=0xffff;
michael@0 1144 *err=U_TRUNCATED_CHAR_FOUND;
michael@0 1145 }
michael@0 1146 } else {
michael@0 1147 /* unmatched trail surrogate */
michael@0 1148 c=-2;
michael@0 1149 }
michael@0 1150
michael@0 1151 if(c<0) {
michael@0 1152 /* write the unmatched surrogate */
michael@0 1153 uint8_t *bytes=pArgs->converter->toUBytes;
michael@0 1154 pArgs->converter->toULength=2;
michael@0 1155 *bytes=*(s-2);
michael@0 1156 bytes[1]=*(s-1);
michael@0 1157
michael@0 1158 c=0xffff;
michael@0 1159 *err=U_ILLEGAL_CHAR_FOUND;
michael@0 1160 }
michael@0 1161 }
michael@0 1162
michael@0 1163 pArgs->source=(const char *)s;
michael@0 1164 return c;
michael@0 1165 }
michael@0 1166
michael@0 1167 static void
michael@0 1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
michael@0 1169 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 1170 /* reset toUnicode state */
michael@0 1171 if(UCNV_GET_VERSION(cnv)==0) {
michael@0 1172 cnv->mode=8; /* no BOM handling */
michael@0 1173 } else {
michael@0 1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
michael@0 1175 }
michael@0 1176 }
michael@0 1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
michael@0 1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
michael@0 1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
michael@0 1180 }
michael@0 1181 }
michael@0 1182
michael@0 1183 static void
michael@0 1184 _UTF16LEOpen(UConverter *cnv,
michael@0 1185 UConverterLoadArgs *pArgs,
michael@0 1186 UErrorCode *pErrorCode) {
michael@0 1187 if(UCNV_GET_VERSION(cnv)<=1) {
michael@0 1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH);
michael@0 1189 } else {
michael@0 1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1191 }
michael@0 1192 }
michael@0 1193
michael@0 1194 static const char *
michael@0 1195 _UTF16LEGetName(const UConverter *cnv) {
michael@0 1196 if(UCNV_GET_VERSION(cnv)==0) {
michael@0 1197 return "UTF-16LE";
michael@0 1198 } else {
michael@0 1199 return "UTF-16LE,version=1";
michael@0 1200 }
michael@0 1201 }
michael@0 1202
michael@0 1203 static const UConverterImpl _UTF16LEImpl={
michael@0 1204 UCNV_UTF16_LittleEndian,
michael@0 1205
michael@0 1206 NULL,
michael@0 1207 NULL,
michael@0 1208
michael@0 1209 _UTF16LEOpen,
michael@0 1210 NULL,
michael@0 1211 _UTF16LEReset,
michael@0 1212
michael@0 1213 _UTF16LEToUnicodeWithOffsets,
michael@0 1214 _UTF16LEToUnicodeWithOffsets,
michael@0 1215 _UTF16LEFromUnicodeWithOffsets,
michael@0 1216 _UTF16LEFromUnicodeWithOffsets,
michael@0 1217 _UTF16LEGetNextUChar,
michael@0 1218
michael@0 1219 NULL,
michael@0 1220 _UTF16LEGetName,
michael@0 1221 NULL,
michael@0 1222 NULL,
michael@0 1223 ucnv_getNonSurrogateUnicodeSet
michael@0 1224 };
michael@0 1225
michael@0 1226
michael@0 1227 static const UConverterStaticData _UTF16LEStaticData={
michael@0 1228 sizeof(UConverterStaticData),
michael@0 1229 "UTF-16LE",
michael@0 1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
michael@0 1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
michael@0 1232 0,
michael@0 1233 0,
michael@0 1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 1235 };
michael@0 1236
michael@0 1237
michael@0 1238 const UConverterSharedData _UTF16LEData={
michael@0 1239 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
michael@0 1241 0
michael@0 1242 };
michael@0 1243
michael@0 1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
michael@0 1245
michael@0 1246 /*
michael@0 1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
michael@0 1248 * accordingly.
michael@0 1249 * This is a simpler version of the UTF-32 converter, with
michael@0 1250 * fewer states for shorter BOMs.
michael@0 1251 *
michael@0 1252 * State values:
michael@0 1253 * 0 initial state
michael@0 1254 * 1 saw first byte
michael@0 1255 * 2..5 -
michael@0 1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
michael@0 1257 * 8 UTF-16BE mode
michael@0 1258 * 9 UTF-16LE mode
michael@0 1259 *
michael@0 1260 * During detection: state==number of initial bytes seen so far.
michael@0 1261 *
michael@0 1262 * On output, emit U+FEFF as the first code point.
michael@0 1263 *
michael@0 1264 * Variants:
michael@0 1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
michael@0 1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
michael@0 1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
michael@0 1268 */
michael@0 1269
michael@0 1270 static void
michael@0 1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
michael@0 1272 if(choice<=UCNV_RESET_TO_UNICODE) {
michael@0 1273 /* reset toUnicode: state=0 */
michael@0 1274 cnv->mode=0;
michael@0 1275 }
michael@0 1276 if(choice!=UCNV_RESET_TO_UNICODE) {
michael@0 1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */
michael@0 1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
michael@0 1279 }
michael@0 1280 }
michael@0 1281
michael@0 1282 static const UConverterSharedData _UTF16v2Data;
michael@0 1283
michael@0 1284 static void
michael@0 1285 _UTF16Open(UConverter *cnv,
michael@0 1286 UConverterLoadArgs *pArgs,
michael@0 1287 UErrorCode *pErrorCode) {
michael@0 1288 if(UCNV_GET_VERSION(cnv)<=2) {
michael@0 1289 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
michael@0 1290 /*
michael@0 1291 * Switch implementation, and switch the staticData that's different
michael@0 1292 * and was copied into the UConverter.
michael@0 1293 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
michael@0 1294 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
michael@0 1295 */
michael@0 1296 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
michael@0 1297 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
michael@0 1298 }
michael@0 1299 _UTF16Reset(cnv, UCNV_RESET_BOTH);
michael@0 1300 } else {
michael@0 1301 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
michael@0 1302 }
michael@0 1303 }
michael@0 1304
michael@0 1305 static const char *
michael@0 1306 _UTF16GetName(const UConverter *cnv) {
michael@0 1307 if(UCNV_GET_VERSION(cnv)==0) {
michael@0 1308 return "UTF-16";
michael@0 1309 } else if(UCNV_GET_VERSION(cnv)==1) {
michael@0 1310 return "UTF-16,version=1";
michael@0 1311 } else {
michael@0 1312 return "UTF-16,version=2";
michael@0 1313 }
michael@0 1314 }
michael@0 1315
michael@0 1316 const UConverterSharedData _UTF16Data;
michael@0 1317
michael@0 1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
michael@0 1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
michael@0 1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
michael@0 1321
michael@0 1322 static void
michael@0 1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
michael@0 1324 UErrorCode *pErrorCode) {
michael@0 1325 UConverter *cnv=pArgs->converter;
michael@0 1326 const char *source=pArgs->source;
michael@0 1327 const char *sourceLimit=pArgs->sourceLimit;
michael@0 1328 int32_t *offsets=pArgs->offsets;
michael@0 1329
michael@0 1330 int32_t state, offsetDelta;
michael@0 1331 uint8_t b;
michael@0 1332
michael@0 1333 state=cnv->mode;
michael@0 1334
michael@0 1335 /*
michael@0 1336 * If we detect a BOM in this buffer, then we must add the BOM size to the
michael@0 1337 * offsets because the actual converter function will not see and count the BOM.
michael@0 1338 * offsetDelta will have the number of the BOM bytes that are in the current buffer.
michael@0 1339 */
michael@0 1340 offsetDelta=0;
michael@0 1341
michael@0 1342 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
michael@0 1343 switch(state) {
michael@0 1344 case 0:
michael@0 1345 cnv->toUBytes[0]=(uint8_t)*source++;
michael@0 1346 cnv->toULength=1;
michael@0 1347 state=1;
michael@0 1348 break;
michael@0 1349 case 1:
michael@0 1350 /*
michael@0 1351 * Only inside this switch case can the state variable
michael@0 1352 * temporarily take two additional values:
michael@0 1353 * 6: BOM error, continue with BE
michael@0 1354 * 7: BOM error, continue with LE
michael@0 1355 */
michael@0 1356 b=*source;
michael@0 1357 if(cnv->toUBytes[0]==0xfe && b==0xff) {
michael@0 1358 if(IS_UTF16LE(cnv)) {
michael@0 1359 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
michael@0 1360 } else {
michael@0 1361 state=8; /* detect UTF-16BE */
michael@0 1362 }
michael@0 1363 } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
michael@0 1364 if(IS_UTF16BE(cnv)) {
michael@0 1365 state=6; /* illegal reverse BOM for Java "UnicodeBig" */
michael@0 1366 } else {
michael@0 1367 state=9; /* detect UTF-16LE */
michael@0 1368 }
michael@0 1369 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
michael@0 1370 state=6; /* illegal missing BOM for Java "Unicode" */
michael@0 1371 }
michael@0 1372 if(state>=8) {
michael@0 1373 /* BOM detected, consume it */
michael@0 1374 ++source;
michael@0 1375 cnv->toULength=0;
michael@0 1376 offsetDelta=(int32_t)(source-pArgs->source);
michael@0 1377 } else if(state<6) {
michael@0 1378 /* ok: no BOM, and not a reverse BOM */
michael@0 1379 if(source!=pArgs->source) {
michael@0 1380 /* reset the source for a correct first offset */
michael@0 1381 source=pArgs->source;
michael@0 1382 cnv->toULength=0;
michael@0 1383 }
michael@0 1384 if(IS_UTF16LE(cnv)) {
michael@0 1385 /* Make Java "UnicodeLittle" default to LE. */
michael@0 1386 state=9;
michael@0 1387 } else {
michael@0 1388 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
michael@0 1389 state=8;
michael@0 1390 }
michael@0 1391 } else {
michael@0 1392 /*
michael@0 1393 * error: missing BOM, or reverse BOM
michael@0 1394 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
michael@0 1395 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
michael@0 1396 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
michael@0 1397 */
michael@0 1398 /* report the non-BOM or reverse BOM as an illegal sequence */
michael@0 1399 cnv->toUBytes[1]=b;
michael@0 1400 cnv->toULength=2;
michael@0 1401 pArgs->source=source+1;
michael@0 1402 /* continue with conversion if the callback resets the error */
michael@0 1403 /*
michael@0 1404 * Make Java "Unicode" default to BE like standard UTF-16.
michael@0 1405 * Make Java "UnicodeBig" and "UnicodeLittle" default
michael@0 1406 * to their normal endiannesses.
michael@0 1407 */
michael@0 1408 cnv->mode=state+2;
michael@0 1409 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
michael@0 1410 return;
michael@0 1411 }
michael@0 1412 /* convert the rest of the stream */
michael@0 1413 cnv->mode=state;
michael@0 1414 continue;
michael@0 1415 case 8:
michael@0 1416 /* call UTF-16BE */
michael@0 1417 pArgs->source=source;
michael@0 1418 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 1419 source=pArgs->source;
michael@0 1420 break;
michael@0 1421 case 9:
michael@0 1422 /* call UTF-16LE */
michael@0 1423 pArgs->source=source;
michael@0 1424 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 1425 source=pArgs->source;
michael@0 1426 break;
michael@0 1427 default:
michael@0 1428 break; /* does not occur */
michael@0 1429 }
michael@0 1430 }
michael@0 1431
michael@0 1432 /* add BOM size to offsets - see comment at offsetDelta declaration */
michael@0 1433 if(offsets!=NULL && offsetDelta!=0) {
michael@0 1434 int32_t *offsetsLimit=pArgs->offsets;
michael@0 1435 while(offsets<offsetsLimit) {
michael@0 1436 *offsets++ += offsetDelta;
michael@0 1437 }
michael@0 1438 }
michael@0 1439
michael@0 1440 pArgs->source=source;
michael@0 1441
michael@0 1442 if(source==sourceLimit && pArgs->flush) {
michael@0 1443 /* handle truncated input */
michael@0 1444 switch(state) {
michael@0 1445 case 0:
michael@0 1446 break; /* no input at all, nothing to do */
michael@0 1447 case 8:
michael@0 1448 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 1449 break;
michael@0 1450 case 9:
michael@0 1451 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
michael@0 1452 break;
michael@0 1453 default:
michael@0 1454 /* 0<state<8: framework will report truncation, nothing to do here */
michael@0 1455 break;
michael@0 1456 }
michael@0 1457 }
michael@0 1458
michael@0 1459 cnv->mode=state;
michael@0 1460 }
michael@0 1461
michael@0 1462 static UChar32
michael@0 1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
michael@0 1464 UErrorCode *pErrorCode) {
michael@0 1465 switch(pArgs->converter->mode) {
michael@0 1466 case 8:
michael@0 1467 return _UTF16BEGetNextUChar(pArgs, pErrorCode);
michael@0 1468 case 9:
michael@0 1469 return _UTF16LEGetNextUChar(pArgs, pErrorCode);
michael@0 1470 default:
michael@0 1471 return UCNV_GET_NEXT_UCHAR_USE_TO_U;
michael@0 1472 }
michael@0 1473 }
michael@0 1474
michael@0 1475 static const UConverterImpl _UTF16Impl = {
michael@0 1476 UCNV_UTF16,
michael@0 1477
michael@0 1478 NULL,
michael@0 1479 NULL,
michael@0 1480
michael@0 1481 _UTF16Open,
michael@0 1482 NULL,
michael@0 1483 _UTF16Reset,
michael@0 1484
michael@0 1485 _UTF16ToUnicodeWithOffsets,
michael@0 1486 _UTF16ToUnicodeWithOffsets,
michael@0 1487 _UTF16PEFromUnicodeWithOffsets,
michael@0 1488 _UTF16PEFromUnicodeWithOffsets,
michael@0 1489 _UTF16GetNextUChar,
michael@0 1490
michael@0 1491 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
michael@0 1492 _UTF16GetName,
michael@0 1493 NULL,
michael@0 1494 NULL,
michael@0 1495 ucnv_getNonSurrogateUnicodeSet
michael@0 1496 };
michael@0 1497
michael@0 1498 static const UConverterStaticData _UTF16StaticData = {
michael@0 1499 sizeof(UConverterStaticData),
michael@0 1500 "UTF-16",
michael@0 1501 1204, /* CCSID for BOM sensitive UTF-16 */
michael@0 1502 UCNV_IBM, UCNV_UTF16, 2, 2,
michael@0 1503 #if U_IS_BIG_ENDIAN
michael@0 1504 { 0xff, 0xfd, 0, 0 }, 2,
michael@0 1505 #else
michael@0 1506 { 0xfd, 0xff, 0, 0 }, 2,
michael@0 1507 #endif
michael@0 1508 FALSE, FALSE,
michael@0 1509 0,
michael@0 1510 0,
michael@0 1511 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 1512 };
michael@0 1513
michael@0 1514 const UConverterSharedData _UTF16Data = {
michael@0 1515 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 1516 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
michael@0 1517 0
michael@0 1518 };
michael@0 1519
michael@0 1520 static const UConverterImpl _UTF16v2Impl = {
michael@0 1521 UCNV_UTF16,
michael@0 1522
michael@0 1523 NULL,
michael@0 1524 NULL,
michael@0 1525
michael@0 1526 _UTF16Open,
michael@0 1527 NULL,
michael@0 1528 _UTF16Reset,
michael@0 1529
michael@0 1530 _UTF16ToUnicodeWithOffsets,
michael@0 1531 _UTF16ToUnicodeWithOffsets,
michael@0 1532 _UTF16BEFromUnicodeWithOffsets,
michael@0 1533 _UTF16BEFromUnicodeWithOffsets,
michael@0 1534 _UTF16GetNextUChar,
michael@0 1535
michael@0 1536 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
michael@0 1537 _UTF16GetName,
michael@0 1538 NULL,
michael@0 1539 NULL,
michael@0 1540 ucnv_getNonSurrogateUnicodeSet
michael@0 1541 };
michael@0 1542
michael@0 1543 static const UConverterStaticData _UTF16v2StaticData = {
michael@0 1544 sizeof(UConverterStaticData),
michael@0 1545 "UTF-16,version=2",
michael@0 1546 1204, /* CCSID for BOM sensitive UTF-16 */
michael@0 1547 UCNV_IBM, UCNV_UTF16, 2, 2,
michael@0 1548 { 0xff, 0xfd, 0, 0 }, 2,
michael@0 1549 FALSE, FALSE,
michael@0 1550 0,
michael@0 1551 0,
michael@0 1552 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
michael@0 1553 };
michael@0 1554
michael@0 1555 static const UConverterSharedData _UTF16v2Data = {
michael@0 1556 sizeof(UConverterSharedData), ~((uint32_t) 0),
michael@0 1557 NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
michael@0 1558 0
michael@0 1559 };
michael@0 1560
michael@0 1561 #endif

mercurial