1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/icu/source/common/ucnv_u16.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1561 @@ 1.4 +/* 1.5 +********************************************************************** 1.6 +* Copyright (C) 2002-2010, International Business Machines 1.7 +* Corporation and others. All Rights Reserved. 1.8 +********************************************************************** 1.9 +* file name: ucnv_u16.c 1.10 +* encoding: US-ASCII 1.11 +* tab size: 8 (not used) 1.12 +* indentation:4 1.13 +* 1.14 +* created on: 2002jul01 1.15 +* created by: Markus W. Scherer 1.16 +* 1.17 +* UTF-16 converter implementation. Used to be in ucnv_utf.c. 1.18 +*/ 1.19 + 1.20 +#include "unicode/utypes.h" 1.21 + 1.22 +#if !UCONFIG_NO_CONVERSION 1.23 + 1.24 +#include "unicode/ucnv.h" 1.25 +#include "ucnv_bld.h" 1.26 +#include "ucnv_cnv.h" 1.27 +#include "cmemory.h" 1.28 + 1.29 +enum { 1.30 + UCNV_NEED_TO_WRITE_BOM=1 1.31 +}; 1.32 + 1.33 +/* 1.34 + * The UTF-16 toUnicode implementation is also used for the Java-specific 1.35 + * "with BOM" variants of UTF-16BE and UTF-16LE. 
1.36 + */ 1.37 +static void 1.38 +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.39 + UErrorCode *pErrorCode); 1.40 + 1.41 +/* UTF-16BE ----------------------------------------------------------------- */ 1.42 + 1.43 +#if U_IS_BIG_ENDIAN 1.44 +# define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets 1.45 +#else 1.46 +# define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets 1.47 +#endif 1.48 + 1.49 + 1.50 +static void 1.51 +_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.52 + UErrorCode *pErrorCode) { 1.53 + UConverter *cnv; 1.54 + const UChar *source; 1.55 + char *target; 1.56 + int32_t *offsets; 1.57 + 1.58 + uint32_t targetCapacity, length, sourceIndex; 1.59 + UChar c, trail; 1.60 + char overflow[4]; 1.61 + 1.62 + source=pArgs->source; 1.63 + length=(int32_t)(pArgs->sourceLimit-source); 1.64 + if(length<=0) { 1.65 + /* no input, nothing to do */ 1.66 + return; 1.67 + } 1.68 + 1.69 + cnv=pArgs->converter; 1.70 + 1.71 + /* write the BOM if necessary */ 1.72 + if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 1.73 + static const char bom[]={ (char)0xfe, (char)0xff }; 1.74 + ucnv_fromUWriteBytes(cnv, 1.75 + bom, 2, 1.76 + &pArgs->target, pArgs->targetLimit, 1.77 + &pArgs->offsets, -1, 1.78 + pErrorCode); 1.79 + cnv->fromUnicodeStatus=0; 1.80 + } 1.81 + 1.82 + target=pArgs->target; 1.83 + if(target >= pArgs->targetLimit) { 1.84 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.85 + return; 1.86 + } 1.87 + 1.88 + targetCapacity=(uint32_t)(pArgs->targetLimit-target); 1.89 + offsets=pArgs->offsets; 1.90 + sourceIndex=0; 1.91 + 1.92 + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 1.93 + 1.94 + if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 1.95 + /* the last buffer ended with a lead surrogate, output the surrogate pair */ 1.96 + ++source; 1.97 + --length; 1.98 + target[0]=(uint8_t)(c>>8); 1.99 + target[1]=(uint8_t)c; 1.100 + 
target[2]=(uint8_t)(trail>>8); 1.101 + target[3]=(uint8_t)trail; 1.102 + target+=4; 1.103 + targetCapacity-=4; 1.104 + if(offsets!=NULL) { 1.105 + *offsets++=-1; 1.106 + *offsets++=-1; 1.107 + *offsets++=-1; 1.108 + *offsets++=-1; 1.109 + } 1.110 + sourceIndex=1; 1.111 + cnv->fromUChar32=c=0; 1.112 + } 1.113 + 1.114 + if(c==0) { 1.115 + /* copy an even number of bytes for complete UChars */ 1.116 + uint32_t count=2*length; 1.117 + if(count>targetCapacity) { 1.118 + count=targetCapacity&~1; 1.119 + } 1.120 + /* count is even */ 1.121 + targetCapacity-=count; 1.122 + count>>=1; 1.123 + length-=count; 1.124 + 1.125 + if(offsets==NULL) { 1.126 + while(count>0) { 1.127 + c=*source++; 1.128 + if(U16_IS_SINGLE(c)) { 1.129 + target[0]=(uint8_t)(c>>8); 1.130 + target[1]=(uint8_t)c; 1.131 + target+=2; 1.132 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 1.133 + ++source; 1.134 + --count; 1.135 + target[0]=(uint8_t)(c>>8); 1.136 + target[1]=(uint8_t)c; 1.137 + target[2]=(uint8_t)(trail>>8); 1.138 + target[3]=(uint8_t)trail; 1.139 + target+=4; 1.140 + } else { 1.141 + break; 1.142 + } 1.143 + --count; 1.144 + } 1.145 + } else { 1.146 + while(count>0) { 1.147 + c=*source++; 1.148 + if(U16_IS_SINGLE(c)) { 1.149 + target[0]=(uint8_t)(c>>8); 1.150 + target[1]=(uint8_t)c; 1.151 + target+=2; 1.152 + *offsets++=sourceIndex; 1.153 + *offsets++=sourceIndex++; 1.154 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 1.155 + ++source; 1.156 + --count; 1.157 + target[0]=(uint8_t)(c>>8); 1.158 + target[1]=(uint8_t)c; 1.159 + target[2]=(uint8_t)(trail>>8); 1.160 + target[3]=(uint8_t)trail; 1.161 + target+=4; 1.162 + *offsets++=sourceIndex; 1.163 + *offsets++=sourceIndex; 1.164 + *offsets++=sourceIndex; 1.165 + *offsets++=sourceIndex; 1.166 + sourceIndex+=2; 1.167 + } else { 1.168 + break; 1.169 + } 1.170 + --count; 1.171 + } 1.172 + } 1.173 + 1.174 + if(count==0) { 1.175 + /* done with the loop for complete UChars */ 1.176 
+ if(length>0 && targetCapacity>0) { 1.177 + /* 1.178 + * there is more input and some target capacity - 1.179 + * it must be targetCapacity==1 because otherwise 1.180 + * the above would have copied more; 1.181 + * prepare for overflow output 1.182 + */ 1.183 + if(U16_IS_SINGLE(c=*source++)) { 1.184 + overflow[0]=(char)(c>>8); 1.185 + overflow[1]=(char)c; 1.186 + length=2; /* 2 bytes to output */ 1.187 + c=0; 1.188 + /* } else { keep c for surrogate handling, length will be set there */ 1.189 + } 1.190 + } else { 1.191 + length=0; 1.192 + c=0; 1.193 + } 1.194 + } else { 1.195 + /* keep c for surrogate handling, length will be set there */ 1.196 + targetCapacity+=2*count; 1.197 + } 1.198 + } else { 1.199 + length=0; /* from here on, length counts the bytes in overflow[] */ 1.200 + } 1.201 + 1.202 + if(c!=0) { 1.203 + /* 1.204 + * c is a surrogate, and 1.205 + * - source or target too short 1.206 + * - or the surrogate is unmatched 1.207 + */ 1.208 + length=0; 1.209 + if(U16_IS_SURROGATE_LEAD(c)) { 1.210 + if(source<pArgs->sourceLimit) { 1.211 + if(U16_IS_TRAIL(trail=*source)) { 1.212 + /* output the surrogate pair, will overflow (see conditions comment above) */ 1.213 + ++source; 1.214 + overflow[0]=(char)(c>>8); 1.215 + overflow[1]=(char)c; 1.216 + overflow[2]=(char)(trail>>8); 1.217 + overflow[3]=(char)trail; 1.218 + length=4; /* 4 bytes to output */ 1.219 + c=0; 1.220 + } else { 1.221 + /* unmatched lead surrogate */ 1.222 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.223 + } 1.224 + } else { 1.225 + /* see if the trail surrogate is in the next buffer */ 1.226 + } 1.227 + } else { 1.228 + /* unmatched trail surrogate */ 1.229 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.230 + } 1.231 + cnv->fromUChar32=c; 1.232 + } 1.233 + 1.234 + if(length>0) { 1.235 + /* output length bytes with overflow (length>targetCapacity>0) */ 1.236 + ucnv_fromUWriteBytes(cnv, 1.237 + overflow, length, 1.238 + (char **)&target, pArgs->targetLimit, 1.239 + &offsets, sourceIndex, 1.240 + pErrorCode); 
1.241 + targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 1.242 + } 1.243 + 1.244 + if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 1.245 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.246 + } 1.247 + 1.248 + /* write back the updated pointers */ 1.249 + pArgs->source=source; 1.250 + pArgs->target=(char *)target; 1.251 + pArgs->offsets=offsets; 1.252 +} 1.253 + 1.254 +static void 1.255 +_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.256 + UErrorCode *pErrorCode) { 1.257 + UConverter *cnv; 1.258 + const uint8_t *source; 1.259 + UChar *target; 1.260 + int32_t *offsets; 1.261 + 1.262 + uint32_t targetCapacity, length, count, sourceIndex; 1.263 + UChar c, trail; 1.264 + 1.265 + if(pArgs->converter->mode<8) { 1.266 + _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 1.267 + return; 1.268 + } 1.269 + 1.270 + cnv=pArgs->converter; 1.271 + source=(const uint8_t *)pArgs->source; 1.272 + length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 1.273 + if(length<=0 && cnv->toUnicodeStatus==0) { 1.274 + /* no input, nothing to do */ 1.275 + return; 1.276 + } 1.277 + 1.278 + target=pArgs->target; 1.279 + if(target >= pArgs->targetLimit) { 1.280 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.281 + return; 1.282 + } 1.283 + 1.284 + targetCapacity=(uint32_t)(pArgs->targetLimit-target); 1.285 + offsets=pArgs->offsets; 1.286 + sourceIndex=0; 1.287 + c=0; 1.288 + 1.289 + /* complete a partial UChar or pair from the last call */ 1.290 + if(cnv->toUnicodeStatus!=0) { 1.291 + /* 1.292 + * special case: single byte from a previous buffer, 1.293 + * where the byte turned out not to belong to a trail surrogate 1.294 + * and the preceding, unmatched lead surrogate was put into toUBytes[] 1.295 + * for error handling 1.296 + */ 1.297 + cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 1.298 + cnv->toULength=1; 1.299 + cnv->toUnicodeStatus=0; 1.300 + } 1.301 + if((count=cnv->toULength)!=0) { 1.302 + uint8_t *p=cnv->toUBytes; 1.303 + do { 1.304 + 
p[count++]=*source++; 1.305 + ++sourceIndex; 1.306 + --length; 1.307 + if(count==2) { 1.308 + c=((UChar)p[0]<<8)|p[1]; 1.309 + if(U16_IS_SINGLE(c)) { 1.310 + /* output the BMP code point */ 1.311 + *target++=c; 1.312 + if(offsets!=NULL) { 1.313 + *offsets++=-1; 1.314 + } 1.315 + --targetCapacity; 1.316 + count=0; 1.317 + c=0; 1.318 + break; 1.319 + } else if(U16_IS_SURROGATE_LEAD(c)) { 1.320 + /* continue collecting bytes for the trail surrogate */ 1.321 + c=0; /* avoid unnecessary surrogate handling below */ 1.322 + } else { 1.323 + /* fall through to error handling for an unmatched trail surrogate */ 1.324 + break; 1.325 + } 1.326 + } else if(count==4) { 1.327 + c=((UChar)p[0]<<8)|p[1]; 1.328 + trail=((UChar)p[2]<<8)|p[3]; 1.329 + if(U16_IS_TRAIL(trail)) { 1.330 + /* output the surrogate pair */ 1.331 + *target++=c; 1.332 + if(targetCapacity>=2) { 1.333 + *target++=trail; 1.334 + if(offsets!=NULL) { 1.335 + *offsets++=-1; 1.336 + *offsets++=-1; 1.337 + } 1.338 + targetCapacity-=2; 1.339 + } else /* targetCapacity==1 */ { 1.340 + targetCapacity=0; 1.341 + cnv->UCharErrorBuffer[0]=trail; 1.342 + cnv->UCharErrorBufferLength=1; 1.343 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.344 + } 1.345 + count=0; 1.346 + c=0; 1.347 + break; 1.348 + } else { 1.349 + /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 1.350 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.351 + 1.352 + /* back out reading the code unit after it */ 1.353 + if(((const uint8_t *)pArgs->source-source)>=2) { 1.354 + source-=2; 1.355 + } else { 1.356 + /* 1.357 + * if the trail unit's first byte was in a previous buffer, then 1.358 + * we need to put it into a special place because toUBytes[] will be 1.359 + * used for the lead unit's bytes 1.360 + */ 1.361 + cnv->toUnicodeStatus=0x100|p[2]; 1.362 + --source; 1.363 + } 1.364 + cnv->toULength=2; 1.365 + 1.366 + /* write back the updated pointers */ 1.367 + pArgs->source=(const char *)source; 1.368 + pArgs->target=target; 1.369 + 
pArgs->offsets=offsets; 1.370 + return; 1.371 + } 1.372 + } 1.373 + } while(length>0); 1.374 + cnv->toULength=(int8_t)count; 1.375 + } 1.376 + 1.377 + /* copy an even number of bytes for complete UChars */ 1.378 + count=2*targetCapacity; 1.379 + if(count>length) { 1.380 + count=length&~1; 1.381 + } 1.382 + if(c==0 && count>0) { 1.383 + length-=count; 1.384 + count>>=1; 1.385 + targetCapacity-=count; 1.386 + if(offsets==NULL) { 1.387 + do { 1.388 + c=((UChar)source[0]<<8)|source[1]; 1.389 + source+=2; 1.390 + if(U16_IS_SINGLE(c)) { 1.391 + *target++=c; 1.392 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 1.393 + U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 1.394 + ) { 1.395 + source+=2; 1.396 + --count; 1.397 + *target++=c; 1.398 + *target++=trail; 1.399 + } else { 1.400 + break; 1.401 + } 1.402 + } while(--count>0); 1.403 + } else { 1.404 + do { 1.405 + c=((UChar)source[0]<<8)|source[1]; 1.406 + source+=2; 1.407 + if(U16_IS_SINGLE(c)) { 1.408 + *target++=c; 1.409 + *offsets++=sourceIndex; 1.410 + sourceIndex+=2; 1.411 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 1.412 + U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 1.413 + ) { 1.414 + source+=2; 1.415 + --count; 1.416 + *target++=c; 1.417 + *target++=trail; 1.418 + *offsets++=sourceIndex; 1.419 + *offsets++=sourceIndex; 1.420 + sourceIndex+=4; 1.421 + } else { 1.422 + break; 1.423 + } 1.424 + } while(--count>0); 1.425 + } 1.426 + 1.427 + if(count==0) { 1.428 + /* done with the loop for complete UChars */ 1.429 + c=0; 1.430 + } else { 1.431 + /* keep c for surrogate handling, trail will be set there */ 1.432 + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 1.433 + targetCapacity+=count; 1.434 + } 1.435 + } 1.436 + 1.437 + if(c!=0) { 1.438 + /* 1.439 + * c is a surrogate, and 1.440 + * - source or target too short 1.441 + * - or the surrogate is unmatched 1.442 + */ 1.443 + cnv->toUBytes[0]=(uint8_t)(c>>8); 1.444 + cnv->toUBytes[1]=(uint8_t)c; 1.445 + 
cnv->toULength=2; 1.446 + 1.447 + if(U16_IS_SURROGATE_LEAD(c)) { 1.448 + if(length>=2) { 1.449 + if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { 1.450 + /* output the surrogate pair, will overflow (see conditions comment above) */ 1.451 + source+=2; 1.452 + length-=2; 1.453 + *target++=c; 1.454 + if(offsets!=NULL) { 1.455 + *offsets++=sourceIndex; 1.456 + } 1.457 + cnv->UCharErrorBuffer[0]=trail; 1.458 + cnv->UCharErrorBufferLength=1; 1.459 + cnv->toULength=0; 1.460 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.461 + } else { 1.462 + /* unmatched lead surrogate */ 1.463 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.464 + } 1.465 + } else { 1.466 + /* see if the trail surrogate is in the next buffer */ 1.467 + } 1.468 + } else { 1.469 + /* unmatched trail surrogate */ 1.470 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.471 + } 1.472 + } 1.473 + 1.474 + if(U_SUCCESS(*pErrorCode)) { 1.475 + /* check for a remaining source byte */ 1.476 + if(length>0) { 1.477 + if(targetCapacity==0) { 1.478 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.479 + } else { 1.480 + /* it must be length==1 because otherwise the above would have copied more */ 1.481 + cnv->toUBytes[cnv->toULength++]=*source++; 1.482 + } 1.483 + } 1.484 + } 1.485 + 1.486 + /* write back the updated pointers */ 1.487 + pArgs->source=(const char *)source; 1.488 + pArgs->target=target; 1.489 + pArgs->offsets=offsets; 1.490 +} 1.491 + 1.492 +static UChar32 1.493 +_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 1.494 + const uint8_t *s, *sourceLimit; 1.495 + UChar32 c; 1.496 + 1.497 + if(pArgs->converter->mode<8) { 1.498 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.499 + } 1.500 + 1.501 + s=(const uint8_t *)pArgs->source; 1.502 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.503 + 1.504 + if(s>=sourceLimit) { 1.505 + /* no input */ 1.506 + *err=U_INDEX_OUTOFBOUNDS_ERROR; 1.507 + return 0xffff; 1.508 + } 1.509 + 1.510 + if(s+2>sourceLimit) { 1.511 + /* only one byte: truncated UChar */ 1.512 + 
pArgs->converter->toUBytes[0]=*s++; 1.513 + pArgs->converter->toULength=1; 1.514 + pArgs->source=(const char *)s; 1.515 + *err = U_TRUNCATED_CHAR_FOUND; 1.516 + return 0xffff; 1.517 + } 1.518 + 1.519 + /* get one UChar */ 1.520 + c=((UChar32)*s<<8)|s[1]; 1.521 + s+=2; 1.522 + 1.523 + /* check for a surrogate pair */ 1.524 + if(U_IS_SURROGATE(c)) { 1.525 + if(U16_IS_SURROGATE_LEAD(c)) { 1.526 + if(s+2<=sourceLimit) { 1.527 + UChar trail; 1.528 + 1.529 + /* get a second UChar and see if it is a trail surrogate */ 1.530 + trail=((UChar)*s<<8)|s[1]; 1.531 + if(U16_IS_TRAIL(trail)) { 1.532 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.533 + s+=2; 1.534 + } else { 1.535 + /* unmatched lead surrogate */ 1.536 + c=-2; 1.537 + } 1.538 + } else { 1.539 + /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 1.540 + uint8_t *bytes=pArgs->converter->toUBytes; 1.541 + s-=2; 1.542 + pArgs->converter->toULength=(int8_t)(sourceLimit-s); 1.543 + do { 1.544 + *bytes++=*s++; 1.545 + } while(s<sourceLimit); 1.546 + 1.547 + c=0xffff; 1.548 + *err=U_TRUNCATED_CHAR_FOUND; 1.549 + } 1.550 + } else { 1.551 + /* unmatched trail surrogate */ 1.552 + c=-2; 1.553 + } 1.554 + 1.555 + if(c<0) { 1.556 + /* write the unmatched surrogate */ 1.557 + uint8_t *bytes=pArgs->converter->toUBytes; 1.558 + pArgs->converter->toULength=2; 1.559 + *bytes=*(s-2); 1.560 + bytes[1]=*(s-1); 1.561 + 1.562 + c=0xffff; 1.563 + *err=U_ILLEGAL_CHAR_FOUND; 1.564 + } 1.565 + } 1.566 + 1.567 + pArgs->source=(const char *)s; 1.568 + return c; 1.569 +} 1.570 + 1.571 +static void 1.572 +_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { 1.573 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.574 + /* reset toUnicode state */ 1.575 + if(UCNV_GET_VERSION(cnv)==0) { 1.576 + cnv->mode=8; /* no BOM handling */ 1.577 + } else { 1.578 + cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ 1.579 + } 1.580 + } 1.581 + if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 1.582 + /* reset 
fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ 1.583 + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1.584 + } 1.585 +} 1.586 + 1.587 +static void 1.588 +_UTF16BEOpen(UConverter *cnv, 1.589 + UConverterLoadArgs *pArgs, 1.590 + UErrorCode *pErrorCode) { 1.591 + if(UCNV_GET_VERSION(cnv)<=1) { 1.592 + _UTF16BEReset(cnv, UCNV_RESET_BOTH); 1.593 + } else { 1.594 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.595 + } 1.596 +} 1.597 + 1.598 +static const char * 1.599 +_UTF16BEGetName(const UConverter *cnv) { 1.600 + if(UCNV_GET_VERSION(cnv)==0) { 1.601 + return "UTF-16BE"; 1.602 + } else { 1.603 + return "UTF-16BE,version=1"; 1.604 + } 1.605 +} 1.606 + 1.607 +static const UConverterImpl _UTF16BEImpl={ 1.608 + UCNV_UTF16_BigEndian, 1.609 + 1.610 + NULL, 1.611 + NULL, 1.612 + 1.613 + _UTF16BEOpen, 1.614 + NULL, 1.615 + _UTF16BEReset, 1.616 + 1.617 + _UTF16BEToUnicodeWithOffsets, 1.618 + _UTF16BEToUnicodeWithOffsets, 1.619 + _UTF16BEFromUnicodeWithOffsets, 1.620 + _UTF16BEFromUnicodeWithOffsets, 1.621 + _UTF16BEGetNextUChar, 1.622 + 1.623 + NULL, 1.624 + _UTF16BEGetName, 1.625 + NULL, 1.626 + NULL, 1.627 + ucnv_getNonSurrogateUnicodeSet 1.628 +}; 1.629 + 1.630 +static const UConverterStaticData _UTF16BEStaticData={ 1.631 + sizeof(UConverterStaticData), 1.632 + "UTF-16BE", 1.633 + 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, 1.634 + { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, 1.635 + 0, 1.636 + 0, 1.637 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.638 +}; 1.639 + 1.640 + 1.641 +const UConverterSharedData _UTF16BEData={ 1.642 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.643 + NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, 1.644 + 0 1.645 +}; 1.646 + 1.647 +/* UTF-16LE ----------------------------------------------------------------- */ 1.648 + 1.649 +static void 1.650 +_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 1.651 + UErrorCode *pErrorCode) { 1.652 + UConverter *cnv; 1.653 + const UChar *source; 1.654 + char 
*target; 1.655 + int32_t *offsets; 1.656 + 1.657 + uint32_t targetCapacity, length, sourceIndex; 1.658 + UChar c, trail; 1.659 + char overflow[4]; 1.660 + 1.661 + source=pArgs->source; 1.662 + length=(int32_t)(pArgs->sourceLimit-source); 1.663 + if(length<=0) { 1.664 + /* no input, nothing to do */ 1.665 + return; 1.666 + } 1.667 + 1.668 + cnv=pArgs->converter; 1.669 + 1.670 + /* write the BOM if necessary */ 1.671 + if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 1.672 + static const char bom[]={ (char)0xff, (char)0xfe }; 1.673 + ucnv_fromUWriteBytes(cnv, 1.674 + bom, 2, 1.675 + &pArgs->target, pArgs->targetLimit, 1.676 + &pArgs->offsets, -1, 1.677 + pErrorCode); 1.678 + cnv->fromUnicodeStatus=0; 1.679 + } 1.680 + 1.681 + target=pArgs->target; 1.682 + if(target >= pArgs->targetLimit) { 1.683 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.684 + return; 1.685 + } 1.686 + 1.687 + targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 1.688 + offsets=pArgs->offsets; 1.689 + sourceIndex=0; 1.690 + 1.691 + /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 1.692 + 1.693 + if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 1.694 + /* the last buffer ended with a lead surrogate, output the surrogate pair */ 1.695 + ++source; 1.696 + --length; 1.697 + target[0]=(uint8_t)c; 1.698 + target[1]=(uint8_t)(c>>8); 1.699 + target[2]=(uint8_t)trail; 1.700 + target[3]=(uint8_t)(trail>>8); 1.701 + target+=4; 1.702 + targetCapacity-=4; 1.703 + if(offsets!=NULL) { 1.704 + *offsets++=-1; 1.705 + *offsets++=-1; 1.706 + *offsets++=-1; 1.707 + *offsets++=-1; 1.708 + } 1.709 + sourceIndex=1; 1.710 + cnv->fromUChar32=c=0; 1.711 + } 1.712 + 1.713 + if(c==0) { 1.714 + /* copy an even number of bytes for complete UChars */ 1.715 + uint32_t count=2*length; 1.716 + if(count>targetCapacity) { 1.717 + count=targetCapacity&~1; 1.718 + } 1.719 + /* count is even */ 1.720 + targetCapacity-=count; 1.721 + count>>=1; 1.722 
+ length-=count; 1.723 + 1.724 + if(offsets==NULL) { 1.725 + while(count>0) { 1.726 + c=*source++; 1.727 + if(U16_IS_SINGLE(c)) { 1.728 + target[0]=(uint8_t)c; 1.729 + target[1]=(uint8_t)(c>>8); 1.730 + target+=2; 1.731 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 1.732 + ++source; 1.733 + --count; 1.734 + target[0]=(uint8_t)c; 1.735 + target[1]=(uint8_t)(c>>8); 1.736 + target[2]=(uint8_t)trail; 1.737 + target[3]=(uint8_t)(trail>>8); 1.738 + target+=4; 1.739 + } else { 1.740 + break; 1.741 + } 1.742 + --count; 1.743 + } 1.744 + } else { 1.745 + while(count>0) { 1.746 + c=*source++; 1.747 + if(U16_IS_SINGLE(c)) { 1.748 + target[0]=(uint8_t)c; 1.749 + target[1]=(uint8_t)(c>>8); 1.750 + target+=2; 1.751 + *offsets++=sourceIndex; 1.752 + *offsets++=sourceIndex++; 1.753 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 1.754 + ++source; 1.755 + --count; 1.756 + target[0]=(uint8_t)c; 1.757 + target[1]=(uint8_t)(c>>8); 1.758 + target[2]=(uint8_t)trail; 1.759 + target[3]=(uint8_t)(trail>>8); 1.760 + target+=4; 1.761 + *offsets++=sourceIndex; 1.762 + *offsets++=sourceIndex; 1.763 + *offsets++=sourceIndex; 1.764 + *offsets++=sourceIndex; 1.765 + sourceIndex+=2; 1.766 + } else { 1.767 + break; 1.768 + } 1.769 + --count; 1.770 + } 1.771 + } 1.772 + 1.773 + if(count==0) { 1.774 + /* done with the loop for complete UChars */ 1.775 + if(length>0 && targetCapacity>0) { 1.776 + /* 1.777 + * there is more input and some target capacity - 1.778 + * it must be targetCapacity==1 because otherwise 1.779 + * the above would have copied more; 1.780 + * prepare for overflow output 1.781 + */ 1.782 + if(U16_IS_SINGLE(c=*source++)) { 1.783 + overflow[0]=(char)c; 1.784 + overflow[1]=(char)(c>>8); 1.785 + length=2; /* 2 bytes to output */ 1.786 + c=0; 1.787 + /* } else { keep c for surrogate handling, length will be set there */ 1.788 + } 1.789 + } else { 1.790 + length=0; 1.791 + c=0; 1.792 + } 1.793 + } else { 1.794 + /* 
keep c for surrogate handling, length will be set there */ 1.795 + targetCapacity+=2*count; 1.796 + } 1.797 + } else { 1.798 + length=0; /* from here on, length counts the bytes in overflow[] */ 1.799 + } 1.800 + 1.801 + if(c!=0) { 1.802 + /* 1.803 + * c is a surrogate, and 1.804 + * - source or target too short 1.805 + * - or the surrogate is unmatched 1.806 + */ 1.807 + length=0; 1.808 + if(U16_IS_SURROGATE_LEAD(c)) { 1.809 + if(source<pArgs->sourceLimit) { 1.810 + if(U16_IS_TRAIL(trail=*source)) { 1.811 + /* output the surrogate pair, will overflow (see conditions comment above) */ 1.812 + ++source; 1.813 + overflow[0]=(char)c; 1.814 + overflow[1]=(char)(c>>8); 1.815 + overflow[2]=(char)trail; 1.816 + overflow[3]=(char)(trail>>8); 1.817 + length=4; /* 4 bytes to output */ 1.818 + c=0; 1.819 + } else { 1.820 + /* unmatched lead surrogate */ 1.821 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.822 + } 1.823 + } else { 1.824 + /* see if the trail surrogate is in the next buffer */ 1.825 + } 1.826 + } else { 1.827 + /* unmatched trail surrogate */ 1.828 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.829 + } 1.830 + cnv->fromUChar32=c; 1.831 + } 1.832 + 1.833 + if(length>0) { 1.834 + /* output length bytes with overflow (length>targetCapacity>0) */ 1.835 + ucnv_fromUWriteBytes(cnv, 1.836 + overflow, length, 1.837 + &target, pArgs->targetLimit, 1.838 + &offsets, sourceIndex, 1.839 + pErrorCode); 1.840 + targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 1.841 + } 1.842 + 1.843 + if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 1.844 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.845 + } 1.846 + 1.847 + /* write back the updated pointers */ 1.848 + pArgs->source=source; 1.849 + pArgs->target=target; 1.850 + pArgs->offsets=offsets; 1.851 +} 1.852 + 1.853 +static void 1.854 +_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.855 + UErrorCode *pErrorCode) { 1.856 + UConverter *cnv; 1.857 + const uint8_t *source; 1.858 + UChar *target; 1.859 + 
int32_t *offsets; 1.860 + 1.861 + uint32_t targetCapacity, length, count, sourceIndex; 1.862 + UChar c, trail; 1.863 + 1.864 + if(pArgs->converter->mode<8) { 1.865 + _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 1.866 + return; 1.867 + } 1.868 + 1.869 + cnv=pArgs->converter; 1.870 + source=(const uint8_t *)pArgs->source; 1.871 + length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 1.872 + if(length<=0 && cnv->toUnicodeStatus==0) { 1.873 + /* no input, nothing to do */ 1.874 + return; 1.875 + } 1.876 + 1.877 + target=pArgs->target; 1.878 + if(target >= pArgs->targetLimit) { 1.879 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.880 + return; 1.881 + } 1.882 + 1.883 + targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 1.884 + offsets=pArgs->offsets; 1.885 + sourceIndex=0; 1.886 + c=0; 1.887 + 1.888 + /* complete a partial UChar or pair from the last call */ 1.889 + if(cnv->toUnicodeStatus!=0) { 1.890 + /* 1.891 + * special case: single byte from a previous buffer, 1.892 + * where the byte turned out not to belong to a trail surrogate 1.893 + * and the preceding, unmatched lead surrogate was put into toUBytes[] 1.894 + * for error handling 1.895 + */ 1.896 + cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 1.897 + cnv->toULength=1; 1.898 + cnv->toUnicodeStatus=0; 1.899 + } 1.900 + if((count=cnv->toULength)!=0) { 1.901 + uint8_t *p=cnv->toUBytes; 1.902 + do { 1.903 + p[count++]=*source++; 1.904 + ++sourceIndex; 1.905 + --length; 1.906 + if(count==2) { 1.907 + c=((UChar)p[1]<<8)|p[0]; 1.908 + if(U16_IS_SINGLE(c)) { 1.909 + /* output the BMP code point */ 1.910 + *target++=c; 1.911 + if(offsets!=NULL) { 1.912 + *offsets++=-1; 1.913 + } 1.914 + --targetCapacity; 1.915 + count=0; 1.916 + c=0; 1.917 + break; 1.918 + } else if(U16_IS_SURROGATE_LEAD(c)) { 1.919 + /* continue collecting bytes for the trail surrogate */ 1.920 + c=0; /* avoid unnecessary surrogate handling below */ 1.921 + } else { 1.922 + /* fall through to error handling for an unmatched trail 
surrogate */ 1.923 + break; 1.924 + } 1.925 + } else if(count==4) { 1.926 + c=((UChar)p[1]<<8)|p[0]; 1.927 + trail=((UChar)p[3]<<8)|p[2]; 1.928 + if(U16_IS_TRAIL(trail)) { 1.929 + /* output the surrogate pair */ 1.930 + *target++=c; 1.931 + if(targetCapacity>=2) { 1.932 + *target++=trail; 1.933 + if(offsets!=NULL) { 1.934 + *offsets++=-1; 1.935 + *offsets++=-1; 1.936 + } 1.937 + targetCapacity-=2; 1.938 + } else /* targetCapacity==1 */ { 1.939 + targetCapacity=0; 1.940 + cnv->UCharErrorBuffer[0]=trail; 1.941 + cnv->UCharErrorBufferLength=1; 1.942 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.943 + } 1.944 + count=0; 1.945 + c=0; 1.946 + break; 1.947 + } else { 1.948 + /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 1.949 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.950 + 1.951 + /* back out reading the code unit after it */ 1.952 + if(((const uint8_t *)pArgs->source-source)>=2) { 1.953 + source-=2; 1.954 + } else { 1.955 + /* 1.956 + * if the trail unit's first byte was in a previous buffer, then 1.957 + * we need to put it into a special place because toUBytes[] will be 1.958 + * used for the lead unit's bytes 1.959 + */ 1.960 + cnv->toUnicodeStatus=0x100|p[2]; 1.961 + --source; 1.962 + } 1.963 + cnv->toULength=2; 1.964 + 1.965 + /* write back the updated pointers */ 1.966 + pArgs->source=(const char *)source; 1.967 + pArgs->target=target; 1.968 + pArgs->offsets=offsets; 1.969 + return; 1.970 + } 1.971 + } 1.972 + } while(length>0); 1.973 + cnv->toULength=(int8_t)count; 1.974 + } 1.975 + 1.976 + /* copy an even number of bytes for complete UChars */ 1.977 + count=2*targetCapacity; 1.978 + if(count>length) { 1.979 + count=length&~1; 1.980 + } 1.981 + if(c==0 && count>0) { 1.982 + length-=count; 1.983 + count>>=1; 1.984 + targetCapacity-=count; 1.985 + if(offsets==NULL) { 1.986 + do { 1.987 + c=((UChar)source[1]<<8)|source[0]; 1.988 + source+=2; 1.989 + if(U16_IS_SINGLE(c)) { 1.990 + *target++=c; 1.991 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 
1.992 + U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 1.993 + ) { 1.994 + source+=2; 1.995 + --count; 1.996 + *target++=c; 1.997 + *target++=trail; 1.998 + } else { 1.999 + break; 1.1000 + } 1.1001 + } while(--count>0); 1.1002 + } else { 1.1003 + do { 1.1004 + c=((UChar)source[1]<<8)|source[0]; 1.1005 + source+=2; 1.1006 + if(U16_IS_SINGLE(c)) { 1.1007 + *target++=c; 1.1008 + *offsets++=sourceIndex; 1.1009 + sourceIndex+=2; 1.1010 + } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 1.1011 + U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 1.1012 + ) { 1.1013 + source+=2; 1.1014 + --count; 1.1015 + *target++=c; 1.1016 + *target++=trail; 1.1017 + *offsets++=sourceIndex; 1.1018 + *offsets++=sourceIndex; 1.1019 + sourceIndex+=4; 1.1020 + } else { 1.1021 + break; 1.1022 + } 1.1023 + } while(--count>0); 1.1024 + } 1.1025 + 1.1026 + if(count==0) { 1.1027 + /* done with the loop for complete UChars */ 1.1028 + c=0; 1.1029 + } else { 1.1030 + /* keep c for surrogate handling, trail will be set there */ 1.1031 + length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 1.1032 + targetCapacity+=count; 1.1033 + } 1.1034 + } 1.1035 + 1.1036 + if(c!=0) { 1.1037 + /* 1.1038 + * c is a surrogate, and 1.1039 + * - source or target too short 1.1040 + * - or the surrogate is unmatched 1.1041 + */ 1.1042 + cnv->toUBytes[0]=(uint8_t)c; 1.1043 + cnv->toUBytes[1]=(uint8_t)(c>>8); 1.1044 + cnv->toULength=2; 1.1045 + 1.1046 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1047 + if(length>=2) { 1.1048 + if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { 1.1049 + /* output the surrogate pair, will overflow (see conditions comment above) */ 1.1050 + source+=2; 1.1051 + length-=2; 1.1052 + *target++=c; 1.1053 + if(offsets!=NULL) { 1.1054 + *offsets++=sourceIndex; 1.1055 + } 1.1056 + cnv->UCharErrorBuffer[0]=trail; 1.1057 + cnv->UCharErrorBufferLength=1; 1.1058 + cnv->toULength=0; 1.1059 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1060 + } else { 1.1061 + /* unmatched 
lead surrogate */ 1.1062 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1063 + } 1.1064 + } else { 1.1065 + /* see if the trail surrogate is in the next buffer */ 1.1066 + } 1.1067 + } else { 1.1068 + /* unmatched trail surrogate */ 1.1069 + *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1.1070 + } 1.1071 + } 1.1072 + 1.1073 + if(U_SUCCESS(*pErrorCode)) { 1.1074 + /* check for a remaining source byte */ 1.1075 + if(length>0) { 1.1076 + if(targetCapacity==0) { 1.1077 + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1.1078 + } else { 1.1079 + /* it must be length==1 because otherwise the above would have copied more */ 1.1080 + cnv->toUBytes[cnv->toULength++]=*source++; 1.1081 + } 1.1082 + } 1.1083 + } 1.1084 + 1.1085 + /* write back the updated pointers */ 1.1086 + pArgs->source=(const char *)source; 1.1087 + pArgs->target=target; 1.1088 + pArgs->offsets=offsets; 1.1089 +} 1.1090 + /* Decodes one code point from the little-endian UTF-16 bytes at pArgs->source. While converter->mode is below 8 it returns UCNV_GET_NEXT_UCHAR_USE_TO_U without reading any input. Otherwise it returns the code point and stores the advanced position in pArgs->source. On failure it returns 0xffff and sets *err: U_INDEX_OUTOFBOUNDS_ERROR when there is no input (pArgs->source untouched), U_TRUNCATED_CHAR_FOUND when a code unit or surrogate pair is cut off by sourceLimit, U_ILLEGAL_CHAR_FOUND for an unmatched surrogate. In the last two cases the offending bytes are copied to converter->toUBytes and counted in converter->toULength. */ 1.1091 +static UChar32 1.1092 +_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 1.1093 + const uint8_t *s, *sourceLimit; 1.1094 + UChar32 c; 1.1095 + 1.1096 + if(pArgs->converter->mode<8) { /* modes below 8 are BOM-detection states, see the state table later in this file */ 1.1097 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.1098 + } 1.1099 + 1.1100 + s=(const uint8_t *)pArgs->source; 1.1101 + sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1.1102 + 1.1103 + if(s>=sourceLimit) { 1.1104 + /* no input */ 1.1105 + *err=U_INDEX_OUTOFBOUNDS_ERROR; 1.1106 + return 0xffff; 1.1107 + } 1.1108 + 1.1109 + if(s+2>sourceLimit) { 1.1110 + /* only one byte: truncated UChar */ 1.1111 + pArgs->converter->toUBytes[0]=*s++; 1.1112 + pArgs->converter->toULength=1; 1.1113 + pArgs->source=(const char *)s; 1.1114 + *err = U_TRUNCATED_CHAR_FOUND; 1.1115 + return 0xffff; 1.1116 + } 1.1117 + 1.1118 + /* get one UChar: the low byte comes first, the high byte second */ 1.1119 + c=((UChar32)s[1]<<8)|*s; 1.1120 + s+=2; 1.1121 + 1.1122 + /* check for a surrogate pair */ 1.1123 + if(U_IS_SURROGATE(c)) { 1.1124 + if(U16_IS_SURROGATE_LEAD(c)) { 1.1125 + if(s+2<=sourceLimit) { 1.1126 + UChar trail; 1.1127 + 1.1128 + /* get a second UChar and see if it is a trail
surrogate */ 1.1129 + trail=((UChar)s[1]<<8)|*s; 1.1130 + if(U16_IS_TRAIL(trail)) { 1.1131 + c=U16_GET_SUPPLEMENTARY(c, trail); 1.1132 + s+=2; 1.1133 + } else { 1.1134 + /* unmatched lead surrogate; the negative value is picked up by the c<0 check below */ 1.1135 + c=-2; 1.1136 + } 1.1137 + } else { 1.1138 + /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 1.1139 + uint8_t *bytes=pArgs->converter->toUBytes; 1.1140 + s-=2; 1.1141 + pArgs->converter->toULength=(int8_t)(sourceLimit-s); 1.1142 + do { 1.1143 + *bytes++=*s++; 1.1144 + } while(s<sourceLimit); 1.1145 + 1.1146 + c=0xffff; 1.1147 + *err=U_TRUNCATED_CHAR_FOUND; 1.1148 + } 1.1149 + } else { 1.1150 + /* unmatched trail surrogate; the negative value is picked up by the c<0 check below */ 1.1151 + c=-2; 1.1152 + } 1.1153 + 1.1154 + if(c<0) { 1.1155 + /* write the unmatched surrogate: the two bytes just before s */ 1.1156 + uint8_t *bytes=pArgs->converter->toUBytes; 1.1157 + pArgs->converter->toULength=2; 1.1158 + *bytes=*(s-2); 1.1159 + bytes[1]=*(s-1); 1.1160 + 1.1161 + c=0xffff; 1.1162 + *err=U_ILLEGAL_CHAR_FOUND; 1.1163 + } 1.1164 + } 1.1165 + 1.1166 + pArgs->source=(const char *)s; 1.1167 + return c; 1.1168 +} 1.1169 + /* Resets converter state for UTF-16LE. If choice<=UCNV_RESET_TO_UNICODE, cnv->mode becomes 8 for version 0 and 0 for every other version. If choice is anything other than UCNV_RESET_TO_UNICODE and the version is 1, cnv->fromUnicodeStatus is set to UCNV_NEED_TO_WRITE_BOM. */ 1.1170 +static void 1.1171 +_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { 1.1172 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.1173 + /* reset toUnicode state */ 1.1174 + if(UCNV_GET_VERSION(cnv)==0) { 1.1175 + cnv->mode=8; /* no BOM handling */ 1.1176 + } else { 1.1177 + cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ 1.1178 + } 1.1179 + } 1.1180 + if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 1.1181 + /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ 1.1182 + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1.1183 + } 1.1184 +} 1.1185 + /* Open hook for UTF-16LE. Versions up to 1 are reset in both directions through _UTF16LEReset; any other version sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR. pArgs is not read here. */ 1.1186 +static void 1.1187 +_UTF16LEOpen(UConverter *cnv, 1.1188 + UConverterLoadArgs *pArgs, 1.1189 + UErrorCode *pErrorCode) { 1.1190 + if(UCNV_GET_VERSION(cnv)<=1) { 1.1191 + _UTF16LEReset(cnv, UCNV_RESET_BOTH); 1.1192 + } else { 1.1193 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1194 + }
1.1195 +} 1.1196 + /* Name callback for UTF-16LE: returns the plain name for version 0 and the name with the ,version=1 suffix for every other version. */ 1.1197 +static const char * 1.1198 +_UTF16LEGetName(const UConverter *cnv) { 1.1199 + if(UCNV_GET_VERSION(cnv)==0) { 1.1200 + return "UTF-16LE"; 1.1201 + } else { 1.1202 + return "UTF-16LE,version=1"; 1.1203 + } 1.1204 +} 1.1205 + /* Implementation table for UTF-16LE. Each direction lists its WithOffsets function twice. NOTE(review): presumably the plain and the offsets-tracking slots share one function - confirm against the UConverterImpl declaration. */ 1.1206 +static const UConverterImpl _UTF16LEImpl={ 1.1207 + UCNV_UTF16_LittleEndian, 1.1208 + 1.1209 + NULL, 1.1210 + NULL, 1.1211 + 1.1212 + _UTF16LEOpen, 1.1213 + NULL, 1.1214 + _UTF16LEReset, 1.1215 + 1.1216 + _UTF16LEToUnicodeWithOffsets, 1.1217 + _UTF16LEToUnicodeWithOffsets, 1.1218 + _UTF16LEFromUnicodeWithOffsets, 1.1219 + _UTF16LEFromUnicodeWithOffsets, 1.1220 + _UTF16LEGetNextUChar, 1.1221 + 1.1222 + NULL, 1.1223 + _UTF16LEGetName, 1.1224 + NULL, 1.1225 + NULL, 1.1226 + ucnv_getNonSurrogateUnicodeSet 1.1227 +}; 1.1228 + 1.1229 + /* Static description of UTF-16LE. The byte pair fd ff is U+FFFD in little-endian order. NOTE(review): presumably the substitution character and its length - confirm against the UConverterStaticData declaration. */ 1.1230 +static const UConverterStaticData _UTF16LEStaticData={ 1.1231 + sizeof(UConverterStaticData), 1.1232 + "UTF-16LE", 1.1233 + 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, 1.1234 + { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, 1.1235 + 0, 1.1236 + 0, 1.1237 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1238 +}; 1.1239 + 1.1240 + /* Shared data for UTF-16LE: ties the static description to the implementation table. The IS_UTF16LE macro later in this file compares against its address. */ 1.1241 +const UConverterSharedData _UTF16LEData={ 1.1242 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.1243 + NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 1.1244 + 0 1.1245 +}; 1.1246 + 1.1247 +/* UTF-16 (Detect BOM) ------------------------------------------------------ */ 1.1248 + 1.1249 +/* 1.1250 + * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE 1.1251 + * accordingly. 1.1252 + * This is a simpler version of the UTF-32 converter, with 1.1253 + * fewer states for shorter BOMs. 1.1254 + * 1.1255 + * State values: 1.1256 + * 0 initial state 1.1257 + * 1 saw first byte 1.1258 + * 2..5 - 1.1259 + * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 1.1260 + * 8 UTF-16BE mode 1.1261 + * 9 UTF-16LE mode 1.1262 + * 1.1263 + * During detection: state==number of initial bytes seen so far.
1.1264 + * 1.1265 + * On output, emit U+FEFF as the first code point. 1.1266 + * 1.1267 + * Variants: 1.1268 + * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. 1.1269 + * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and 1.1270 + * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. 1.1271 + */ 1.1272 + /* Resets the BOM-detecting converter. If choice<=UCNV_RESET_TO_UNICODE, cnv->mode returns to 0 (the initial detection state). If choice is anything other than UCNV_RESET_TO_UNICODE, cnv->fromUnicodeStatus is set to UCNV_NEED_TO_WRITE_BOM, regardless of version. */ 1.1273 +static void 1.1274 +_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { 1.1275 + if(choice<=UCNV_RESET_TO_UNICODE) { 1.1276 + /* reset toUnicode: state=0 */ 1.1277 + cnv->mode=0; 1.1278 + } 1.1279 + if(choice!=UCNV_RESET_TO_UNICODE) { 1.1280 + /* reset fromUnicode: prepare to output the UTF-16PE BOM */ 1.1281 + cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1.1282 + } 1.1283 +} 1.1284 + /* Declared ahead of its initialized definition at the end of the file because _UTF16Open takes its address. */ 1.1285 +static const UConverterSharedData _UTF16v2Data; 1.1286 + /* Open hook for the BOM-detecting UTF-16 converter. Versions up to 2 are accepted and reset in both directions; any other version sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR. For version 2, unless pArgs->onlyTestIsLoadable is set, cnv->sharedData is first pointed at _UTF16v2Data and UCNV_MAX_SUBCHAR_LEN bytes of its subChar are copied into cnv->subChars. */ 1.1287 +static void 1.1288 +_UTF16Open(UConverter *cnv, 1.1289 + UConverterLoadArgs *pArgs, 1.1290 + UErrorCode *pErrorCode) { 1.1291 + if(UCNV_GET_VERSION(cnv)<=2) { 1.1292 + if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { 1.1293 + /* 1.1294 + * Switch implementation, and switch the staticData that's different 1.1295 + * and was copied into the UConverter. 1.1296 + * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) 1.1297 + * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
1.1298 + */ 1.1299 + cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; 1.1300 + uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); 1.1301 + } 1.1302 + _UTF16Reset(cnv, UCNV_RESET_BOTH); 1.1303 + } else { 1.1304 + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1.1305 + } 1.1306 +} 1.1307 + /* Name callback for the BOM-detecting converter: the plain name for version 0, the ,version=1 name for version 1, and the ,version=2 name for every other version. */ 1.1308 +static const char * 1.1309 +_UTF16GetName(const UConverter *cnv) { 1.1310 + if(UCNV_GET_VERSION(cnv)==0) { 1.1311 + return "UTF-16"; 1.1312 + } else if(UCNV_GET_VERSION(cnv)==1) { 1.1313 + return "UTF-16,version=1"; 1.1314 + } else { 1.1315 + return "UTF-16,version=2"; 1.1316 + } 1.1317 +} 1.1318 + /* Declared here without an initializer so that IS_UTF16 below can take its address; the initialized definition follows later in the file. */ 1.1319 +const UConverterSharedData _UTF16Data; 1.1320 + /* The three macros below identify the converter variant by comparing cnv->sharedData with the addresses of the known shared-data objects. */ 1.1321 +#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) 1.1322 +#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) 1.1323 +#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) 1.1324 + /* toUnicode entry point for converters that may see a BOM at the start of the input. It drives the state machine kept in cnv->mode: state 0 stores the first byte in cnv->toUBytes, state 1 compares it and the second byte with the two BOM byte orders, states 8 and 9 hand the remaining input to the UTF-16BE or UTF-16LE function. A detected BOM is consumed. A reverse BOM for the BE or LE variant, or a missing BOM for UTF-16 version 1, leaves both bytes in cnv->toUBytes, stores the follow-up mode in cnv->mode, sets *pErrorCode to U_ILLEGAL_ESCAPE_SEQUENCE and returns. Without a BOM and without an error, LE is chosen for the UTF-16LE variant and BE otherwise. After the loop, offsets written during this call are increased by the number of BOM bytes consumed from this buffer, and on flush with all input consumed the selected BE or LE function is called once more. */ 1.1325 +static void 1.1326 +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1.1327 + UErrorCode *pErrorCode) { 1.1328 + UConverter *cnv=pArgs->converter; 1.1329 + const char *source=pArgs->source; 1.1330 + const char *sourceLimit=pArgs->sourceLimit; 1.1331 + int32_t *offsets=pArgs->offsets; 1.1332 + 1.1333 + int32_t state, offsetDelta; 1.1334 + uint8_t b; 1.1335 + 1.1336 + state=cnv->mode; 1.1337 + 1.1338 + /* 1.1339 + * If we detect a BOM in this buffer, then we must add the BOM size to the 1.1340 + * offsets because the actual converter function will not see and count the BOM. 1.1341 + * offsetDelta will have the number of the BOM bytes that are in the current buffer.
1.1342 + */ 1.1343 + offsetDelta=0; 1.1344 + 1.1345 + while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1.1346 + switch(state) { 1.1347 + case 0: 1.1348 + cnv->toUBytes[0]=(uint8_t)*source++; 1.1349 + cnv->toULength=1; 1.1350 + state=1; 1.1351 + break; 1.1352 + case 1: 1.1353 + /* 1.1354 + * Only inside this switch case can the state variable 1.1355 + * temporarily take two additional values: 1.1356 + * 6: BOM error, continue with BE 1.1357 + * 7: BOM error, continue with LE 1.1358 + */ 1.1359 + b=*source; 1.1360 + if(cnv->toUBytes[0]==0xfe && b==0xff) { 1.1361 + if(IS_UTF16LE(cnv)) { 1.1362 + state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ 1.1363 + } else { 1.1364 + state=8; /* detect UTF-16BE */ 1.1365 + } 1.1366 + } else if(cnv->toUBytes[0]==0xff && b==0xfe) { 1.1367 + if(IS_UTF16BE(cnv)) { 1.1368 + state=6; /* illegal reverse BOM for Java "UnicodeBig" */ 1.1369 + } else { 1.1370 + state=9; /* detect UTF-16LE */ 1.1371 + } 1.1372 + } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { 1.1373 + state=6; /* illegal missing BOM for Java "Unicode" */ 1.1374 + } 1.1375 + if(state>=8) { 1.1376 + /* BOM detected, consume it */ 1.1377 + ++source; 1.1378 + cnv->toULength=0; 1.1379 + offsetDelta=(int32_t)(source-pArgs->source); 1.1380 + } else if(state<6) { 1.1381 + /* ok: no BOM, and not a reverse BOM */ 1.1382 + if(source!=pArgs->source) { 1.1383 + /* reset the source for a correct first offset */ 1.1384 + source=pArgs->source; 1.1385 + cnv->toULength=0; 1.1386 + } 1.1387 + if(IS_UTF16LE(cnv)) { 1.1388 + /* Make Java "UnicodeLittle" default to LE. */ 1.1389 + state=9; 1.1390 + } else { 1.1391 + /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ 1.1392 + state=8; 1.1393 + } 1.1394 + } else { 1.1395 + /* 1.1396 + * error: missing BOM, or reverse BOM 1.1397 + * UTF-16,version=1: Java-specific "Unicode" requires a BOM. 1.1398 + * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
1.1399 + * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. 1.1400 + */ 1.1401 + /* report the non-BOM or reverse BOM as an illegal sequence */ 1.1402 + cnv->toUBytes[1]=b; 1.1403 + cnv->toULength=2; 1.1404 + pArgs->source=source+1; 1.1405 + /* continue with conversion if the callback resets the error */ 1.1406 + /* 1.1407 + * Make Java "Unicode" default to BE like standard UTF-16. 1.1408 + * Make Java "UnicodeBig" and "UnicodeLittle" default 1.1409 + * to their normal endiannesses. 1.1410 + */ 1.1411 + cnv->mode=state+2; /* temporary state 6 becomes 8 (BE), 7 becomes 9 (LE) */ 1.1412 + *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; 1.1413 + return; 1.1414 + } 1.1415 + /* convert the rest of the stream */ 1.1416 + cnv->mode=state; 1.1417 + continue; 1.1418 + case 8: 1.1419 + /* call UTF-16BE */ 1.1420 + pArgs->source=source; 1.1421 + _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1.1422 + source=pArgs->source; 1.1423 + break; 1.1424 + case 9: 1.1425 + /* call UTF-16LE */ 1.1426 + pArgs->source=source; 1.1427 + _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1.1428 + source=pArgs->source; 1.1429 + break; 1.1430 + default: 1.1431 + break; /* does not occur */ 1.1432 + } 1.1433 + } 1.1434 + 1.1435 + /* add BOM size to offsets - see comment at offsetDelta declaration; the range runs from the offsets pointer saved on entry up to the current pArgs->offsets */ 1.1436 + if(offsets!=NULL && offsetDelta!=0) { 1.1437 + int32_t *offsetsLimit=pArgs->offsets; 1.1438 + while(offsets<offsetsLimit) { 1.1439 + *offsets++ += offsetDelta; 1.1440 + } 1.1441 + } 1.1442 + 1.1443 + pArgs->source=source; 1.1444 + 1.1445 + if(source==sourceLimit && pArgs->flush) { 1.1446 + /* handle truncated input */ 1.1447 + switch(state) { 1.1448 + case 0: 1.1449 + break; /* no input at all, nothing to do */ 1.1450 + case 8: 1.1451 + _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1.1452 + break; 1.1453 + case 9: 1.1454 + _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1.1455 + break; 1.1456 + default: 1.1457 + /* 0<state<8: framework will report truncation, nothing to do here */ 1.1458 + break; 1.1459 + } 1.1460 + }
1.1461 + 1.1462 + cnv->mode=state; 1.1463 +} 1.1464 + /* getNextUChar dispatch for the BOM-detecting converter: mode 8 calls the UTF-16BE reader, mode 9 calls the UTF-16LE reader, and every other mode returns UCNV_GET_NEXT_UCHAR_USE_TO_U. */ 1.1465 +static UChar32 1.1466 +_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, 1.1467 + UErrorCode *pErrorCode) { 1.1468 + switch(pArgs->converter->mode) { 1.1469 + case 8: 1.1470 + return _UTF16BEGetNextUChar(pArgs, pErrorCode); 1.1471 + case 9: 1.1472 + return _UTF16LEGetNextUChar(pArgs, pErrorCode); 1.1473 + default: 1.1474 + return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1.1475 + } 1.1476 +} 1.1477 + /* Implementation table for BOM-detecting UTF-16. The fromUnicode entries use _UTF16PEFromUnicodeWithOffsets, which the macro near the top of the file maps to the BE or LE writer according to U_IS_BIG_ENDIAN. */ 1.1478 +static const UConverterImpl _UTF16Impl = { 1.1479 + UCNV_UTF16, 1.1480 + 1.1481 + NULL, 1.1482 + NULL, 1.1483 + 1.1484 + _UTF16Open, 1.1485 + NULL, 1.1486 + _UTF16Reset, 1.1487 + 1.1488 + _UTF16ToUnicodeWithOffsets, 1.1489 + _UTF16ToUnicodeWithOffsets, 1.1490 + _UTF16PEFromUnicodeWithOffsets, 1.1491 + _UTF16PEFromUnicodeWithOffsets, 1.1492 + _UTF16GetNextUChar, 1.1493 + 1.1494 + NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1.1495 + _UTF16GetName, 1.1496 + NULL, 1.1497 + NULL, 1.1498 + ucnv_getNonSurrogateUnicodeSet 1.1499 +}; 1.1500 + /* Static description of BOM-detecting UTF-16. The order of the fd and ff bytes is selected by U_IS_BIG_ENDIAN. */ 1.1501 +static const UConverterStaticData _UTF16StaticData = { 1.1502 + sizeof(UConverterStaticData), 1.1503 + "UTF-16", 1.1504 + 1204, /* CCSID for BOM sensitive UTF-16 */ 1.1505 + UCNV_IBM, UCNV_UTF16, 2, 2, 1.1506 +#if U_IS_BIG_ENDIAN 1.1507 + { 0xff, 0xfd, 0, 0 }, 2, 1.1508 +#else 1.1509 + { 0xfd, 0xff, 0, 0 }, 2, 1.1510 +#endif 1.1511 + FALSE, FALSE, 1.1512 + 0, 1.1513 + 0, 1.1514 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1515 +}; 1.1516 + /* Shared data for BOM-detecting UTF-16: ties _UTF16StaticData to _UTF16Impl. */ 1.1517 +const UConverterSharedData _UTF16Data = { 1.1518 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.1519 + NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, 1.1520 + 0 1.1521 +}; 1.1522 + /* Implementation table for UTF-16,version=2. It lists the same entries as _UTF16Impl except that both fromUnicode entries are the big-endian writer. */ 1.1523 +static const UConverterImpl _UTF16v2Impl = { 1.1524 + UCNV_UTF16, 1.1525 + 1.1526 + NULL, 1.1527 + NULL, 1.1528 + 1.1529 + _UTF16Open, 1.1530 + NULL, 1.1531 + _UTF16Reset, 1.1532 + 1.1533 + _UTF16ToUnicodeWithOffsets, 1.1534 + _UTF16ToUnicodeWithOffsets, 1.1535 + _UTF16BEFromUnicodeWithOffsets,
1.1536 + _UTF16BEFromUnicodeWithOffsets, 1.1537 + _UTF16GetNextUChar, 1.1538 + 1.1539 + NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1.1540 + _UTF16GetName, 1.1541 + NULL, 1.1542 + NULL, 1.1543 + ucnv_getNonSurrogateUnicodeSet 1.1544 +}; 1.1545 + /* Static description of UTF-16,version=2. The byte pair ff fd is in big-endian order on every platform, unlike the U_IS_BIG_ENDIAN-dependent pair in _UTF16StaticData. */ 1.1546 +static const UConverterStaticData _UTF16v2StaticData = { 1.1547 + sizeof(UConverterStaticData), 1.1548 + "UTF-16,version=2", 1.1549 + 1204, /* CCSID for BOM sensitive UTF-16 */ 1.1550 + UCNV_IBM, UCNV_UTF16, 2, 2, 1.1551 + { 0xff, 0xfd, 0, 0 }, 2, 1.1552 + FALSE, FALSE, 1.1553 + 0, 1.1554 + 0, 1.1555 + { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1.1556 +}; 1.1557 + /* Shared data that _UTF16Open installs for version 2: ties _UTF16v2StaticData to _UTF16v2Impl. This is the initialized definition of the object declared earlier in the file. */ 1.1558 +static const UConverterSharedData _UTF16v2Data = { 1.1559 + sizeof(UConverterSharedData), ~((uint32_t) 0), 1.1560 + NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, 1.1561 + 0 1.1562 +}; 1.1563 + 1.1564 +#endif