Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /* |
michael@0 | 2 | ********************************************************************** |
michael@0 | 3 | * Copyright (C) 2002-2010, International Business Machines |
michael@0 | 4 | * Corporation and others. All Rights Reserved. |
michael@0 | 5 | ********************************************************************** |
michael@0 | 6 | * file name: ucnv_u16.c |
michael@0 | 7 | * encoding: US-ASCII |
michael@0 | 8 | * tab size: 8 (not used) |
michael@0 | 9 | * indentation:4 |
michael@0 | 10 | * |
michael@0 | 11 | * created on: 2002jul01 |
michael@0 | 12 | * created by: Markus W. Scherer |
michael@0 | 13 | * |
michael@0 | 14 | * UTF-16 converter implementation. Used to be in ucnv_utf.c. |
michael@0 | 15 | */ |
michael@0 | 16 | |
michael@0 | 17 | #include "unicode/utypes.h" |
michael@0 | 18 | |
michael@0 | 19 | #if !UCONFIG_NO_CONVERSION |
michael@0 | 20 | |
michael@0 | 21 | #include "unicode/ucnv.h" |
michael@0 | 22 | #include "ucnv_bld.h" |
michael@0 | 23 | #include "ucnv_cnv.h" |
michael@0 | 24 | #include "cmemory.h" |
michael@0 | 25 | |
/*
 * Marker value stored in UConverter.fromUnicodeStatus: while it is set,
 * the fromUnicode functions emit a byte order mark before any other output
 * and then clear the status again.
 */
enum { UCNV_NEED_TO_WRITE_BOM = 1 };
michael@0 | 29 | |
michael@0 | 30 | /* |
michael@0 | 31 | * The UTF-16 toUnicode implementation is also used for the Java-specific |
michael@0 | 32 | * "with BOM" variants of UTF-16BE and UTF-16LE. |
michael@0 | 33 | */ |
michael@0 | 34 | static void |
michael@0 | 35 | _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 36 | UErrorCode *pErrorCode); |
michael@0 | 37 | |
michael@0 | 38 | /* UTF-16BE ----------------------------------------------------------------- */ |
michael@0 | 39 | |
/*
 * _UTF16PEFromUnicodeWithOffsets names the fromUnicode function that matches
 * the platform's own byte order ("PE" = platform endian).
 */
#if !U_IS_BIG_ENDIAN
#   define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets
#else
#   define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets
#endif
michael@0 | 45 | |
michael@0 | 46 | |
michael@0 | 47 | static void |
michael@0 | 48 | _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
michael@0 | 49 | UErrorCode *pErrorCode) { |
michael@0 | 50 | UConverter *cnv; |
michael@0 | 51 | const UChar *source; |
michael@0 | 52 | char *target; |
michael@0 | 53 | int32_t *offsets; |
michael@0 | 54 | |
michael@0 | 55 | uint32_t targetCapacity, length, sourceIndex; |
michael@0 | 56 | UChar c, trail; |
michael@0 | 57 | char overflow[4]; |
michael@0 | 58 | |
michael@0 | 59 | source=pArgs->source; |
michael@0 | 60 | length=(int32_t)(pArgs->sourceLimit-source); |
michael@0 | 61 | if(length<=0) { |
michael@0 | 62 | /* no input, nothing to do */ |
michael@0 | 63 | return; |
michael@0 | 64 | } |
michael@0 | 65 | |
michael@0 | 66 | cnv=pArgs->converter; |
michael@0 | 67 | |
michael@0 | 68 | /* write the BOM if necessary */ |
michael@0 | 69 | if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 70 | static const char bom[]={ (char)0xfe, (char)0xff }; |
michael@0 | 71 | ucnv_fromUWriteBytes(cnv, |
michael@0 | 72 | bom, 2, |
michael@0 | 73 | &pArgs->target, pArgs->targetLimit, |
michael@0 | 74 | &pArgs->offsets, -1, |
michael@0 | 75 | pErrorCode); |
michael@0 | 76 | cnv->fromUnicodeStatus=0; |
michael@0 | 77 | } |
michael@0 | 78 | |
michael@0 | 79 | target=pArgs->target; |
michael@0 | 80 | if(target >= pArgs->targetLimit) { |
michael@0 | 81 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 82 | return; |
michael@0 | 83 | } |
michael@0 | 84 | |
michael@0 | 85 | targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
michael@0 | 86 | offsets=pArgs->offsets; |
michael@0 | 87 | sourceIndex=0; |
michael@0 | 88 | |
michael@0 | 89 | /* c!=0 indicates in several places outside the main loops that a surrogate was found */ |
michael@0 | 90 | |
michael@0 | 91 | if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { |
michael@0 | 92 | /* the last buffer ended with a lead surrogate, output the surrogate pair */ |
michael@0 | 93 | ++source; |
michael@0 | 94 | --length; |
michael@0 | 95 | target[0]=(uint8_t)(c>>8); |
michael@0 | 96 | target[1]=(uint8_t)c; |
michael@0 | 97 | target[2]=(uint8_t)(trail>>8); |
michael@0 | 98 | target[3]=(uint8_t)trail; |
michael@0 | 99 | target+=4; |
michael@0 | 100 | targetCapacity-=4; |
michael@0 | 101 | if(offsets!=NULL) { |
michael@0 | 102 | *offsets++=-1; |
michael@0 | 103 | *offsets++=-1; |
michael@0 | 104 | *offsets++=-1; |
michael@0 | 105 | *offsets++=-1; |
michael@0 | 106 | } |
michael@0 | 107 | sourceIndex=1; |
michael@0 | 108 | cnv->fromUChar32=c=0; |
michael@0 | 109 | } |
michael@0 | 110 | |
michael@0 | 111 | if(c==0) { |
michael@0 | 112 | /* copy an even number of bytes for complete UChars */ |
michael@0 | 113 | uint32_t count=2*length; |
michael@0 | 114 | if(count>targetCapacity) { |
michael@0 | 115 | count=targetCapacity&~1; |
michael@0 | 116 | } |
michael@0 | 117 | /* count is even */ |
michael@0 | 118 | targetCapacity-=count; |
michael@0 | 119 | count>>=1; |
michael@0 | 120 | length-=count; |
michael@0 | 121 | |
michael@0 | 122 | if(offsets==NULL) { |
michael@0 | 123 | while(count>0) { |
michael@0 | 124 | c=*source++; |
michael@0 | 125 | if(U16_IS_SINGLE(c)) { |
michael@0 | 126 | target[0]=(uint8_t)(c>>8); |
michael@0 | 127 | target[1]=(uint8_t)c; |
michael@0 | 128 | target+=2; |
michael@0 | 129 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
michael@0 | 130 | ++source; |
michael@0 | 131 | --count; |
michael@0 | 132 | target[0]=(uint8_t)(c>>8); |
michael@0 | 133 | target[1]=(uint8_t)c; |
michael@0 | 134 | target[2]=(uint8_t)(trail>>8); |
michael@0 | 135 | target[3]=(uint8_t)trail; |
michael@0 | 136 | target+=4; |
michael@0 | 137 | } else { |
michael@0 | 138 | break; |
michael@0 | 139 | } |
michael@0 | 140 | --count; |
michael@0 | 141 | } |
michael@0 | 142 | } else { |
michael@0 | 143 | while(count>0) { |
michael@0 | 144 | c=*source++; |
michael@0 | 145 | if(U16_IS_SINGLE(c)) { |
michael@0 | 146 | target[0]=(uint8_t)(c>>8); |
michael@0 | 147 | target[1]=(uint8_t)c; |
michael@0 | 148 | target+=2; |
michael@0 | 149 | *offsets++=sourceIndex; |
michael@0 | 150 | *offsets++=sourceIndex++; |
michael@0 | 151 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
michael@0 | 152 | ++source; |
michael@0 | 153 | --count; |
michael@0 | 154 | target[0]=(uint8_t)(c>>8); |
michael@0 | 155 | target[1]=(uint8_t)c; |
michael@0 | 156 | target[2]=(uint8_t)(trail>>8); |
michael@0 | 157 | target[3]=(uint8_t)trail; |
michael@0 | 158 | target+=4; |
michael@0 | 159 | *offsets++=sourceIndex; |
michael@0 | 160 | *offsets++=sourceIndex; |
michael@0 | 161 | *offsets++=sourceIndex; |
michael@0 | 162 | *offsets++=sourceIndex; |
michael@0 | 163 | sourceIndex+=2; |
michael@0 | 164 | } else { |
michael@0 | 165 | break; |
michael@0 | 166 | } |
michael@0 | 167 | --count; |
michael@0 | 168 | } |
michael@0 | 169 | } |
michael@0 | 170 | |
michael@0 | 171 | if(count==0) { |
michael@0 | 172 | /* done with the loop for complete UChars */ |
michael@0 | 173 | if(length>0 && targetCapacity>0) { |
michael@0 | 174 | /* |
michael@0 | 175 | * there is more input and some target capacity - |
michael@0 | 176 | * it must be targetCapacity==1 because otherwise |
michael@0 | 177 | * the above would have copied more; |
michael@0 | 178 | * prepare for overflow output |
michael@0 | 179 | */ |
michael@0 | 180 | if(U16_IS_SINGLE(c=*source++)) { |
michael@0 | 181 | overflow[0]=(char)(c>>8); |
michael@0 | 182 | overflow[1]=(char)c; |
michael@0 | 183 | length=2; /* 2 bytes to output */ |
michael@0 | 184 | c=0; |
michael@0 | 185 | /* } else { keep c for surrogate handling, length will be set there */ |
michael@0 | 186 | } |
michael@0 | 187 | } else { |
michael@0 | 188 | length=0; |
michael@0 | 189 | c=0; |
michael@0 | 190 | } |
michael@0 | 191 | } else { |
michael@0 | 192 | /* keep c for surrogate handling, length will be set there */ |
michael@0 | 193 | targetCapacity+=2*count; |
michael@0 | 194 | } |
michael@0 | 195 | } else { |
michael@0 | 196 | length=0; /* from here on, length counts the bytes in overflow[] */ |
michael@0 | 197 | } |
michael@0 | 198 | |
michael@0 | 199 | if(c!=0) { |
michael@0 | 200 | /* |
michael@0 | 201 | * c is a surrogate, and |
michael@0 | 202 | * - source or target too short |
michael@0 | 203 | * - or the surrogate is unmatched |
michael@0 | 204 | */ |
michael@0 | 205 | length=0; |
michael@0 | 206 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 207 | if(source<pArgs->sourceLimit) { |
michael@0 | 208 | if(U16_IS_TRAIL(trail=*source)) { |
michael@0 | 209 | /* output the surrogate pair, will overflow (see conditions comment above) */ |
michael@0 | 210 | ++source; |
michael@0 | 211 | overflow[0]=(char)(c>>8); |
michael@0 | 212 | overflow[1]=(char)c; |
michael@0 | 213 | overflow[2]=(char)(trail>>8); |
michael@0 | 214 | overflow[3]=(char)trail; |
michael@0 | 215 | length=4; /* 4 bytes to output */ |
michael@0 | 216 | c=0; |
michael@0 | 217 | } else { |
michael@0 | 218 | /* unmatched lead surrogate */ |
michael@0 | 219 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 220 | } |
michael@0 | 221 | } else { |
michael@0 | 222 | /* see if the trail surrogate is in the next buffer */ |
michael@0 | 223 | } |
michael@0 | 224 | } else { |
michael@0 | 225 | /* unmatched trail surrogate */ |
michael@0 | 226 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 227 | } |
michael@0 | 228 | cnv->fromUChar32=c; |
michael@0 | 229 | } |
michael@0 | 230 | |
michael@0 | 231 | if(length>0) { |
michael@0 | 232 | /* output length bytes with overflow (length>targetCapacity>0) */ |
michael@0 | 233 | ucnv_fromUWriteBytes(cnv, |
michael@0 | 234 | overflow, length, |
michael@0 | 235 | (char **)&target, pArgs->targetLimit, |
michael@0 | 236 | &offsets, sourceIndex, |
michael@0 | 237 | pErrorCode); |
michael@0 | 238 | targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
michael@0 | 239 | } |
michael@0 | 240 | |
michael@0 | 241 | if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
michael@0 | 242 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 243 | } |
michael@0 | 244 | |
michael@0 | 245 | /* write back the updated pointers */ |
michael@0 | 246 | pArgs->source=source; |
michael@0 | 247 | pArgs->target=(char *)target; |
michael@0 | 248 | pArgs->offsets=offsets; |
michael@0 | 249 | } |
michael@0 | 250 | |
michael@0 | 251 | static void |
michael@0 | 252 | _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 253 | UErrorCode *pErrorCode) { |
michael@0 | 254 | UConverter *cnv; |
michael@0 | 255 | const uint8_t *source; |
michael@0 | 256 | UChar *target; |
michael@0 | 257 | int32_t *offsets; |
michael@0 | 258 | |
michael@0 | 259 | uint32_t targetCapacity, length, count, sourceIndex; |
michael@0 | 260 | UChar c, trail; |
michael@0 | 261 | |
michael@0 | 262 | if(pArgs->converter->mode<8) { |
michael@0 | 263 | _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 264 | return; |
michael@0 | 265 | } |
michael@0 | 266 | |
michael@0 | 267 | cnv=pArgs->converter; |
michael@0 | 268 | source=(const uint8_t *)pArgs->source; |
michael@0 | 269 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
michael@0 | 270 | if(length<=0 && cnv->toUnicodeStatus==0) { |
michael@0 | 271 | /* no input, nothing to do */ |
michael@0 | 272 | return; |
michael@0 | 273 | } |
michael@0 | 274 | |
michael@0 | 275 | target=pArgs->target; |
michael@0 | 276 | if(target >= pArgs->targetLimit) { |
michael@0 | 277 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 278 | return; |
michael@0 | 279 | } |
michael@0 | 280 | |
michael@0 | 281 | targetCapacity=(uint32_t)(pArgs->targetLimit-target); |
michael@0 | 282 | offsets=pArgs->offsets; |
michael@0 | 283 | sourceIndex=0; |
michael@0 | 284 | c=0; |
michael@0 | 285 | |
michael@0 | 286 | /* complete a partial UChar or pair from the last call */ |
michael@0 | 287 | if(cnv->toUnicodeStatus!=0) { |
michael@0 | 288 | /* |
michael@0 | 289 | * special case: single byte from a previous buffer, |
michael@0 | 290 | * where the byte turned out not to belong to a trail surrogate |
michael@0 | 291 | * and the preceding, unmatched lead surrogate was put into toUBytes[] |
michael@0 | 292 | * for error handling |
michael@0 | 293 | */ |
michael@0 | 294 | cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
michael@0 | 295 | cnv->toULength=1; |
michael@0 | 296 | cnv->toUnicodeStatus=0; |
michael@0 | 297 | } |
michael@0 | 298 | if((count=cnv->toULength)!=0) { |
michael@0 | 299 | uint8_t *p=cnv->toUBytes; |
michael@0 | 300 | do { |
michael@0 | 301 | p[count++]=*source++; |
michael@0 | 302 | ++sourceIndex; |
michael@0 | 303 | --length; |
michael@0 | 304 | if(count==2) { |
michael@0 | 305 | c=((UChar)p[0]<<8)|p[1]; |
michael@0 | 306 | if(U16_IS_SINGLE(c)) { |
michael@0 | 307 | /* output the BMP code point */ |
michael@0 | 308 | *target++=c; |
michael@0 | 309 | if(offsets!=NULL) { |
michael@0 | 310 | *offsets++=-1; |
michael@0 | 311 | } |
michael@0 | 312 | --targetCapacity; |
michael@0 | 313 | count=0; |
michael@0 | 314 | c=0; |
michael@0 | 315 | break; |
michael@0 | 316 | } else if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 317 | /* continue collecting bytes for the trail surrogate */ |
michael@0 | 318 | c=0; /* avoid unnecessary surrogate handling below */ |
michael@0 | 319 | } else { |
michael@0 | 320 | /* fall through to error handling for an unmatched trail surrogate */ |
michael@0 | 321 | break; |
michael@0 | 322 | } |
michael@0 | 323 | } else if(count==4) { |
michael@0 | 324 | c=((UChar)p[0]<<8)|p[1]; |
michael@0 | 325 | trail=((UChar)p[2]<<8)|p[3]; |
michael@0 | 326 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 327 | /* output the surrogate pair */ |
michael@0 | 328 | *target++=c; |
michael@0 | 329 | if(targetCapacity>=2) { |
michael@0 | 330 | *target++=trail; |
michael@0 | 331 | if(offsets!=NULL) { |
michael@0 | 332 | *offsets++=-1; |
michael@0 | 333 | *offsets++=-1; |
michael@0 | 334 | } |
michael@0 | 335 | targetCapacity-=2; |
michael@0 | 336 | } else /* targetCapacity==1 */ { |
michael@0 | 337 | targetCapacity=0; |
michael@0 | 338 | cnv->UCharErrorBuffer[0]=trail; |
michael@0 | 339 | cnv->UCharErrorBufferLength=1; |
michael@0 | 340 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 341 | } |
michael@0 | 342 | count=0; |
michael@0 | 343 | c=0; |
michael@0 | 344 | break; |
michael@0 | 345 | } else { |
michael@0 | 346 | /* unmatched lead surrogate, handle here for consistent toUBytes[] */ |
michael@0 | 347 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 348 | |
michael@0 | 349 | /* back out reading the code unit after it */ |
michael@0 | 350 | if(((const uint8_t *)pArgs->source-source)>=2) { |
michael@0 | 351 | source-=2; |
michael@0 | 352 | } else { |
michael@0 | 353 | /* |
michael@0 | 354 | * if the trail unit's first byte was in a previous buffer, then |
michael@0 | 355 | * we need to put it into a special place because toUBytes[] will be |
michael@0 | 356 | * used for the lead unit's bytes |
michael@0 | 357 | */ |
michael@0 | 358 | cnv->toUnicodeStatus=0x100|p[2]; |
michael@0 | 359 | --source; |
michael@0 | 360 | } |
michael@0 | 361 | cnv->toULength=2; |
michael@0 | 362 | |
michael@0 | 363 | /* write back the updated pointers */ |
michael@0 | 364 | pArgs->source=(const char *)source; |
michael@0 | 365 | pArgs->target=target; |
michael@0 | 366 | pArgs->offsets=offsets; |
michael@0 | 367 | return; |
michael@0 | 368 | } |
michael@0 | 369 | } |
michael@0 | 370 | } while(length>0); |
michael@0 | 371 | cnv->toULength=(int8_t)count; |
michael@0 | 372 | } |
michael@0 | 373 | |
michael@0 | 374 | /* copy an even number of bytes for complete UChars */ |
michael@0 | 375 | count=2*targetCapacity; |
michael@0 | 376 | if(count>length) { |
michael@0 | 377 | count=length&~1; |
michael@0 | 378 | } |
michael@0 | 379 | if(c==0 && count>0) { |
michael@0 | 380 | length-=count; |
michael@0 | 381 | count>>=1; |
michael@0 | 382 | targetCapacity-=count; |
michael@0 | 383 | if(offsets==NULL) { |
michael@0 | 384 | do { |
michael@0 | 385 | c=((UChar)source[0]<<8)|source[1]; |
michael@0 | 386 | source+=2; |
michael@0 | 387 | if(U16_IS_SINGLE(c)) { |
michael@0 | 388 | *target++=c; |
michael@0 | 389 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
michael@0 | 390 | U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
michael@0 | 391 | ) { |
michael@0 | 392 | source+=2; |
michael@0 | 393 | --count; |
michael@0 | 394 | *target++=c; |
michael@0 | 395 | *target++=trail; |
michael@0 | 396 | } else { |
michael@0 | 397 | break; |
michael@0 | 398 | } |
michael@0 | 399 | } while(--count>0); |
michael@0 | 400 | } else { |
michael@0 | 401 | do { |
michael@0 | 402 | c=((UChar)source[0]<<8)|source[1]; |
michael@0 | 403 | source+=2; |
michael@0 | 404 | if(U16_IS_SINGLE(c)) { |
michael@0 | 405 | *target++=c; |
michael@0 | 406 | *offsets++=sourceIndex; |
michael@0 | 407 | sourceIndex+=2; |
michael@0 | 408 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
michael@0 | 409 | U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) |
michael@0 | 410 | ) { |
michael@0 | 411 | source+=2; |
michael@0 | 412 | --count; |
michael@0 | 413 | *target++=c; |
michael@0 | 414 | *target++=trail; |
michael@0 | 415 | *offsets++=sourceIndex; |
michael@0 | 416 | *offsets++=sourceIndex; |
michael@0 | 417 | sourceIndex+=4; |
michael@0 | 418 | } else { |
michael@0 | 419 | break; |
michael@0 | 420 | } |
michael@0 | 421 | } while(--count>0); |
michael@0 | 422 | } |
michael@0 | 423 | |
michael@0 | 424 | if(count==0) { |
michael@0 | 425 | /* done with the loop for complete UChars */ |
michael@0 | 426 | c=0; |
michael@0 | 427 | } else { |
michael@0 | 428 | /* keep c for surrogate handling, trail will be set there */ |
michael@0 | 429 | length+=2*(count-1); /* one more byte pair was consumed than count decremented */ |
michael@0 | 430 | targetCapacity+=count; |
michael@0 | 431 | } |
michael@0 | 432 | } |
michael@0 | 433 | |
michael@0 | 434 | if(c!=0) { |
michael@0 | 435 | /* |
michael@0 | 436 | * c is a surrogate, and |
michael@0 | 437 | * - source or target too short |
michael@0 | 438 | * - or the surrogate is unmatched |
michael@0 | 439 | */ |
michael@0 | 440 | cnv->toUBytes[0]=(uint8_t)(c>>8); |
michael@0 | 441 | cnv->toUBytes[1]=(uint8_t)c; |
michael@0 | 442 | cnv->toULength=2; |
michael@0 | 443 | |
michael@0 | 444 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 445 | if(length>=2) { |
michael@0 | 446 | if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { |
michael@0 | 447 | /* output the surrogate pair, will overflow (see conditions comment above) */ |
michael@0 | 448 | source+=2; |
michael@0 | 449 | length-=2; |
michael@0 | 450 | *target++=c; |
michael@0 | 451 | if(offsets!=NULL) { |
michael@0 | 452 | *offsets++=sourceIndex; |
michael@0 | 453 | } |
michael@0 | 454 | cnv->UCharErrorBuffer[0]=trail; |
michael@0 | 455 | cnv->UCharErrorBufferLength=1; |
michael@0 | 456 | cnv->toULength=0; |
michael@0 | 457 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 458 | } else { |
michael@0 | 459 | /* unmatched lead surrogate */ |
michael@0 | 460 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 461 | } |
michael@0 | 462 | } else { |
michael@0 | 463 | /* see if the trail surrogate is in the next buffer */ |
michael@0 | 464 | } |
michael@0 | 465 | } else { |
michael@0 | 466 | /* unmatched trail surrogate */ |
michael@0 | 467 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 468 | } |
michael@0 | 469 | } |
michael@0 | 470 | |
michael@0 | 471 | if(U_SUCCESS(*pErrorCode)) { |
michael@0 | 472 | /* check for a remaining source byte */ |
michael@0 | 473 | if(length>0) { |
michael@0 | 474 | if(targetCapacity==0) { |
michael@0 | 475 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 476 | } else { |
michael@0 | 477 | /* it must be length==1 because otherwise the above would have copied more */ |
michael@0 | 478 | cnv->toUBytes[cnv->toULength++]=*source++; |
michael@0 | 479 | } |
michael@0 | 480 | } |
michael@0 | 481 | } |
michael@0 | 482 | |
michael@0 | 483 | /* write back the updated pointers */ |
michael@0 | 484 | pArgs->source=(const char *)source; |
michael@0 | 485 | pArgs->target=target; |
michael@0 | 486 | pArgs->offsets=offsets; |
michael@0 | 487 | } |
michael@0 | 488 | |
michael@0 | 489 | static UChar32 |
michael@0 | 490 | _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
michael@0 | 491 | const uint8_t *s, *sourceLimit; |
michael@0 | 492 | UChar32 c; |
michael@0 | 493 | |
michael@0 | 494 | if(pArgs->converter->mode<8) { |
michael@0 | 495 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
michael@0 | 496 | } |
michael@0 | 497 | |
michael@0 | 498 | s=(const uint8_t *)pArgs->source; |
michael@0 | 499 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
michael@0 | 500 | |
michael@0 | 501 | if(s>=sourceLimit) { |
michael@0 | 502 | /* no input */ |
michael@0 | 503 | *err=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 504 | return 0xffff; |
michael@0 | 505 | } |
michael@0 | 506 | |
michael@0 | 507 | if(s+2>sourceLimit) { |
michael@0 | 508 | /* only one byte: truncated UChar */ |
michael@0 | 509 | pArgs->converter->toUBytes[0]=*s++; |
michael@0 | 510 | pArgs->converter->toULength=1; |
michael@0 | 511 | pArgs->source=(const char *)s; |
michael@0 | 512 | *err = U_TRUNCATED_CHAR_FOUND; |
michael@0 | 513 | return 0xffff; |
michael@0 | 514 | } |
michael@0 | 515 | |
michael@0 | 516 | /* get one UChar */ |
michael@0 | 517 | c=((UChar32)*s<<8)|s[1]; |
michael@0 | 518 | s+=2; |
michael@0 | 519 | |
michael@0 | 520 | /* check for a surrogate pair */ |
michael@0 | 521 | if(U_IS_SURROGATE(c)) { |
michael@0 | 522 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 523 | if(s+2<=sourceLimit) { |
michael@0 | 524 | UChar trail; |
michael@0 | 525 | |
michael@0 | 526 | /* get a second UChar and see if it is a trail surrogate */ |
michael@0 | 527 | trail=((UChar)*s<<8)|s[1]; |
michael@0 | 528 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 529 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 530 | s+=2; |
michael@0 | 531 | } else { |
michael@0 | 532 | /* unmatched lead surrogate */ |
michael@0 | 533 | c=-2; |
michael@0 | 534 | } |
michael@0 | 535 | } else { |
michael@0 | 536 | /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ |
michael@0 | 537 | uint8_t *bytes=pArgs->converter->toUBytes; |
michael@0 | 538 | s-=2; |
michael@0 | 539 | pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
michael@0 | 540 | do { |
michael@0 | 541 | *bytes++=*s++; |
michael@0 | 542 | } while(s<sourceLimit); |
michael@0 | 543 | |
michael@0 | 544 | c=0xffff; |
michael@0 | 545 | *err=U_TRUNCATED_CHAR_FOUND; |
michael@0 | 546 | } |
michael@0 | 547 | } else { |
michael@0 | 548 | /* unmatched trail surrogate */ |
michael@0 | 549 | c=-2; |
michael@0 | 550 | } |
michael@0 | 551 | |
michael@0 | 552 | if(c<0) { |
michael@0 | 553 | /* write the unmatched surrogate */ |
michael@0 | 554 | uint8_t *bytes=pArgs->converter->toUBytes; |
michael@0 | 555 | pArgs->converter->toULength=2; |
michael@0 | 556 | *bytes=*(s-2); |
michael@0 | 557 | bytes[1]=*(s-1); |
michael@0 | 558 | |
michael@0 | 559 | c=0xffff; |
michael@0 | 560 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 561 | } |
michael@0 | 562 | } |
michael@0 | 563 | |
michael@0 | 564 | pArgs->source=(const char *)s; |
michael@0 | 565 | return c; |
michael@0 | 566 | } |
michael@0 | 567 | |
michael@0 | 568 | static void |
michael@0 | 569 | _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { |
michael@0 | 570 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 571 | /* reset toUnicode state */ |
michael@0 | 572 | if(UCNV_GET_VERSION(cnv)==0) { |
michael@0 | 573 | cnv->mode=8; /* no BOM handling */ |
michael@0 | 574 | } else { |
michael@0 | 575 | cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ |
michael@0 | 576 | } |
michael@0 | 577 | } |
michael@0 | 578 | if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
michael@0 | 579 | /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ |
michael@0 | 580 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
michael@0 | 581 | } |
michael@0 | 582 | } |
michael@0 | 583 | |
michael@0 | 584 | static void |
michael@0 | 585 | _UTF16BEOpen(UConverter *cnv, |
michael@0 | 586 | UConverterLoadArgs *pArgs, |
michael@0 | 587 | UErrorCode *pErrorCode) { |
michael@0 | 588 | if(UCNV_GET_VERSION(cnv)<=1) { |
michael@0 | 589 | _UTF16BEReset(cnv, UCNV_RESET_BOTH); |
michael@0 | 590 | } else { |
michael@0 | 591 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 592 | } |
michael@0 | 593 | } |
michael@0 | 594 | |
michael@0 | 595 | static const char * |
michael@0 | 596 | _UTF16BEGetName(const UConverter *cnv) { |
michael@0 | 597 | if(UCNV_GET_VERSION(cnv)==0) { |
michael@0 | 598 | return "UTF-16BE"; |
michael@0 | 599 | } else { |
michael@0 | 600 | return "UTF-16BE,version=1"; |
michael@0 | 601 | } |
michael@0 | 602 | } |
michael@0 | 603 | |
michael@0 | 604 | static const UConverterImpl _UTF16BEImpl={ |
michael@0 | 605 | UCNV_UTF16_BigEndian, |
michael@0 | 606 | |
michael@0 | 607 | NULL, |
michael@0 | 608 | NULL, |
michael@0 | 609 | |
michael@0 | 610 | _UTF16BEOpen, |
michael@0 | 611 | NULL, |
michael@0 | 612 | _UTF16BEReset, |
michael@0 | 613 | |
michael@0 | 614 | _UTF16BEToUnicodeWithOffsets, |
michael@0 | 615 | _UTF16BEToUnicodeWithOffsets, |
michael@0 | 616 | _UTF16BEFromUnicodeWithOffsets, |
michael@0 | 617 | _UTF16BEFromUnicodeWithOffsets, |
michael@0 | 618 | _UTF16BEGetNextUChar, |
michael@0 | 619 | |
michael@0 | 620 | NULL, |
michael@0 | 621 | _UTF16BEGetName, |
michael@0 | 622 | NULL, |
michael@0 | 623 | NULL, |
michael@0 | 624 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 625 | }; |
michael@0 | 626 | |
michael@0 | 627 | static const UConverterStaticData _UTF16BEStaticData={ |
michael@0 | 628 | sizeof(UConverterStaticData), |
michael@0 | 629 | "UTF-16BE", |
michael@0 | 630 | 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, |
michael@0 | 631 | { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, |
michael@0 | 632 | 0, |
michael@0 | 633 | 0, |
michael@0 | 634 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 635 | }; |
michael@0 | 636 | |
michael@0 | 637 | |
michael@0 | 638 | const UConverterSharedData _UTF16BEData={ |
michael@0 | 639 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 640 | NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, |
michael@0 | 641 | 0 |
michael@0 | 642 | }; |
michael@0 | 643 | |
michael@0 | 644 | /* UTF-16LE ----------------------------------------------------------------- */ |
michael@0 | 645 | |
michael@0 | 646 | static void |
michael@0 | 647 | _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, |
michael@0 | 648 | UErrorCode *pErrorCode) { |
michael@0 | 649 | UConverter *cnv; |
michael@0 | 650 | const UChar *source; |
michael@0 | 651 | char *target; |
michael@0 | 652 | int32_t *offsets; |
michael@0 | 653 | |
michael@0 | 654 | uint32_t targetCapacity, length, sourceIndex; |
michael@0 | 655 | UChar c, trail; |
michael@0 | 656 | char overflow[4]; |
michael@0 | 657 | |
michael@0 | 658 | source=pArgs->source; |
michael@0 | 659 | length=(int32_t)(pArgs->sourceLimit-source); |
michael@0 | 660 | if(length<=0) { |
michael@0 | 661 | /* no input, nothing to do */ |
michael@0 | 662 | return; |
michael@0 | 663 | } |
michael@0 | 664 | |
michael@0 | 665 | cnv=pArgs->converter; |
michael@0 | 666 | |
michael@0 | 667 | /* write the BOM if necessary */ |
michael@0 | 668 | if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { |
michael@0 | 669 | static const char bom[]={ (char)0xff, (char)0xfe }; |
michael@0 | 670 | ucnv_fromUWriteBytes(cnv, |
michael@0 | 671 | bom, 2, |
michael@0 | 672 | &pArgs->target, pArgs->targetLimit, |
michael@0 | 673 | &pArgs->offsets, -1, |
michael@0 | 674 | pErrorCode); |
michael@0 | 675 | cnv->fromUnicodeStatus=0; |
michael@0 | 676 | } |
michael@0 | 677 | |
michael@0 | 678 | target=pArgs->target; |
michael@0 | 679 | if(target >= pArgs->targetLimit) { |
michael@0 | 680 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 681 | return; |
michael@0 | 682 | } |
michael@0 | 683 | |
michael@0 | 684 | targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
michael@0 | 685 | offsets=pArgs->offsets; |
michael@0 | 686 | sourceIndex=0; |
michael@0 | 687 | |
michael@0 | 688 | /* c!=0 indicates in several places outside the main loops that a surrogate was found */ |
michael@0 | 689 | |
michael@0 | 690 | if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { |
michael@0 | 691 | /* the last buffer ended with a lead surrogate, output the surrogate pair */ |
michael@0 | 692 | ++source; |
michael@0 | 693 | --length; |
michael@0 | 694 | target[0]=(uint8_t)c; |
michael@0 | 695 | target[1]=(uint8_t)(c>>8); |
michael@0 | 696 | target[2]=(uint8_t)trail; |
michael@0 | 697 | target[3]=(uint8_t)(trail>>8); |
michael@0 | 698 | target+=4; |
michael@0 | 699 | targetCapacity-=4; |
michael@0 | 700 | if(offsets!=NULL) { |
michael@0 | 701 | *offsets++=-1; |
michael@0 | 702 | *offsets++=-1; |
michael@0 | 703 | *offsets++=-1; |
michael@0 | 704 | *offsets++=-1; |
michael@0 | 705 | } |
michael@0 | 706 | sourceIndex=1; |
michael@0 | 707 | cnv->fromUChar32=c=0; |
michael@0 | 708 | } |
michael@0 | 709 | |
michael@0 | 710 | if(c==0) { |
michael@0 | 711 | /* copy an even number of bytes for complete UChars */ |
michael@0 | 712 | uint32_t count=2*length; |
michael@0 | 713 | if(count>targetCapacity) { |
michael@0 | 714 | count=targetCapacity&~1; |
michael@0 | 715 | } |
michael@0 | 716 | /* count is even */ |
michael@0 | 717 | targetCapacity-=count; |
michael@0 | 718 | count>>=1; |
michael@0 | 719 | length-=count; |
michael@0 | 720 | |
michael@0 | 721 | if(offsets==NULL) { |
michael@0 | 722 | while(count>0) { |
michael@0 | 723 | c=*source++; |
michael@0 | 724 | if(U16_IS_SINGLE(c)) { |
michael@0 | 725 | target[0]=(uint8_t)c; |
michael@0 | 726 | target[1]=(uint8_t)(c>>8); |
michael@0 | 727 | target+=2; |
michael@0 | 728 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
michael@0 | 729 | ++source; |
michael@0 | 730 | --count; |
michael@0 | 731 | target[0]=(uint8_t)c; |
michael@0 | 732 | target[1]=(uint8_t)(c>>8); |
michael@0 | 733 | target[2]=(uint8_t)trail; |
michael@0 | 734 | target[3]=(uint8_t)(trail>>8); |
michael@0 | 735 | target+=4; |
michael@0 | 736 | } else { |
michael@0 | 737 | break; |
michael@0 | 738 | } |
michael@0 | 739 | --count; |
michael@0 | 740 | } |
michael@0 | 741 | } else { |
michael@0 | 742 | while(count>0) { |
michael@0 | 743 | c=*source++; |
michael@0 | 744 | if(U16_IS_SINGLE(c)) { |
michael@0 | 745 | target[0]=(uint8_t)c; |
michael@0 | 746 | target[1]=(uint8_t)(c>>8); |
michael@0 | 747 | target+=2; |
michael@0 | 748 | *offsets++=sourceIndex; |
michael@0 | 749 | *offsets++=sourceIndex++; |
michael@0 | 750 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { |
michael@0 | 751 | ++source; |
michael@0 | 752 | --count; |
michael@0 | 753 | target[0]=(uint8_t)c; |
michael@0 | 754 | target[1]=(uint8_t)(c>>8); |
michael@0 | 755 | target[2]=(uint8_t)trail; |
michael@0 | 756 | target[3]=(uint8_t)(trail>>8); |
michael@0 | 757 | target+=4; |
michael@0 | 758 | *offsets++=sourceIndex; |
michael@0 | 759 | *offsets++=sourceIndex; |
michael@0 | 760 | *offsets++=sourceIndex; |
michael@0 | 761 | *offsets++=sourceIndex; |
michael@0 | 762 | sourceIndex+=2; |
michael@0 | 763 | } else { |
michael@0 | 764 | break; |
michael@0 | 765 | } |
michael@0 | 766 | --count; |
michael@0 | 767 | } |
michael@0 | 768 | } |
michael@0 | 769 | |
michael@0 | 770 | if(count==0) { |
michael@0 | 771 | /* done with the loop for complete UChars */ |
michael@0 | 772 | if(length>0 && targetCapacity>0) { |
michael@0 | 773 | /* |
michael@0 | 774 | * there is more input and some target capacity - |
michael@0 | 775 | * it must be targetCapacity==1 because otherwise |
michael@0 | 776 | * the above would have copied more; |
michael@0 | 777 | * prepare for overflow output |
michael@0 | 778 | */ |
michael@0 | 779 | if(U16_IS_SINGLE(c=*source++)) { |
michael@0 | 780 | overflow[0]=(char)c; |
michael@0 | 781 | overflow[1]=(char)(c>>8); |
michael@0 | 782 | length=2; /* 2 bytes to output */ |
michael@0 | 783 | c=0; |
michael@0 | 784 | /* } else { keep c for surrogate handling, length will be set there */ |
michael@0 | 785 | } |
michael@0 | 786 | } else { |
michael@0 | 787 | length=0; |
michael@0 | 788 | c=0; |
michael@0 | 789 | } |
michael@0 | 790 | } else { |
michael@0 | 791 | /* keep c for surrogate handling, length will be set there */ |
michael@0 | 792 | targetCapacity+=2*count; |
michael@0 | 793 | } |
michael@0 | 794 | } else { |
michael@0 | 795 | length=0; /* from here on, length counts the bytes in overflow[] */ |
michael@0 | 796 | } |
michael@0 | 797 | |
michael@0 | 798 | if(c!=0) { |
michael@0 | 799 | /* |
michael@0 | 800 | * c is a surrogate, and |
michael@0 | 801 | * - source or target too short |
michael@0 | 802 | * - or the surrogate is unmatched |
michael@0 | 803 | */ |
michael@0 | 804 | length=0; |
michael@0 | 805 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 806 | if(source<pArgs->sourceLimit) { |
michael@0 | 807 | if(U16_IS_TRAIL(trail=*source)) { |
michael@0 | 808 | /* output the surrogate pair, will overflow (see conditions comment above) */ |
michael@0 | 809 | ++source; |
michael@0 | 810 | overflow[0]=(char)c; |
michael@0 | 811 | overflow[1]=(char)(c>>8); |
michael@0 | 812 | overflow[2]=(char)trail; |
michael@0 | 813 | overflow[3]=(char)(trail>>8); |
michael@0 | 814 | length=4; /* 4 bytes to output */ |
michael@0 | 815 | c=0; |
michael@0 | 816 | } else { |
michael@0 | 817 | /* unmatched lead surrogate */ |
michael@0 | 818 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 819 | } |
michael@0 | 820 | } else { |
michael@0 | 821 | /* see if the trail surrogate is in the next buffer */ |
michael@0 | 822 | } |
michael@0 | 823 | } else { |
michael@0 | 824 | /* unmatched trail surrogate */ |
michael@0 | 825 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 826 | } |
michael@0 | 827 | cnv->fromUChar32=c; |
michael@0 | 828 | } |
michael@0 | 829 | |
michael@0 | 830 | if(length>0) { |
michael@0 | 831 | /* output length bytes with overflow (length>targetCapacity>0) */ |
michael@0 | 832 | ucnv_fromUWriteBytes(cnv, |
michael@0 | 833 | overflow, length, |
michael@0 | 834 | &target, pArgs->targetLimit, |
michael@0 | 835 | &offsets, sourceIndex, |
michael@0 | 836 | pErrorCode); |
michael@0 | 837 | targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); |
michael@0 | 838 | } |
michael@0 | 839 | |
michael@0 | 840 | if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { |
michael@0 | 841 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 842 | } |
michael@0 | 843 | |
michael@0 | 844 | /* write back the updated pointers */ |
michael@0 | 845 | pArgs->source=source; |
michael@0 | 846 | pArgs->target=target; |
michael@0 | 847 | pArgs->offsets=offsets; |
michael@0 | 848 | } |
michael@0 | 849 | |
michael@0 | 850 | static void |
michael@0 | 851 | _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 852 | UErrorCode *pErrorCode) { |
michael@0 | 853 | UConverter *cnv; |
michael@0 | 854 | const uint8_t *source; |
michael@0 | 855 | UChar *target; |
michael@0 | 856 | int32_t *offsets; |
michael@0 | 857 | |
michael@0 | 858 | uint32_t targetCapacity, length, count, sourceIndex; |
michael@0 | 859 | UChar c, trail; |
michael@0 | 860 | |
michael@0 | 861 | if(pArgs->converter->mode<8) { |
michael@0 | 862 | _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 863 | return; |
michael@0 | 864 | } |
michael@0 | 865 | |
michael@0 | 866 | cnv=pArgs->converter; |
michael@0 | 867 | source=(const uint8_t *)pArgs->source; |
michael@0 | 868 | length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); |
michael@0 | 869 | if(length<=0 && cnv->toUnicodeStatus==0) { |
michael@0 | 870 | /* no input, nothing to do */ |
michael@0 | 871 | return; |
michael@0 | 872 | } |
michael@0 | 873 | |
michael@0 | 874 | target=pArgs->target; |
michael@0 | 875 | if(target >= pArgs->targetLimit) { |
michael@0 | 876 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 877 | return; |
michael@0 | 878 | } |
michael@0 | 879 | |
michael@0 | 880 | targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); |
michael@0 | 881 | offsets=pArgs->offsets; |
michael@0 | 882 | sourceIndex=0; |
michael@0 | 883 | c=0; |
michael@0 | 884 | |
michael@0 | 885 | /* complete a partial UChar or pair from the last call */ |
michael@0 | 886 | if(cnv->toUnicodeStatus!=0) { |
michael@0 | 887 | /* |
michael@0 | 888 | * special case: single byte from a previous buffer, |
michael@0 | 889 | * where the byte turned out not to belong to a trail surrogate |
michael@0 | 890 | * and the preceding, unmatched lead surrogate was put into toUBytes[] |
michael@0 | 891 | * for error handling |
michael@0 | 892 | */ |
michael@0 | 893 | cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; |
michael@0 | 894 | cnv->toULength=1; |
michael@0 | 895 | cnv->toUnicodeStatus=0; |
michael@0 | 896 | } |
michael@0 | 897 | if((count=cnv->toULength)!=0) { |
michael@0 | 898 | uint8_t *p=cnv->toUBytes; |
michael@0 | 899 | do { |
michael@0 | 900 | p[count++]=*source++; |
michael@0 | 901 | ++sourceIndex; |
michael@0 | 902 | --length; |
michael@0 | 903 | if(count==2) { |
michael@0 | 904 | c=((UChar)p[1]<<8)|p[0]; |
michael@0 | 905 | if(U16_IS_SINGLE(c)) { |
michael@0 | 906 | /* output the BMP code point */ |
michael@0 | 907 | *target++=c; |
michael@0 | 908 | if(offsets!=NULL) { |
michael@0 | 909 | *offsets++=-1; |
michael@0 | 910 | } |
michael@0 | 911 | --targetCapacity; |
michael@0 | 912 | count=0; |
michael@0 | 913 | c=0; |
michael@0 | 914 | break; |
michael@0 | 915 | } else if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 916 | /* continue collecting bytes for the trail surrogate */ |
michael@0 | 917 | c=0; /* avoid unnecessary surrogate handling below */ |
michael@0 | 918 | } else { |
michael@0 | 919 | /* fall through to error handling for an unmatched trail surrogate */ |
michael@0 | 920 | break; |
michael@0 | 921 | } |
michael@0 | 922 | } else if(count==4) { |
michael@0 | 923 | c=((UChar)p[1]<<8)|p[0]; |
michael@0 | 924 | trail=((UChar)p[3]<<8)|p[2]; |
michael@0 | 925 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 926 | /* output the surrogate pair */ |
michael@0 | 927 | *target++=c; |
michael@0 | 928 | if(targetCapacity>=2) { |
michael@0 | 929 | *target++=trail; |
michael@0 | 930 | if(offsets!=NULL) { |
michael@0 | 931 | *offsets++=-1; |
michael@0 | 932 | *offsets++=-1; |
michael@0 | 933 | } |
michael@0 | 934 | targetCapacity-=2; |
michael@0 | 935 | } else /* targetCapacity==1 */ { |
michael@0 | 936 | targetCapacity=0; |
michael@0 | 937 | cnv->UCharErrorBuffer[0]=trail; |
michael@0 | 938 | cnv->UCharErrorBufferLength=1; |
michael@0 | 939 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 940 | } |
michael@0 | 941 | count=0; |
michael@0 | 942 | c=0; |
michael@0 | 943 | break; |
michael@0 | 944 | } else { |
michael@0 | 945 | /* unmatched lead surrogate, handle here for consistent toUBytes[] */ |
michael@0 | 946 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 947 | |
michael@0 | 948 | /* back out reading the code unit after it */ |
michael@0 | 949 | if(((const uint8_t *)pArgs->source-source)>=2) { |
michael@0 | 950 | source-=2; |
michael@0 | 951 | } else { |
michael@0 | 952 | /* |
michael@0 | 953 | * if the trail unit's first byte was in a previous buffer, then |
michael@0 | 954 | * we need to put it into a special place because toUBytes[] will be |
michael@0 | 955 | * used for the lead unit's bytes |
michael@0 | 956 | */ |
michael@0 | 957 | cnv->toUnicodeStatus=0x100|p[2]; |
michael@0 | 958 | --source; |
michael@0 | 959 | } |
michael@0 | 960 | cnv->toULength=2; |
michael@0 | 961 | |
michael@0 | 962 | /* write back the updated pointers */ |
michael@0 | 963 | pArgs->source=(const char *)source; |
michael@0 | 964 | pArgs->target=target; |
michael@0 | 965 | pArgs->offsets=offsets; |
michael@0 | 966 | return; |
michael@0 | 967 | } |
michael@0 | 968 | } |
michael@0 | 969 | } while(length>0); |
michael@0 | 970 | cnv->toULength=(int8_t)count; |
michael@0 | 971 | } |
michael@0 | 972 | |
michael@0 | 973 | /* copy an even number of bytes for complete UChars */ |
michael@0 | 974 | count=2*targetCapacity; |
michael@0 | 975 | if(count>length) { |
michael@0 | 976 | count=length&~1; |
michael@0 | 977 | } |
michael@0 | 978 | if(c==0 && count>0) { |
michael@0 | 979 | length-=count; |
michael@0 | 980 | count>>=1; |
michael@0 | 981 | targetCapacity-=count; |
michael@0 | 982 | if(offsets==NULL) { |
michael@0 | 983 | do { |
michael@0 | 984 | c=((UChar)source[1]<<8)|source[0]; |
michael@0 | 985 | source+=2; |
michael@0 | 986 | if(U16_IS_SINGLE(c)) { |
michael@0 | 987 | *target++=c; |
michael@0 | 988 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
michael@0 | 989 | U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
michael@0 | 990 | ) { |
michael@0 | 991 | source+=2; |
michael@0 | 992 | --count; |
michael@0 | 993 | *target++=c; |
michael@0 | 994 | *target++=trail; |
michael@0 | 995 | } else { |
michael@0 | 996 | break; |
michael@0 | 997 | } |
michael@0 | 998 | } while(--count>0); |
michael@0 | 999 | } else { |
michael@0 | 1000 | do { |
michael@0 | 1001 | c=((UChar)source[1]<<8)|source[0]; |
michael@0 | 1002 | source+=2; |
michael@0 | 1003 | if(U16_IS_SINGLE(c)) { |
michael@0 | 1004 | *target++=c; |
michael@0 | 1005 | *offsets++=sourceIndex; |
michael@0 | 1006 | sourceIndex+=2; |
michael@0 | 1007 | } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && |
michael@0 | 1008 | U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) |
michael@0 | 1009 | ) { |
michael@0 | 1010 | source+=2; |
michael@0 | 1011 | --count; |
michael@0 | 1012 | *target++=c; |
michael@0 | 1013 | *target++=trail; |
michael@0 | 1014 | *offsets++=sourceIndex; |
michael@0 | 1015 | *offsets++=sourceIndex; |
michael@0 | 1016 | sourceIndex+=4; |
michael@0 | 1017 | } else { |
michael@0 | 1018 | break; |
michael@0 | 1019 | } |
michael@0 | 1020 | } while(--count>0); |
michael@0 | 1021 | } |
michael@0 | 1022 | |
michael@0 | 1023 | if(count==0) { |
michael@0 | 1024 | /* done with the loop for complete UChars */ |
michael@0 | 1025 | c=0; |
michael@0 | 1026 | } else { |
michael@0 | 1027 | /* keep c for surrogate handling, trail will be set there */ |
michael@0 | 1028 | length+=2*(count-1); /* one more byte pair was consumed than count decremented */ |
michael@0 | 1029 | targetCapacity+=count; |
michael@0 | 1030 | } |
michael@0 | 1031 | } |
michael@0 | 1032 | |
michael@0 | 1033 | if(c!=0) { |
michael@0 | 1034 | /* |
michael@0 | 1035 | * c is a surrogate, and |
michael@0 | 1036 | * - source or target too short |
michael@0 | 1037 | * - or the surrogate is unmatched |
michael@0 | 1038 | */ |
michael@0 | 1039 | cnv->toUBytes[0]=(uint8_t)c; |
michael@0 | 1040 | cnv->toUBytes[1]=(uint8_t)(c>>8); |
michael@0 | 1041 | cnv->toULength=2; |
michael@0 | 1042 | |
michael@0 | 1043 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1044 | if(length>=2) { |
michael@0 | 1045 | if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { |
michael@0 | 1046 | /* output the surrogate pair, will overflow (see conditions comment above) */ |
michael@0 | 1047 | source+=2; |
michael@0 | 1048 | length-=2; |
michael@0 | 1049 | *target++=c; |
michael@0 | 1050 | if(offsets!=NULL) { |
michael@0 | 1051 | *offsets++=sourceIndex; |
michael@0 | 1052 | } |
michael@0 | 1053 | cnv->UCharErrorBuffer[0]=trail; |
michael@0 | 1054 | cnv->UCharErrorBufferLength=1; |
michael@0 | 1055 | cnv->toULength=0; |
michael@0 | 1056 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1057 | } else { |
michael@0 | 1058 | /* unmatched lead surrogate */ |
michael@0 | 1059 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1060 | } |
michael@0 | 1061 | } else { |
michael@0 | 1062 | /* see if the trail surrogate is in the next buffer */ |
michael@0 | 1063 | } |
michael@0 | 1064 | } else { |
michael@0 | 1065 | /* unmatched trail surrogate */ |
michael@0 | 1066 | *pErrorCode=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1067 | } |
michael@0 | 1068 | } |
michael@0 | 1069 | |
michael@0 | 1070 | if(U_SUCCESS(*pErrorCode)) { |
michael@0 | 1071 | /* check for a remaining source byte */ |
michael@0 | 1072 | if(length>0) { |
michael@0 | 1073 | if(targetCapacity==0) { |
michael@0 | 1074 | *pErrorCode=U_BUFFER_OVERFLOW_ERROR; |
michael@0 | 1075 | } else { |
michael@0 | 1076 | /* it must be length==1 because otherwise the above would have copied more */ |
michael@0 | 1077 | cnv->toUBytes[cnv->toULength++]=*source++; |
michael@0 | 1078 | } |
michael@0 | 1079 | } |
michael@0 | 1080 | } |
michael@0 | 1081 | |
michael@0 | 1082 | /* write back the updated pointers */ |
michael@0 | 1083 | pArgs->source=(const char *)source; |
michael@0 | 1084 | pArgs->target=target; |
michael@0 | 1085 | pArgs->offsets=offsets; |
michael@0 | 1086 | } |
michael@0 | 1087 | |
michael@0 | 1088 | static UChar32 |
michael@0 | 1089 | _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { |
michael@0 | 1090 | const uint8_t *s, *sourceLimit; |
michael@0 | 1091 | UChar32 c; |
michael@0 | 1092 | |
michael@0 | 1093 | if(pArgs->converter->mode<8) { |
michael@0 | 1094 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
michael@0 | 1095 | } |
michael@0 | 1096 | |
michael@0 | 1097 | s=(const uint8_t *)pArgs->source; |
michael@0 | 1098 | sourceLimit=(const uint8_t *)pArgs->sourceLimit; |
michael@0 | 1099 | |
michael@0 | 1100 | if(s>=sourceLimit) { |
michael@0 | 1101 | /* no input */ |
michael@0 | 1102 | *err=U_INDEX_OUTOFBOUNDS_ERROR; |
michael@0 | 1103 | return 0xffff; |
michael@0 | 1104 | } |
michael@0 | 1105 | |
michael@0 | 1106 | if(s+2>sourceLimit) { |
michael@0 | 1107 | /* only one byte: truncated UChar */ |
michael@0 | 1108 | pArgs->converter->toUBytes[0]=*s++; |
michael@0 | 1109 | pArgs->converter->toULength=1; |
michael@0 | 1110 | pArgs->source=(const char *)s; |
michael@0 | 1111 | *err = U_TRUNCATED_CHAR_FOUND; |
michael@0 | 1112 | return 0xffff; |
michael@0 | 1113 | } |
michael@0 | 1114 | |
michael@0 | 1115 | /* get one UChar */ |
michael@0 | 1116 | c=((UChar32)s[1]<<8)|*s; |
michael@0 | 1117 | s+=2; |
michael@0 | 1118 | |
michael@0 | 1119 | /* check for a surrogate pair */ |
michael@0 | 1120 | if(U_IS_SURROGATE(c)) { |
michael@0 | 1121 | if(U16_IS_SURROGATE_LEAD(c)) { |
michael@0 | 1122 | if(s+2<=sourceLimit) { |
michael@0 | 1123 | UChar trail; |
michael@0 | 1124 | |
michael@0 | 1125 | /* get a second UChar and see if it is a trail surrogate */ |
michael@0 | 1126 | trail=((UChar)s[1]<<8)|*s; |
michael@0 | 1127 | if(U16_IS_TRAIL(trail)) { |
michael@0 | 1128 | c=U16_GET_SUPPLEMENTARY(c, trail); |
michael@0 | 1129 | s+=2; |
michael@0 | 1130 | } else { |
michael@0 | 1131 | /* unmatched lead surrogate */ |
michael@0 | 1132 | c=-2; |
michael@0 | 1133 | } |
michael@0 | 1134 | } else { |
michael@0 | 1135 | /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ |
michael@0 | 1136 | uint8_t *bytes=pArgs->converter->toUBytes; |
michael@0 | 1137 | s-=2; |
michael@0 | 1138 | pArgs->converter->toULength=(int8_t)(sourceLimit-s); |
michael@0 | 1139 | do { |
michael@0 | 1140 | *bytes++=*s++; |
michael@0 | 1141 | } while(s<sourceLimit); |
michael@0 | 1142 | |
michael@0 | 1143 | c=0xffff; |
michael@0 | 1144 | *err=U_TRUNCATED_CHAR_FOUND; |
michael@0 | 1145 | } |
michael@0 | 1146 | } else { |
michael@0 | 1147 | /* unmatched trail surrogate */ |
michael@0 | 1148 | c=-2; |
michael@0 | 1149 | } |
michael@0 | 1150 | |
michael@0 | 1151 | if(c<0) { |
michael@0 | 1152 | /* write the unmatched surrogate */ |
michael@0 | 1153 | uint8_t *bytes=pArgs->converter->toUBytes; |
michael@0 | 1154 | pArgs->converter->toULength=2; |
michael@0 | 1155 | *bytes=*(s-2); |
michael@0 | 1156 | bytes[1]=*(s-1); |
michael@0 | 1157 | |
michael@0 | 1158 | c=0xffff; |
michael@0 | 1159 | *err=U_ILLEGAL_CHAR_FOUND; |
michael@0 | 1160 | } |
michael@0 | 1161 | } |
michael@0 | 1162 | |
michael@0 | 1163 | pArgs->source=(const char *)s; |
michael@0 | 1164 | return c; |
michael@0 | 1165 | } |
michael@0 | 1166 | |
michael@0 | 1167 | static void |
michael@0 | 1168 | _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { |
michael@0 | 1169 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 1170 | /* reset toUnicode state */ |
michael@0 | 1171 | if(UCNV_GET_VERSION(cnv)==0) { |
michael@0 | 1172 | cnv->mode=8; /* no BOM handling */ |
michael@0 | 1173 | } else { |
michael@0 | 1174 | cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ |
michael@0 | 1175 | } |
michael@0 | 1176 | } |
michael@0 | 1177 | if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { |
michael@0 | 1178 | /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ |
michael@0 | 1179 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
michael@0 | 1180 | } |
michael@0 | 1181 | } |
michael@0 | 1182 | |
michael@0 | 1183 | static void |
michael@0 | 1184 | _UTF16LEOpen(UConverter *cnv, |
michael@0 | 1185 | UConverterLoadArgs *pArgs, |
michael@0 | 1186 | UErrorCode *pErrorCode) { |
michael@0 | 1187 | if(UCNV_GET_VERSION(cnv)<=1) { |
michael@0 | 1188 | _UTF16LEReset(cnv, UCNV_RESET_BOTH); |
michael@0 | 1189 | } else { |
michael@0 | 1190 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1191 | } |
michael@0 | 1192 | } |
michael@0 | 1193 | |
michael@0 | 1194 | static const char * |
michael@0 | 1195 | _UTF16LEGetName(const UConverter *cnv) { |
michael@0 | 1196 | if(UCNV_GET_VERSION(cnv)==0) { |
michael@0 | 1197 | return "UTF-16LE"; |
michael@0 | 1198 | } else { |
michael@0 | 1199 | return "UTF-16LE,version=1"; |
michael@0 | 1200 | } |
michael@0 | 1201 | } |
michael@0 | 1202 | |
michael@0 | 1203 | static const UConverterImpl _UTF16LEImpl={ |
michael@0 | 1204 | UCNV_UTF16_LittleEndian, |
michael@0 | 1205 | |
michael@0 | 1206 | NULL, |
michael@0 | 1207 | NULL, |
michael@0 | 1208 | |
michael@0 | 1209 | _UTF16LEOpen, |
michael@0 | 1210 | NULL, |
michael@0 | 1211 | _UTF16LEReset, |
michael@0 | 1212 | |
michael@0 | 1213 | _UTF16LEToUnicodeWithOffsets, |
michael@0 | 1214 | _UTF16LEToUnicodeWithOffsets, |
michael@0 | 1215 | _UTF16LEFromUnicodeWithOffsets, |
michael@0 | 1216 | _UTF16LEFromUnicodeWithOffsets, |
michael@0 | 1217 | _UTF16LEGetNextUChar, |
michael@0 | 1218 | |
michael@0 | 1219 | NULL, |
michael@0 | 1220 | _UTF16LEGetName, |
michael@0 | 1221 | NULL, |
michael@0 | 1222 | NULL, |
michael@0 | 1223 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 1224 | }; |
michael@0 | 1225 | |
michael@0 | 1226 | |
michael@0 | 1227 | static const UConverterStaticData _UTF16LEStaticData={ |
michael@0 | 1228 | sizeof(UConverterStaticData), |
michael@0 | 1229 | "UTF-16LE", |
michael@0 | 1230 | 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, |
michael@0 | 1231 | { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, |
michael@0 | 1232 | 0, |
michael@0 | 1233 | 0, |
michael@0 | 1234 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1235 | }; |
michael@0 | 1236 | |
michael@0 | 1237 | |
michael@0 | 1238 | const UConverterSharedData _UTF16LEData={ |
michael@0 | 1239 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1240 | NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, |
michael@0 | 1241 | 0 |
michael@0 | 1242 | }; |
michael@0 | 1243 | |
michael@0 | 1244 | /* UTF-16 (Detect BOM) ------------------------------------------------------ */ |
michael@0 | 1245 | |
michael@0 | 1246 | /* |
michael@0 | 1247 | * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE |
michael@0 | 1248 | * accordingly. |
michael@0 | 1249 | * This is a simpler version of the UTF-32 converter, with |
michael@0 | 1250 | * fewer states for shorter BOMs. |
michael@0 | 1251 | * |
michael@0 | 1252 | * State values: |
michael@0 | 1253 | * 0 initial state |
michael@0 | 1254 | * 1 saw first byte |
michael@0 | 1255 | * 2..5 - |
michael@0 | 1256 | * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 |
michael@0 | 1257 | * 8 UTF-16BE mode |
michael@0 | 1258 | * 9 UTF-16LE mode |
michael@0 | 1259 | * |
michael@0 | 1260 | * During detection: state==number of initial bytes seen so far. |
michael@0 | 1261 | * |
michael@0 | 1262 | * On output, emit U+FEFF as the first code point. |
michael@0 | 1263 | * |
michael@0 | 1264 | * Variants: |
michael@0 | 1265 | * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. |
michael@0 | 1266 | * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and |
michael@0 | 1267 | * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. |
michael@0 | 1268 | */ |
michael@0 | 1269 | |
michael@0 | 1270 | static void |
michael@0 | 1271 | _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { |
michael@0 | 1272 | if(choice<=UCNV_RESET_TO_UNICODE) { |
michael@0 | 1273 | /* reset toUnicode: state=0 */ |
michael@0 | 1274 | cnv->mode=0; |
michael@0 | 1275 | } |
michael@0 | 1276 | if(choice!=UCNV_RESET_TO_UNICODE) { |
michael@0 | 1277 | /* reset fromUnicode: prepare to output the UTF-16PE BOM */ |
michael@0 | 1278 | cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; |
michael@0 | 1279 | } |
michael@0 | 1280 | } |
michael@0 | 1281 | |
michael@0 | 1282 | static const UConverterSharedData _UTF16v2Data; |
michael@0 | 1283 | |
michael@0 | 1284 | static void |
michael@0 | 1285 | _UTF16Open(UConverter *cnv, |
michael@0 | 1286 | UConverterLoadArgs *pArgs, |
michael@0 | 1287 | UErrorCode *pErrorCode) { |
michael@0 | 1288 | if(UCNV_GET_VERSION(cnv)<=2) { |
michael@0 | 1289 | if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { |
michael@0 | 1290 | /* |
michael@0 | 1291 | * Switch implementation, and switch the staticData that's different |
michael@0 | 1292 | * and was copied into the UConverter. |
michael@0 | 1293 | * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) |
michael@0 | 1294 | * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. |
michael@0 | 1295 | */ |
michael@0 | 1296 | cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; |
michael@0 | 1297 | uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); |
michael@0 | 1298 | } |
michael@0 | 1299 | _UTF16Reset(cnv, UCNV_RESET_BOTH); |
michael@0 | 1300 | } else { |
michael@0 | 1301 | *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; |
michael@0 | 1302 | } |
michael@0 | 1303 | } |
michael@0 | 1304 | |
michael@0 | 1305 | static const char * |
michael@0 | 1306 | _UTF16GetName(const UConverter *cnv) { |
michael@0 | 1307 | if(UCNV_GET_VERSION(cnv)==0) { |
michael@0 | 1308 | return "UTF-16"; |
michael@0 | 1309 | } else if(UCNV_GET_VERSION(cnv)==1) { |
michael@0 | 1310 | return "UTF-16,version=1"; |
michael@0 | 1311 | } else { |
michael@0 | 1312 | return "UTF-16,version=2"; |
michael@0 | 1313 | } |
michael@0 | 1314 | } |
michael@0 | 1315 | |
michael@0 | 1316 | const UConverterSharedData _UTF16Data; |
michael@0 | 1317 | |
michael@0 | 1318 | #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) |
michael@0 | 1319 | #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) |
michael@0 | 1320 | #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) |
michael@0 | 1321 | |
michael@0 | 1322 | static void |
michael@0 | 1323 | _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, |
michael@0 | 1324 | UErrorCode *pErrorCode) { |
michael@0 | 1325 | UConverter *cnv=pArgs->converter; |
michael@0 | 1326 | const char *source=pArgs->source; |
michael@0 | 1327 | const char *sourceLimit=pArgs->sourceLimit; |
michael@0 | 1328 | int32_t *offsets=pArgs->offsets; |
michael@0 | 1329 | |
michael@0 | 1330 | int32_t state, offsetDelta; |
michael@0 | 1331 | uint8_t b; |
michael@0 | 1332 | |
michael@0 | 1333 | state=cnv->mode; |
michael@0 | 1334 | |
michael@0 | 1335 | /* |
michael@0 | 1336 | * If we detect a BOM in this buffer, then we must add the BOM size to the |
michael@0 | 1337 | * offsets because the actual converter function will not see and count the BOM. |
michael@0 | 1338 | * offsetDelta will have the number of the BOM bytes that are in the current buffer. |
michael@0 | 1339 | */ |
michael@0 | 1340 | offsetDelta=0; |
michael@0 | 1341 | |
michael@0 | 1342 | while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { |
michael@0 | 1343 | switch(state) { |
michael@0 | 1344 | case 0: |
michael@0 | 1345 | cnv->toUBytes[0]=(uint8_t)*source++; |
michael@0 | 1346 | cnv->toULength=1; |
michael@0 | 1347 | state=1; |
michael@0 | 1348 | break; |
michael@0 | 1349 | case 1: |
michael@0 | 1350 | /* |
michael@0 | 1351 | * Only inside this switch case can the state variable |
michael@0 | 1352 | * temporarily take two additional values: |
michael@0 | 1353 | * 6: BOM error, continue with BE |
michael@0 | 1354 | * 7: BOM error, continue with LE |
michael@0 | 1355 | */ |
michael@0 | 1356 | b=*source; |
michael@0 | 1357 | if(cnv->toUBytes[0]==0xfe && b==0xff) { |
michael@0 | 1358 | if(IS_UTF16LE(cnv)) { |
michael@0 | 1359 | state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ |
michael@0 | 1360 | } else { |
michael@0 | 1361 | state=8; /* detect UTF-16BE */ |
michael@0 | 1362 | } |
michael@0 | 1363 | } else if(cnv->toUBytes[0]==0xff && b==0xfe) { |
michael@0 | 1364 | if(IS_UTF16BE(cnv)) { |
michael@0 | 1365 | state=6; /* illegal reverse BOM for Java "UnicodeBig" */ |
michael@0 | 1366 | } else { |
michael@0 | 1367 | state=9; /* detect UTF-16LE */ |
michael@0 | 1368 | } |
michael@0 | 1369 | } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { |
michael@0 | 1370 | state=6; /* illegal missing BOM for Java "Unicode" */ |
michael@0 | 1371 | } |
michael@0 | 1372 | if(state>=8) { |
michael@0 | 1373 | /* BOM detected, consume it */ |
michael@0 | 1374 | ++source; |
michael@0 | 1375 | cnv->toULength=0; |
michael@0 | 1376 | offsetDelta=(int32_t)(source-pArgs->source); |
michael@0 | 1377 | } else if(state<6) { |
michael@0 | 1378 | /* ok: no BOM, and not a reverse BOM */ |
michael@0 | 1379 | if(source!=pArgs->source) { |
michael@0 | 1380 | /* reset the source for a correct first offset */ |
michael@0 | 1381 | source=pArgs->source; |
michael@0 | 1382 | cnv->toULength=0; |
michael@0 | 1383 | } |
michael@0 | 1384 | if(IS_UTF16LE(cnv)) { |
michael@0 | 1385 | /* Make Java "UnicodeLittle" default to LE. */ |
michael@0 | 1386 | state=9; |
michael@0 | 1387 | } else { |
michael@0 | 1388 | /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ |
michael@0 | 1389 | state=8; |
michael@0 | 1390 | } |
michael@0 | 1391 | } else { |
michael@0 | 1392 | /* |
michael@0 | 1393 | * error: missing BOM, or reverse BOM |
michael@0 | 1394 | * UTF-16,version=1: Java-specific "Unicode" requires a BOM. |
michael@0 | 1395 | * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. |
michael@0 | 1396 | * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. |
michael@0 | 1397 | */ |
michael@0 | 1398 | /* report the non-BOM or reverse BOM as an illegal sequence */ |
michael@0 | 1399 | cnv->toUBytes[1]=b; |
michael@0 | 1400 | cnv->toULength=2; |
michael@0 | 1401 | pArgs->source=source+1; |
michael@0 | 1402 | /* continue with conversion if the callback resets the error */ |
michael@0 | 1403 | /* |
michael@0 | 1404 | * Make Java "Unicode" default to BE like standard UTF-16. |
michael@0 | 1405 | * Make Java "UnicodeBig" and "UnicodeLittle" default |
michael@0 | 1406 | * to their normal endiannesses. |
michael@0 | 1407 | */ |
michael@0 | 1408 | cnv->mode=state+2; |
michael@0 | 1409 | *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; |
michael@0 | 1410 | return; |
michael@0 | 1411 | } |
michael@0 | 1412 | /* convert the rest of the stream */ |
michael@0 | 1413 | cnv->mode=state; |
michael@0 | 1414 | continue; |
michael@0 | 1415 | case 8: |
michael@0 | 1416 | /* call UTF-16BE */ |
michael@0 | 1417 | pArgs->source=source; |
michael@0 | 1418 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 1419 | source=pArgs->source; |
michael@0 | 1420 | break; |
michael@0 | 1421 | case 9: |
michael@0 | 1422 | /* call UTF-16LE */ |
michael@0 | 1423 | pArgs->source=source; |
michael@0 | 1424 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 1425 | source=pArgs->source; |
michael@0 | 1426 | break; |
michael@0 | 1427 | default: |
michael@0 | 1428 | break; /* does not occur */ |
michael@0 | 1429 | } |
michael@0 | 1430 | } |
michael@0 | 1431 | |
michael@0 | 1432 | /* add BOM size to offsets - see comment at offsetDelta declaration */ |
michael@0 | 1433 | if(offsets!=NULL && offsetDelta!=0) { |
michael@0 | 1434 | int32_t *offsetsLimit=pArgs->offsets; |
michael@0 | 1435 | while(offsets<offsetsLimit) { |
michael@0 | 1436 | *offsets++ += offsetDelta; |
michael@0 | 1437 | } |
michael@0 | 1438 | } |
michael@0 | 1439 | |
michael@0 | 1440 | pArgs->source=source; |
michael@0 | 1441 | |
michael@0 | 1442 | if(source==sourceLimit && pArgs->flush) { |
michael@0 | 1443 | /* handle truncated input */ |
michael@0 | 1444 | switch(state) { |
michael@0 | 1445 | case 0: |
michael@0 | 1446 | break; /* no input at all, nothing to do */ |
michael@0 | 1447 | case 8: |
michael@0 | 1448 | _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 1449 | break; |
michael@0 | 1450 | case 9: |
michael@0 | 1451 | _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); |
michael@0 | 1452 | break; |
michael@0 | 1453 | default: |
michael@0 | 1454 | /* 0<state<8: framework will report truncation, nothing to do here */ |
michael@0 | 1455 | break; |
michael@0 | 1456 | } |
michael@0 | 1457 | } |
michael@0 | 1458 | |
michael@0 | 1459 | cnv->mode=state; |
michael@0 | 1460 | } |
michael@0 | 1461 | |
michael@0 | 1462 | static UChar32 |
michael@0 | 1463 | _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, |
michael@0 | 1464 | UErrorCode *pErrorCode) { |
michael@0 | 1465 | switch(pArgs->converter->mode) { |
michael@0 | 1466 | case 8: |
michael@0 | 1467 | return _UTF16BEGetNextUChar(pArgs, pErrorCode); |
michael@0 | 1468 | case 9: |
michael@0 | 1469 | return _UTF16LEGetNextUChar(pArgs, pErrorCode); |
michael@0 | 1470 | default: |
michael@0 | 1471 | return UCNV_GET_NEXT_UCHAR_USE_TO_U; |
michael@0 | 1472 | } |
michael@0 | 1473 | } |
michael@0 | 1474 | |
michael@0 | 1475 | static const UConverterImpl _UTF16Impl = { |
michael@0 | 1476 | UCNV_UTF16, |
michael@0 | 1477 | |
michael@0 | 1478 | NULL, |
michael@0 | 1479 | NULL, |
michael@0 | 1480 | |
michael@0 | 1481 | _UTF16Open, |
michael@0 | 1482 | NULL, |
michael@0 | 1483 | _UTF16Reset, |
michael@0 | 1484 | |
michael@0 | 1485 | _UTF16ToUnicodeWithOffsets, |
michael@0 | 1486 | _UTF16ToUnicodeWithOffsets, |
michael@0 | 1487 | _UTF16PEFromUnicodeWithOffsets, |
michael@0 | 1488 | _UTF16PEFromUnicodeWithOffsets, |
michael@0 | 1489 | _UTF16GetNextUChar, |
michael@0 | 1490 | |
michael@0 | 1491 | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
michael@0 | 1492 | _UTF16GetName, |
michael@0 | 1493 | NULL, |
michael@0 | 1494 | NULL, |
michael@0 | 1495 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 1496 | }; |
michael@0 | 1497 | |
michael@0 | 1498 | static const UConverterStaticData _UTF16StaticData = { |
michael@0 | 1499 | sizeof(UConverterStaticData), |
michael@0 | 1500 | "UTF-16", |
michael@0 | 1501 | 1204, /* CCSID for BOM sensitive UTF-16 */ |
michael@0 | 1502 | UCNV_IBM, UCNV_UTF16, 2, 2, |
michael@0 | 1503 | #if U_IS_BIG_ENDIAN |
michael@0 | 1504 | { 0xff, 0xfd, 0, 0 }, 2, |
michael@0 | 1505 | #else |
michael@0 | 1506 | { 0xfd, 0xff, 0, 0 }, 2, |
michael@0 | 1507 | #endif |
michael@0 | 1508 | FALSE, FALSE, |
michael@0 | 1509 | 0, |
michael@0 | 1510 | 0, |
michael@0 | 1511 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1512 | }; |
michael@0 | 1513 | |
michael@0 | 1514 | const UConverterSharedData _UTF16Data = { |
michael@0 | 1515 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1516 | NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, |
michael@0 | 1517 | 0 |
michael@0 | 1518 | }; |
michael@0 | 1519 | |
michael@0 | 1520 | static const UConverterImpl _UTF16v2Impl = { |
michael@0 | 1521 | UCNV_UTF16, |
michael@0 | 1522 | |
michael@0 | 1523 | NULL, |
michael@0 | 1524 | NULL, |
michael@0 | 1525 | |
michael@0 | 1526 | _UTF16Open, |
michael@0 | 1527 | NULL, |
michael@0 | 1528 | _UTF16Reset, |
michael@0 | 1529 | |
michael@0 | 1530 | _UTF16ToUnicodeWithOffsets, |
michael@0 | 1531 | _UTF16ToUnicodeWithOffsets, |
michael@0 | 1532 | _UTF16BEFromUnicodeWithOffsets, |
michael@0 | 1533 | _UTF16BEFromUnicodeWithOffsets, |
michael@0 | 1534 | _UTF16GetNextUChar, |
michael@0 | 1535 | |
michael@0 | 1536 | NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ |
michael@0 | 1537 | _UTF16GetName, |
michael@0 | 1538 | NULL, |
michael@0 | 1539 | NULL, |
michael@0 | 1540 | ucnv_getNonSurrogateUnicodeSet |
michael@0 | 1541 | }; |
michael@0 | 1542 | |
michael@0 | 1543 | static const UConverterStaticData _UTF16v2StaticData = { |
michael@0 | 1544 | sizeof(UConverterStaticData), |
michael@0 | 1545 | "UTF-16,version=2", |
michael@0 | 1546 | 1204, /* CCSID for BOM sensitive UTF-16 */ |
michael@0 | 1547 | UCNV_IBM, UCNV_UTF16, 2, 2, |
michael@0 | 1548 | { 0xff, 0xfd, 0, 0 }, 2, |
michael@0 | 1549 | FALSE, FALSE, |
michael@0 | 1550 | 0, |
michael@0 | 1551 | 0, |
michael@0 | 1552 | { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ |
michael@0 | 1553 | }; |
michael@0 | 1554 | |
michael@0 | 1555 | static const UConverterSharedData _UTF16v2Data = { |
michael@0 | 1556 | sizeof(UConverterSharedData), ~((uint32_t) 0), |
michael@0 | 1557 | NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, |
michael@0 | 1558 | 0 |
michael@0 | 1559 | }; |
michael@0 | 1560 | |
michael@0 | 1561 | #endif |