intl/icu/source/common/ucnv_u16.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/icu/source/common/ucnv_u16.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1561 @@
     1.4 +/*  
     1.5 +**********************************************************************
     1.6 +*   Copyright (C) 2002-2010, International Business Machines
     1.7 +*   Corporation and others.  All Rights Reserved.
     1.8 +**********************************************************************
     1.9 +*   file name:  ucnv_u16.c
    1.10 +*   encoding:   US-ASCII
    1.11 +*   tab size:   8 (not used)
    1.12 +*   indentation:4
    1.13 +*
    1.14 +*   created on: 2002jul01
    1.15 +*   created by: Markus W. Scherer
    1.16 +*
    1.17 +*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
    1.18 +*/
    1.19 +
    1.20 +#include "unicode/utypes.h"
    1.21 +
    1.22 +#if !UCONFIG_NO_CONVERSION
    1.23 +
    1.24 +#include "unicode/ucnv.h"
    1.25 +#include "ucnv_bld.h"
    1.26 +#include "ucnv_cnv.h"
    1.27 +#include "cmemory.h"
    1.28 +
    1.29 +enum {
    1.30 +    UCNV_NEED_TO_WRITE_BOM=1
    1.31 +};
    1.32 +
    1.33 +/*
    1.34 + * The UTF-16 toUnicode implementation is also used for the Java-specific
    1.35 + * "with BOM" variants of UTF-16BE and UTF-16LE.
    1.36 + */
    1.37 +static void
    1.38 +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    1.39 +                           UErrorCode *pErrorCode);
    1.40 +
    1.41 +/* UTF-16BE ----------------------------------------------------------------- */
    1.42 +
    1.43 +#if U_IS_BIG_ENDIAN
    1.44 +#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
    1.45 +#else
    1.46 +#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
    1.47 +#endif
    1.48 +
    1.49 +
    1.50 +static void
    1.51 +_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    1.52 +                               UErrorCode *pErrorCode) {
    1.53 +    UConverter *cnv;
    1.54 +    const UChar *source;
    1.55 +    char *target;
    1.56 +    int32_t *offsets;
    1.57 +
    1.58 +    uint32_t targetCapacity, length, sourceIndex;
    1.59 +    UChar c, trail;
    1.60 +    char overflow[4];
    1.61 +
    1.62 +    source=pArgs->source;
    1.63 +    length=(int32_t)(pArgs->sourceLimit-source);
    1.64 +    if(length<=0) {
    1.65 +        /* no input, nothing to do */
    1.66 +        return;
    1.67 +    }
    1.68 +
    1.69 +    cnv=pArgs->converter;
    1.70 +
    1.71 +    /* write the BOM if necessary */
    1.72 +    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    1.73 +        static const char bom[]={ (char)0xfe, (char)0xff };
    1.74 +        ucnv_fromUWriteBytes(cnv,
    1.75 +                             bom, 2,
    1.76 +                             &pArgs->target, pArgs->targetLimit,
    1.77 +                             &pArgs->offsets, -1,
    1.78 +                             pErrorCode);
    1.79 +        cnv->fromUnicodeStatus=0;
    1.80 +    }
    1.81 +
    1.82 +    target=pArgs->target;
    1.83 +    if(target >= pArgs->targetLimit) {
    1.84 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    1.85 +        return;
    1.86 +    }
    1.87 +
    1.88 +    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    1.89 +    offsets=pArgs->offsets;
    1.90 +    sourceIndex=0;
    1.91 +
    1.92 +    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
    1.93 +
    1.94 +    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
    1.95 +        /* the last buffer ended with a lead surrogate, output the surrogate pair */
    1.96 +        ++source;
    1.97 +        --length;
    1.98 +        target[0]=(uint8_t)(c>>8);
    1.99 +        target[1]=(uint8_t)c;
   1.100 +        target[2]=(uint8_t)(trail>>8);
   1.101 +        target[3]=(uint8_t)trail;
   1.102 +        target+=4;
   1.103 +        targetCapacity-=4;
   1.104 +        if(offsets!=NULL) {
   1.105 +            *offsets++=-1;
   1.106 +            *offsets++=-1;
   1.107 +            *offsets++=-1;
   1.108 +            *offsets++=-1;
   1.109 +        }
   1.110 +        sourceIndex=1;
   1.111 +        cnv->fromUChar32=c=0;
   1.112 +    }
   1.113 +
   1.114 +    if(c==0) {
   1.115 +        /* copy an even number of bytes for complete UChars */
   1.116 +        uint32_t count=2*length;
   1.117 +        if(count>targetCapacity) {
   1.118 +            count=targetCapacity&~1;
   1.119 +        }
   1.120 +        /* count is even */
   1.121 +        targetCapacity-=count;
   1.122 +        count>>=1;
   1.123 +        length-=count;
   1.124 +
   1.125 +        if(offsets==NULL) {
   1.126 +            while(count>0) {
   1.127 +                c=*source++;
   1.128 +                if(U16_IS_SINGLE(c)) {
   1.129 +                    target[0]=(uint8_t)(c>>8);
   1.130 +                    target[1]=(uint8_t)c;
   1.131 +                    target+=2;
   1.132 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   1.133 +                    ++source;
   1.134 +                    --count;
   1.135 +                    target[0]=(uint8_t)(c>>8);
   1.136 +                    target[1]=(uint8_t)c;
   1.137 +                    target[2]=(uint8_t)(trail>>8);
   1.138 +                    target[3]=(uint8_t)trail;
   1.139 +                    target+=4;
   1.140 +                } else {
   1.141 +                    break;
   1.142 +                }
   1.143 +                --count;
   1.144 +            }
   1.145 +        } else {
   1.146 +            while(count>0) {
   1.147 +                c=*source++;
   1.148 +                if(U16_IS_SINGLE(c)) {
   1.149 +                    target[0]=(uint8_t)(c>>8);
   1.150 +                    target[1]=(uint8_t)c;
   1.151 +                    target+=2;
   1.152 +                    *offsets++=sourceIndex;
   1.153 +                    *offsets++=sourceIndex++;
   1.154 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   1.155 +                    ++source;
   1.156 +                    --count;
   1.157 +                    target[0]=(uint8_t)(c>>8);
   1.158 +                    target[1]=(uint8_t)c;
   1.159 +                    target[2]=(uint8_t)(trail>>8);
   1.160 +                    target[3]=(uint8_t)trail;
   1.161 +                    target+=4;
   1.162 +                    *offsets++=sourceIndex;
   1.163 +                    *offsets++=sourceIndex;
   1.164 +                    *offsets++=sourceIndex;
   1.165 +                    *offsets++=sourceIndex;
   1.166 +                    sourceIndex+=2;
   1.167 +                } else {
   1.168 +                    break;
   1.169 +                }
   1.170 +                --count;
   1.171 +            }
   1.172 +        }
   1.173 +
   1.174 +        if(count==0) {
   1.175 +            /* done with the loop for complete UChars */
   1.176 +            if(length>0 && targetCapacity>0) {
   1.177 +                /*
   1.178 +                 * there is more input and some target capacity -
   1.179 +                 * it must be targetCapacity==1 because otherwise
   1.180 +                 * the above would have copied more;
   1.181 +                 * prepare for overflow output
   1.182 +                 */
   1.183 +                if(U16_IS_SINGLE(c=*source++)) {
   1.184 +                    overflow[0]=(char)(c>>8);
   1.185 +                    overflow[1]=(char)c;
   1.186 +                    length=2; /* 2 bytes to output */
   1.187 +                    c=0;
   1.188 +                /* } else { keep c for surrogate handling, length will be set there */
   1.189 +                }
   1.190 +            } else {
   1.191 +                length=0;
   1.192 +                c=0;
   1.193 +            }
   1.194 +        } else {
   1.195 +            /* keep c for surrogate handling, length will be set there */
   1.196 +            targetCapacity+=2*count;
   1.197 +        }
   1.198 +    } else {
   1.199 +        length=0; /* from here on, length counts the bytes in overflow[] */
   1.200 +    }
   1.201 +    
   1.202 +    if(c!=0) {
   1.203 +        /*
   1.204 +         * c is a surrogate, and
   1.205 +         * - source or target too short
   1.206 +         * - or the surrogate is unmatched
   1.207 +         */
   1.208 +        length=0;
   1.209 +        if(U16_IS_SURROGATE_LEAD(c)) {
   1.210 +            if(source<pArgs->sourceLimit) {
   1.211 +                if(U16_IS_TRAIL(trail=*source)) {
   1.212 +                    /* output the surrogate pair, will overflow (see conditions comment above) */
   1.213 +                    ++source;
   1.214 +                    overflow[0]=(char)(c>>8);
   1.215 +                    overflow[1]=(char)c;
   1.216 +                    overflow[2]=(char)(trail>>8);
   1.217 +                    overflow[3]=(char)trail;
   1.218 +                    length=4; /* 4 bytes to output */
   1.219 +                    c=0;
   1.220 +                } else {
   1.221 +                    /* unmatched lead surrogate */
   1.222 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.223 +                }
   1.224 +            } else {
   1.225 +                /* see if the trail surrogate is in the next buffer */
   1.226 +            }
   1.227 +        } else {
   1.228 +            /* unmatched trail surrogate */
   1.229 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.230 +        }
   1.231 +        cnv->fromUChar32=c;
   1.232 +    }
   1.233 +
   1.234 +    if(length>0) {
   1.235 +        /* output length bytes with overflow (length>targetCapacity>0) */
   1.236 +        ucnv_fromUWriteBytes(cnv,
   1.237 +                             overflow, length,
   1.238 +                             (char **)&target, pArgs->targetLimit,
   1.239 +                             &offsets, sourceIndex,
   1.240 +                             pErrorCode);
   1.241 +        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
   1.242 +    }
   1.243 +
   1.244 +    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
   1.245 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.246 +    }
   1.247 +
   1.248 +    /* write back the updated pointers */
   1.249 +    pArgs->source=source;
   1.250 +    pArgs->target=(char *)target;
   1.251 +    pArgs->offsets=offsets;
   1.252 +}
   1.253 +
   1.254 +static void
   1.255 +_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1.256 +                             UErrorCode *pErrorCode) {
   1.257 +    UConverter *cnv;
   1.258 +    const uint8_t *source;
   1.259 +    UChar *target;
   1.260 +    int32_t *offsets;
   1.261 +
   1.262 +    uint32_t targetCapacity, length, count, sourceIndex;
   1.263 +    UChar c, trail;
   1.264 +
   1.265 +    if(pArgs->converter->mode<8) {
   1.266 +        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
   1.267 +        return;
   1.268 +    }
   1.269 +
   1.270 +    cnv=pArgs->converter;
   1.271 +    source=(const uint8_t *)pArgs->source;
   1.272 +    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
   1.273 +    if(length<=0 && cnv->toUnicodeStatus==0) {
   1.274 +        /* no input, nothing to do */
   1.275 +        return;
   1.276 +    }
   1.277 +
   1.278 +    target=pArgs->target;
   1.279 +    if(target >= pArgs->targetLimit) {
   1.280 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.281 +        return;
   1.282 +    }
   1.283 +
   1.284 +    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
   1.285 +    offsets=pArgs->offsets;
   1.286 +    sourceIndex=0;
   1.287 +    c=0;
   1.288 +
   1.289 +    /* complete a partial UChar or pair from the last call */
   1.290 +    if(cnv->toUnicodeStatus!=0) {
   1.291 +        /*
   1.292 +         * special case: single byte from a previous buffer,
   1.293 +         * where the byte turned out not to belong to a trail surrogate
   1.294 +         * and the preceding, unmatched lead surrogate was put into toUBytes[]
   1.295 +         * for error handling
   1.296 +         */
   1.297 +        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
   1.298 +        cnv->toULength=1;
   1.299 +        cnv->toUnicodeStatus=0;
   1.300 +    }
   1.301 +    if((count=cnv->toULength)!=0) {
   1.302 +        uint8_t *p=cnv->toUBytes;
   1.303 +        do {
   1.304 +            p[count++]=*source++;
   1.305 +            ++sourceIndex;
   1.306 +            --length;
   1.307 +            if(count==2) {
   1.308 +                c=((UChar)p[0]<<8)|p[1];
   1.309 +                if(U16_IS_SINGLE(c)) {
   1.310 +                    /* output the BMP code point */
   1.311 +                    *target++=c;
   1.312 +                    if(offsets!=NULL) {
   1.313 +                        *offsets++=-1;
   1.314 +                    }
   1.315 +                    --targetCapacity;
   1.316 +                    count=0;
   1.317 +                    c=0;
   1.318 +                    break;
   1.319 +                } else if(U16_IS_SURROGATE_LEAD(c)) {
   1.320 +                    /* continue collecting bytes for the trail surrogate */
   1.321 +                    c=0; /* avoid unnecessary surrogate handling below */
   1.322 +                } else {
   1.323 +                    /* fall through to error handling for an unmatched trail surrogate */
   1.324 +                    break;
   1.325 +                }
   1.326 +            } else if(count==4) {
   1.327 +                c=((UChar)p[0]<<8)|p[1];
   1.328 +                trail=((UChar)p[2]<<8)|p[3];
   1.329 +                if(U16_IS_TRAIL(trail)) {
   1.330 +                    /* output the surrogate pair */
   1.331 +                    *target++=c;
   1.332 +                    if(targetCapacity>=2) {
   1.333 +                        *target++=trail;
   1.334 +                        if(offsets!=NULL) {
   1.335 +                            *offsets++=-1;
   1.336 +                            *offsets++=-1;
   1.337 +                        }
   1.338 +                        targetCapacity-=2;
   1.339 +                    } else /* targetCapacity==1 */ {
   1.340 +                        targetCapacity=0;
   1.341 +                        cnv->UCharErrorBuffer[0]=trail;
   1.342 +                        cnv->UCharErrorBufferLength=1;
   1.343 +                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.344 +                    }
   1.345 +                    count=0;
   1.346 +                    c=0;
   1.347 +                    break;
   1.348 +                } else {
   1.349 +                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
   1.350 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.351 +
   1.352 +                    /* back out reading the code unit after it */
   1.353 +                    if(((const uint8_t *)pArgs->source-source)>=2) {
   1.354 +                        source-=2;
   1.355 +                    } else {
   1.356 +                        /*
   1.357 +                         * if the trail unit's first byte was in a previous buffer, then
   1.358 +                         * we need to put it into a special place because toUBytes[] will be
   1.359 +                         * used for the lead unit's bytes
   1.360 +                         */
   1.361 +                        cnv->toUnicodeStatus=0x100|p[2];
   1.362 +                        --source;
   1.363 +                    }
   1.364 +                    cnv->toULength=2;
   1.365 +
   1.366 +                    /* write back the updated pointers */
   1.367 +                    pArgs->source=(const char *)source;
   1.368 +                    pArgs->target=target;
   1.369 +                    pArgs->offsets=offsets;
   1.370 +                    return;
   1.371 +                }
   1.372 +            }
   1.373 +        } while(length>0);
   1.374 +        cnv->toULength=(int8_t)count;
   1.375 +    }
   1.376 +
   1.377 +    /* copy an even number of bytes for complete UChars */
   1.378 +    count=2*targetCapacity;
   1.379 +    if(count>length) {
   1.380 +        count=length&~1;
   1.381 +    }
   1.382 +    if(c==0 && count>0) {
   1.383 +        length-=count;
   1.384 +        count>>=1;
   1.385 +        targetCapacity-=count;
   1.386 +        if(offsets==NULL) {
   1.387 +            do {
   1.388 +                c=((UChar)source[0]<<8)|source[1];
   1.389 +                source+=2;
   1.390 +                if(U16_IS_SINGLE(c)) {
   1.391 +                    *target++=c;
   1.392 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1.393 +                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
   1.394 +                ) {
   1.395 +                    source+=2;
   1.396 +                    --count;
   1.397 +                    *target++=c;
   1.398 +                    *target++=trail;
   1.399 +                } else {
   1.400 +                    break;
   1.401 +                }
   1.402 +            } while(--count>0);
   1.403 +        } else {
   1.404 +            do {
   1.405 +                c=((UChar)source[0]<<8)|source[1];
   1.406 +                source+=2;
   1.407 +                if(U16_IS_SINGLE(c)) {
   1.408 +                    *target++=c;
   1.409 +                    *offsets++=sourceIndex;
   1.410 +                    sourceIndex+=2;
   1.411 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1.412 +                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
   1.413 +                ) {
   1.414 +                    source+=2;
   1.415 +                    --count;
   1.416 +                    *target++=c;
   1.417 +                    *target++=trail;
   1.418 +                    *offsets++=sourceIndex;
   1.419 +                    *offsets++=sourceIndex;
   1.420 +                    sourceIndex+=4;
   1.421 +                } else {
   1.422 +                    break;
   1.423 +                }
   1.424 +            } while(--count>0);
   1.425 +        }
   1.426 +
   1.427 +        if(count==0) {
   1.428 +            /* done with the loop for complete UChars */
   1.429 +            c=0;
   1.430 +        } else {
   1.431 +            /* keep c for surrogate handling, trail will be set there */
   1.432 +            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
   1.433 +            targetCapacity+=count;
   1.434 +        }
   1.435 +    }
   1.436 +
   1.437 +    if(c!=0) {
   1.438 +        /*
   1.439 +         * c is a surrogate, and
   1.440 +         * - source or target too short
   1.441 +         * - or the surrogate is unmatched
   1.442 +         */
   1.443 +        cnv->toUBytes[0]=(uint8_t)(c>>8);
   1.444 +        cnv->toUBytes[1]=(uint8_t)c;
   1.445 +        cnv->toULength=2;
   1.446 +
   1.447 +        if(U16_IS_SURROGATE_LEAD(c)) {
   1.448 +            if(length>=2) {
   1.449 +                if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
   1.450 +                    /* output the surrogate pair, will overflow (see conditions comment above) */
   1.451 +                    source+=2;
   1.452 +                    length-=2;
   1.453 +                    *target++=c;
   1.454 +                    if(offsets!=NULL) {
   1.455 +                        *offsets++=sourceIndex;
   1.456 +                    }
   1.457 +                    cnv->UCharErrorBuffer[0]=trail;
   1.458 +                    cnv->UCharErrorBufferLength=1;
   1.459 +                    cnv->toULength=0;
   1.460 +                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.461 +                } else {
   1.462 +                    /* unmatched lead surrogate */
   1.463 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.464 +                }
   1.465 +            } else {
   1.466 +                /* see if the trail surrogate is in the next buffer */
   1.467 +            }
   1.468 +        } else {
   1.469 +            /* unmatched trail surrogate */
   1.470 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.471 +        }
   1.472 +    }
   1.473 +
   1.474 +    if(U_SUCCESS(*pErrorCode)) {
   1.475 +        /* check for a remaining source byte */
   1.476 +        if(length>0) {
   1.477 +            if(targetCapacity==0) {
   1.478 +                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.479 +            } else {
   1.480 +                /* it must be length==1 because otherwise the above would have copied more */
   1.481 +                cnv->toUBytes[cnv->toULength++]=*source++;
   1.482 +            }
   1.483 +        }
   1.484 +    }
   1.485 +
   1.486 +    /* write back the updated pointers */
   1.487 +    pArgs->source=(const char *)source;
   1.488 +    pArgs->target=target;
   1.489 +    pArgs->offsets=offsets;
   1.490 +}
   1.491 +
   1.492 +static UChar32
   1.493 +_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
   1.494 +    const uint8_t *s, *sourceLimit;
   1.495 +    UChar32 c;
   1.496 +
   1.497 +    if(pArgs->converter->mode<8) {
   1.498 +        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1.499 +    }
   1.500 +
   1.501 +    s=(const uint8_t *)pArgs->source;
   1.502 +    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1.503 +
   1.504 +    if(s>=sourceLimit) {
   1.505 +        /* no input */
   1.506 +        *err=U_INDEX_OUTOFBOUNDS_ERROR;
   1.507 +        return 0xffff;
   1.508 +    }
   1.509 +
   1.510 +    if(s+2>sourceLimit) {
   1.511 +        /* only one byte: truncated UChar */
   1.512 +        pArgs->converter->toUBytes[0]=*s++;
   1.513 +        pArgs->converter->toULength=1;
   1.514 +        pArgs->source=(const char *)s;
   1.515 +        *err = U_TRUNCATED_CHAR_FOUND;
   1.516 +        return 0xffff;
   1.517 +    }
   1.518 +
   1.519 +    /* get one UChar */
   1.520 +    c=((UChar32)*s<<8)|s[1];
   1.521 +    s+=2;
   1.522 +
   1.523 +    /* check for a surrogate pair */
   1.524 +    if(U_IS_SURROGATE(c)) {
   1.525 +        if(U16_IS_SURROGATE_LEAD(c)) {
   1.526 +            if(s+2<=sourceLimit) {
   1.527 +                UChar trail;
   1.528 +
   1.529 +                /* get a second UChar and see if it is a trail surrogate */
   1.530 +                trail=((UChar)*s<<8)|s[1];
   1.531 +                if(U16_IS_TRAIL(trail)) {
   1.532 +                    c=U16_GET_SUPPLEMENTARY(c, trail);
   1.533 +                    s+=2;
   1.534 +                } else {
   1.535 +                    /* unmatched lead surrogate */
   1.536 +                    c=-2;
   1.537 +                }
   1.538 +            } else {
   1.539 +                /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
   1.540 +                uint8_t *bytes=pArgs->converter->toUBytes;
   1.541 +                s-=2;
   1.542 +                pArgs->converter->toULength=(int8_t)(sourceLimit-s);
   1.543 +                do {
   1.544 +                    *bytes++=*s++;
   1.545 +                } while(s<sourceLimit);
   1.546 +
   1.547 +                c=0xffff;
   1.548 +                *err=U_TRUNCATED_CHAR_FOUND;
   1.549 +            }
   1.550 +        } else {
   1.551 +            /* unmatched trail surrogate */
   1.552 +            c=-2;
   1.553 +        }
   1.554 +
   1.555 +        if(c<0) {
   1.556 +            /* write the unmatched surrogate */
   1.557 +            uint8_t *bytes=pArgs->converter->toUBytes;
   1.558 +            pArgs->converter->toULength=2;
   1.559 +            *bytes=*(s-2);
   1.560 +            bytes[1]=*(s-1);
   1.561 +
   1.562 +            c=0xffff;
   1.563 +            *err=U_ILLEGAL_CHAR_FOUND;
   1.564 +        }
   1.565 +    }
   1.566 +
   1.567 +    pArgs->source=(const char *)s;
   1.568 +    return c;
   1.569 +} 
   1.570 +
   1.571 +static void
   1.572 +_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
   1.573 +    if(choice<=UCNV_RESET_TO_UNICODE) {
   1.574 +        /* reset toUnicode state */
   1.575 +        if(UCNV_GET_VERSION(cnv)==0) {
   1.576 +            cnv->mode=8; /* no BOM handling */
   1.577 +        } else {
   1.578 +            cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
   1.579 +        }
   1.580 +    }
   1.581 +    if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
   1.582 +        /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
   1.583 +        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1.584 +    }
   1.585 +}
   1.586 +
   1.587 +static void
   1.588 +_UTF16BEOpen(UConverter *cnv,
   1.589 +             UConverterLoadArgs *pArgs,
   1.590 +             UErrorCode *pErrorCode) {
   1.591 +    if(UCNV_GET_VERSION(cnv)<=1) {
   1.592 +        _UTF16BEReset(cnv, UCNV_RESET_BOTH);
   1.593 +    } else {
   1.594 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1.595 +    }
   1.596 +}
   1.597 +
   1.598 +static const char *
   1.599 +_UTF16BEGetName(const UConverter *cnv) {
   1.600 +    if(UCNV_GET_VERSION(cnv)==0) {
   1.601 +        return "UTF-16BE";
   1.602 +    } else {
   1.603 +        return "UTF-16BE,version=1";
   1.604 +    }
   1.605 +}
   1.606 +
   1.607 +static const UConverterImpl _UTF16BEImpl={
   1.608 +    UCNV_UTF16_BigEndian,
   1.609 +
   1.610 +    NULL,
   1.611 +    NULL,
   1.612 +
   1.613 +    _UTF16BEOpen,
   1.614 +    NULL,
   1.615 +    _UTF16BEReset,
   1.616 +
   1.617 +    _UTF16BEToUnicodeWithOffsets,
   1.618 +    _UTF16BEToUnicodeWithOffsets,
   1.619 +    _UTF16BEFromUnicodeWithOffsets,
   1.620 +    _UTF16BEFromUnicodeWithOffsets,
   1.621 +    _UTF16BEGetNextUChar,
   1.622 +
   1.623 +    NULL,
   1.624 +    _UTF16BEGetName,
   1.625 +    NULL,
   1.626 +    NULL,
   1.627 +    ucnv_getNonSurrogateUnicodeSet
   1.628 +};
   1.629 +
   1.630 +static const UConverterStaticData _UTF16BEStaticData={
   1.631 +    sizeof(UConverterStaticData),
   1.632 +    "UTF-16BE",
   1.633 +    1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
   1.634 +    { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
   1.635 +    0,
   1.636 +    0,
   1.637 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1.638 +};
   1.639 +
   1.640 +
   1.641 +const UConverterSharedData _UTF16BEData={
   1.642 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
   1.643 +    NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, 
   1.644 +    0
   1.645 +};
   1.646 +
   1.647 +/* UTF-16LE ----------------------------------------------------------------- */
   1.648 +
   1.649 +static void
   1.650 +_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   1.651 +                               UErrorCode *pErrorCode) {
   1.652 +    UConverter *cnv;
   1.653 +    const UChar *source;
   1.654 +    char *target;
   1.655 +    int32_t *offsets;
   1.656 +
   1.657 +    uint32_t targetCapacity, length, sourceIndex;
   1.658 +    UChar c, trail;
   1.659 +    char overflow[4];
   1.660 +
   1.661 +    source=pArgs->source;
   1.662 +    length=(int32_t)(pArgs->sourceLimit-source);
   1.663 +    if(length<=0) {
   1.664 +        /* no input, nothing to do */
   1.665 +        return;
   1.666 +    }
   1.667 +
   1.668 +    cnv=pArgs->converter;
   1.669 +
   1.670 +    /* write the BOM if necessary */
   1.671 +    if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   1.672 +        static const char bom[]={ (char)0xff, (char)0xfe };
   1.673 +        ucnv_fromUWriteBytes(cnv,
   1.674 +                             bom, 2,
   1.675 +                             &pArgs->target, pArgs->targetLimit,
   1.676 +                             &pArgs->offsets, -1,
   1.677 +                             pErrorCode);
   1.678 +        cnv->fromUnicodeStatus=0;
   1.679 +    }
   1.680 +
   1.681 +    target=pArgs->target;
   1.682 +    if(target >= pArgs->targetLimit) {
   1.683 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.684 +        return;
   1.685 +    }
   1.686 +
   1.687 +    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
   1.688 +    offsets=pArgs->offsets;
   1.689 +    sourceIndex=0;
   1.690 +
   1.691 +    /* c!=0 indicates in several places outside the main loops that a surrogate was found */
   1.692 +
   1.693 +    if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
   1.694 +        /* the last buffer ended with a lead surrogate, output the surrogate pair */
   1.695 +        ++source;
   1.696 +        --length;
   1.697 +        target[0]=(uint8_t)c;
   1.698 +        target[1]=(uint8_t)(c>>8);
   1.699 +        target[2]=(uint8_t)trail;
   1.700 +        target[3]=(uint8_t)(trail>>8);
   1.701 +        target+=4;
   1.702 +        targetCapacity-=4;
   1.703 +        if(offsets!=NULL) {
   1.704 +            *offsets++=-1;
   1.705 +            *offsets++=-1;
   1.706 +            *offsets++=-1;
   1.707 +            *offsets++=-1;
   1.708 +        }
   1.709 +        sourceIndex=1;
   1.710 +        cnv->fromUChar32=c=0;
   1.711 +    }
   1.712 +
   1.713 +    if(c==0) {
   1.714 +        /* copy an even number of bytes for complete UChars */
   1.715 +        uint32_t count=2*length;
   1.716 +        if(count>targetCapacity) {
   1.717 +            count=targetCapacity&~1;
   1.718 +        }
   1.719 +        /* count is even */
   1.720 +        targetCapacity-=count;
   1.721 +        count>>=1;
   1.722 +        length-=count;
   1.723 +
   1.724 +        if(offsets==NULL) {
   1.725 +            while(count>0) {
   1.726 +                c=*source++;
   1.727 +                if(U16_IS_SINGLE(c)) {
   1.728 +                    target[0]=(uint8_t)c;
   1.729 +                    target[1]=(uint8_t)(c>>8);
   1.730 +                    target+=2;
   1.731 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   1.732 +                    ++source;
   1.733 +                    --count;
   1.734 +                    target[0]=(uint8_t)c;
   1.735 +                    target[1]=(uint8_t)(c>>8);
   1.736 +                    target[2]=(uint8_t)trail;
   1.737 +                    target[3]=(uint8_t)(trail>>8);
   1.738 +                    target+=4;
   1.739 +                } else {
   1.740 +                    break;
   1.741 +                }
   1.742 +                --count;
   1.743 +            }
   1.744 +        } else {
   1.745 +            while(count>0) {
   1.746 +                c=*source++;
   1.747 +                if(U16_IS_SINGLE(c)) {
   1.748 +                    target[0]=(uint8_t)c;
   1.749 +                    target[1]=(uint8_t)(c>>8);
   1.750 +                    target+=2;
   1.751 +                    *offsets++=sourceIndex;
   1.752 +                    *offsets++=sourceIndex++;
   1.753 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   1.754 +                    ++source;
   1.755 +                    --count;
   1.756 +                    target[0]=(uint8_t)c;
   1.757 +                    target[1]=(uint8_t)(c>>8);
   1.758 +                    target[2]=(uint8_t)trail;
   1.759 +                    target[3]=(uint8_t)(trail>>8);
   1.760 +                    target+=4;
   1.761 +                    *offsets++=sourceIndex;
   1.762 +                    *offsets++=sourceIndex;
   1.763 +                    *offsets++=sourceIndex;
   1.764 +                    *offsets++=sourceIndex;
   1.765 +                    sourceIndex+=2;
   1.766 +                } else {
   1.767 +                    break;
   1.768 +                }
   1.769 +                --count;
   1.770 +            }
   1.771 +        }
   1.772 +
   1.773 +        if(count==0) {
   1.774 +            /* done with the loop for complete UChars */
   1.775 +            if(length>0 && targetCapacity>0) {
   1.776 +                /*
   1.777 +                 * there is more input and some target capacity -
   1.778 +                 * it must be targetCapacity==1 because otherwise
   1.779 +                 * the above would have copied more;
   1.780 +                 * prepare for overflow output
   1.781 +                 */
   1.782 +                if(U16_IS_SINGLE(c=*source++)) {
   1.783 +                    overflow[0]=(char)c;
   1.784 +                    overflow[1]=(char)(c>>8);
   1.785 +                    length=2; /* 2 bytes to output */
   1.786 +                    c=0;
   1.787 +                /* } else { keep c for surrogate handling, length will be set there */
   1.788 +                }
   1.789 +            } else {
   1.790 +                length=0;
   1.791 +                c=0;
   1.792 +            }
   1.793 +        } else {
   1.794 +            /* keep c for surrogate handling, length will be set there */
   1.795 +            targetCapacity+=2*count;
   1.796 +        }
   1.797 +    } else {
   1.798 +        length=0; /* from here on, length counts the bytes in overflow[] */
   1.799 +    }
   1.800 +    
   1.801 +    if(c!=0) {
   1.802 +        /*
   1.803 +         * c is a surrogate, and
   1.804 +         * - source or target too short
   1.805 +         * - or the surrogate is unmatched
   1.806 +         */
   1.807 +        length=0;
   1.808 +        if(U16_IS_SURROGATE_LEAD(c)) {
   1.809 +            if(source<pArgs->sourceLimit) {
   1.810 +                if(U16_IS_TRAIL(trail=*source)) {
   1.811 +                    /* output the surrogate pair, will overflow (see conditions comment above) */
   1.812 +                    ++source;
   1.813 +                    overflow[0]=(char)c;
   1.814 +                    overflow[1]=(char)(c>>8);
   1.815 +                    overflow[2]=(char)trail;
   1.816 +                    overflow[3]=(char)(trail>>8);
   1.817 +                    length=4; /* 4 bytes to output */
   1.818 +                    c=0;
   1.819 +                } else {
   1.820 +                    /* unmatched lead surrogate */
   1.821 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.822 +                }
   1.823 +            } else {
   1.824 +                /* see if the trail surrogate is in the next buffer */
   1.825 +            }
   1.826 +        } else {
   1.827 +            /* unmatched trail surrogate */
   1.828 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.829 +        }
   1.830 +        cnv->fromUChar32=c;
   1.831 +    }
   1.832 +
   1.833 +    if(length>0) {
   1.834 +        /* output length bytes with overflow (length>targetCapacity>0) */
   1.835 +        ucnv_fromUWriteBytes(cnv,
   1.836 +                             overflow, length,
   1.837 +                             &target, pArgs->targetLimit,
   1.838 +                             &offsets, sourceIndex,
   1.839 +                             pErrorCode);
   1.840 +        targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
   1.841 +    }
   1.842 +
   1.843 +    if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
   1.844 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.845 +    }
   1.846 +
   1.847 +    /* write back the updated pointers */
   1.848 +    pArgs->source=source;
   1.849 +    pArgs->target=target;
   1.850 +    pArgs->offsets=offsets;
   1.851 +}
   1.852 +
   /*
    * UTF-16LE toUnicode conversion: reads little-endian byte pairs from
    * pArgs->source and writes UChars to pArgs->target; when pArgs->offsets is not
    * NULL, an offset is written for each output UChar.
    *
    * While the converter's mode is below 8 the call is forwarded unchanged to
    * _UTF16ToUnicodeWithOffsets(). Bytes of an incomplete code unit or surrogate
    * pair are kept in cnv->toUBytes[] / cnv->toULength between calls, and a trail
    * surrogate that does not fit into the target goes to cnv->UCharErrorBuffer[].
    *
    * pArgs      - in/out: on the paths that consume input, the advanced source,
    *              target and offsets pointers are written back.
    * pErrorCode - out: set to U_BUFFER_OVERFLOW_ERROR when the target is full and
    *              to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate.
    */
   1.853 +static void
   1.854 +_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1.855 +                             UErrorCode *pErrorCode) {
   1.856 +    UConverter *cnv;
   1.857 +    const uint8_t *source;
   1.858 +    UChar *target;
   1.859 +    int32_t *offsets;
   1.860 +
   1.861 +    uint32_t targetCapacity, length, count, sourceIndex;
   1.862 +    UChar c, trail;
   1.863 +
   1.864 +    if(pArgs->converter->mode<8) {
   1.865 +        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
   1.866 +        return;
   1.867 +    }
   1.868 +
   1.869 +    cnv=pArgs->converter;
   1.870 +    source=(const uint8_t *)pArgs->source;
   1.871 +    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
   1.872 +    if(length<=0 && cnv->toUnicodeStatus==0) { /* length is unsigned, so this test is effectively length==0 */
   1.873 +        /* no input, nothing to do */
   1.874 +        return;
   1.875 +    }
   1.876 +
   1.877 +    target=pArgs->target;
   1.878 +    if(target >= pArgs->targetLimit) {
   1.879 +        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.880 +        return;
   1.881 +    }
   1.882 +
   1.883 +    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
   1.884 +    offsets=pArgs->offsets;
   1.885 +    sourceIndex=0;
   1.886 +    c=0;
   1.887 +
   1.888 +    /* complete a partial UChar or pair from the last call */
   1.889 +    if(cnv->toUnicodeStatus!=0) {
   1.890 +        /*
   1.891 +         * special case: single byte from a previous buffer,
   1.892 +         * where the byte turned out not to belong to a trail surrogate
   1.893 +         * and the preceding, unmatched lead surrogate was put into toUBytes[]
   1.894 +         * for error handling
   1.895 +         */
   1.896 +        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
   1.897 +        cnv->toULength=1;
   1.898 +        cnv->toUnicodeStatus=0;
   1.899 +    }
   1.900 +    if((count=cnv->toULength)!=0) {
   1.901 +        uint8_t *p=cnv->toUBytes;
   1.902 +        do {
   1.903 +            p[count++]=*source++;
   1.904 +            ++sourceIndex;
   1.905 +            --length;
   1.906 +            if(count==2) {
   1.907 +                c=((UChar)p[1]<<8)|p[0];
   1.908 +                if(U16_IS_SINGLE(c)) {
   1.909 +                    /* output the BMP code point */
   1.910 +                    *target++=c;
   1.911 +                    if(offsets!=NULL) {
   1.912 +                        *offsets++=-1; /* presumably -1 marks a unit that began in an earlier buffer — confirm against the offsets contract */
   1.913 +                    }
   1.914 +                    --targetCapacity;
   1.915 +                    count=0;
   1.916 +                    c=0;
   1.917 +                    break;
   1.918 +                } else if(U16_IS_SURROGATE_LEAD(c)) {
   1.919 +                    /* continue collecting bytes for the trail surrogate */
   1.920 +                    c=0; /* avoid unnecessary surrogate handling below */
   1.921 +                } else {
   1.922 +                    /* fall through to error handling for an unmatched trail surrogate */
   1.923 +                    break;
   1.924 +                }
   1.925 +            } else if(count==4) {
   1.926 +                c=((UChar)p[1]<<8)|p[0];
   1.927 +                trail=((UChar)p[3]<<8)|p[2];
   1.928 +                if(U16_IS_TRAIL(trail)) {
   1.929 +                    /* output the surrogate pair */
   1.930 +                    *target++=c;
   1.931 +                    if(targetCapacity>=2) {
   1.932 +                        *target++=trail;
   1.933 +                        if(offsets!=NULL) {
   1.934 +                            *offsets++=-1;
   1.935 +                            *offsets++=-1;
   1.936 +                        }
   1.937 +                        targetCapacity-=2;
   1.938 +                    } else /* targetCapacity==1 */ {
   1.939 +                        targetCapacity=0;
   1.940 +                        cnv->UCharErrorBuffer[0]=trail;
   1.941 +                        cnv->UCharErrorBufferLength=1;
   1.942 +                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1.943 +                    }
   1.944 +                    count=0;
   1.945 +                    c=0;
   1.946 +                    break;
   1.947 +                } else {
   1.948 +                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
   1.949 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1.950 +
   1.951 +                    /* back out reading the code unit after it */
   1.952 +                    if(((const uint8_t *)pArgs->source-source)>=2) { /* NOTE(review): source has only advanced from pArgs->source here, so this difference is negative and the branch looks unreachable; the operands may be swapped — confirm */
   1.953 +                        source-=2;
   1.954 +                    } else {
   1.955 +                        /*
   1.956 +                         * if the trail unit's first byte was in a previous buffer, then
   1.957 +                         * we need to put it into a special place because toUBytes[] will be
   1.958 +                         * used for the lead unit's bytes
   1.959 +                         */
   1.960 +                        cnv->toUnicodeStatus=0x100|p[2];
   1.961 +                        --source;
   1.962 +                    }
   1.963 +                    cnv->toULength=2;
   1.964 +
   1.965 +                    /* write back the updated pointers */
   1.966 +                    pArgs->source=(const char *)source;
   1.967 +                    pArgs->target=target;
   1.968 +                    pArgs->offsets=offsets;
   1.969 +                    return;
   1.970 +                }
   1.971 +            }
   1.972 +        } while(length>0);
   1.973 +        cnv->toULength=(int8_t)count;
   1.974 +    }
   1.975 +
   1.976 +    /* copy an even number of bytes for complete UChars */
   1.977 +    count=2*targetCapacity; /* count is measured in bytes here */
   1.978 +    if(count>length) {
   1.979 +        count=length&~1;
   1.980 +    }
   1.981 +    if(c==0 && count>0) {
   1.982 +        length-=count;
   1.983 +        count>>=1; /* from here on count is measured in UChars */
   1.984 +        targetCapacity-=count;
   1.985 +        if(offsets==NULL) {
   1.986 +            do {
   1.987 +                c=((UChar)source[1]<<8)|source[0];
   1.988 +                source+=2;
   1.989 +                if(U16_IS_SINGLE(c)) {
   1.990 +                    *target++=c;
   1.991 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1.992 +                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
   1.993 +                ) {
   1.994 +                    source+=2;
   1.995 +                    --count;
   1.996 +                    *target++=c;
   1.997 +                    *target++=trail;
   1.998 +                } else {
   1.999 +                    break;
  1.1000 +                }
  1.1001 +            } while(--count>0);
  1.1002 +        } else {
  1.1003 +            do {
  1.1004 +                c=((UChar)source[1]<<8)|source[0];
  1.1005 +                source+=2;
  1.1006 +                if(U16_IS_SINGLE(c)) {
  1.1007 +                    *target++=c;
  1.1008 +                    *offsets++=sourceIndex;
  1.1009 +                    sourceIndex+=2;
  1.1010 +                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  1.1011 +                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
  1.1012 +                ) {
  1.1013 +                    source+=2;
  1.1014 +                    --count;
  1.1015 +                    *target++=c;
  1.1016 +                    *target++=trail;
  1.1017 +                    *offsets++=sourceIndex;
  1.1018 +                    *offsets++=sourceIndex;
  1.1019 +                    sourceIndex+=4;
  1.1020 +                } else {
  1.1021 +                    break;
  1.1022 +                }
  1.1023 +            } while(--count>0);
  1.1024 +        }
  1.1025 +
  1.1026 +        if(count==0) {
  1.1027 +            /* done with the loop for complete UChars */
  1.1028 +            c=0;
  1.1029 +        } else {
  1.1030 +            /* keep c for surrogate handling, trail will be set there */
  1.1031 +            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
  1.1032 +            targetCapacity+=count;
  1.1033 +        }
  1.1034 +    }
  1.1035 +
  1.1036 +    if(c!=0) {
  1.1037 +        /*
  1.1038 +         * c is a surrogate, and
  1.1039 +         * - source or target too short
  1.1040 +         * - or the surrogate is unmatched
  1.1041 +         */
  1.1042 +        cnv->toUBytes[0]=(uint8_t)c;
  1.1043 +        cnv->toUBytes[1]=(uint8_t)(c>>8);
  1.1044 +        cnv->toULength=2;
  1.1045 +
  1.1046 +        if(U16_IS_SURROGATE_LEAD(c)) {
  1.1047 +            if(length>=2) {
  1.1048 +                if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
  1.1049 +                    /* output the surrogate pair, will overflow (see conditions comment above) */
  1.1050 +                    source+=2;
  1.1051 +                    length-=2;
  1.1052 +                    *target++=c;
  1.1053 +                    if(offsets!=NULL) {
  1.1054 +                        *offsets++=sourceIndex;
  1.1055 +                    }
  1.1056 +                    cnv->UCharErrorBuffer[0]=trail;
  1.1057 +                    cnv->UCharErrorBufferLength=1;
  1.1058 +                    cnv->toULength=0;
  1.1059 +                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1060 +                } else {
  1.1061 +                    /* unmatched lead surrogate */
  1.1062 +                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1063 +                }
  1.1064 +            } else {
  1.1065 +                /* see if the trail surrogate is in the next buffer */
  1.1066 +            }
  1.1067 +        } else {
  1.1068 +            /* unmatched trail surrogate */
  1.1069 +            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1.1070 +        }
  1.1071 +    }
  1.1072 +
  1.1073 +    if(U_SUCCESS(*pErrorCode)) {
  1.1074 +        /* check for a remaining source byte */
  1.1075 +        if(length>0) {
  1.1076 +            if(targetCapacity==0) {
  1.1077 +                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1.1078 +            } else {
  1.1079 +                /* it must be length==1 because otherwise the above would have copied more */
  1.1080 +                cnv->toUBytes[cnv->toULength++]=*source++;
  1.1081 +            }
  1.1082 +        }
  1.1083 +    }
  1.1084 +
  1.1085 +    /* write back the updated pointers */
  1.1086 +    pArgs->source=(const char *)source;
  1.1087 +    pArgs->target=target;
  1.1088 +    pArgs->offsets=offsets;
  1.1089 +}
  1.1090 +
  /*
   * Reads one code point from the UTF-16LE bytes at pArgs->source.
   *
   * While the converter's mode is below 8, returns UCNV_GET_NEXT_UCHAR_USE_TO_U
   * without touching the input. Otherwise returns the code point (a lead/trail
   * pair is combined with U16_GET_SUPPLEMENTARY) and advances pArgs->source,
   * or returns 0xffff with *err set to one of:
   *   U_INDEX_OUTOFBOUNDS_ERROR - no input at all
   *   U_TRUNCATED_CHAR_FOUND    - input ends inside a code unit, or after a lead
   *                               surrogate with fewer than 2 bytes following
   *   U_ILLEGAL_CHAR_FOUND      - unmatched lead or trail surrogate
   * For the last two, the offending bytes are copied into converter->toUBytes[]
   * and converter->toULength is set to their count.
   */
  1.1091 +static UChar32
  1.1092 +_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
  1.1093 +    const uint8_t *s, *sourceLimit;
  1.1094 +    UChar32 c;
  1.1095 +
  1.1096 +    if(pArgs->converter->mode<8) {
  1.1097 +        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1.1098 +    }
  1.1099 +
  1.1100 +    s=(const uint8_t *)pArgs->source;
  1.1101 +    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1.1102 +
  1.1103 +    if(s>=sourceLimit) {
  1.1104 +        /* no input */
  1.1105 +        *err=U_INDEX_OUTOFBOUNDS_ERROR;
  1.1106 +        return 0xffff;
  1.1107 +    }
  1.1108 +
  1.1109 +    if(s+2>sourceLimit) {
  1.1110 +        /* only one byte: truncated UChar */
  1.1111 +        pArgs->converter->toUBytes[0]=*s++;
  1.1112 +        pArgs->converter->toULength=1;
  1.1113 +        pArgs->source=(const char *)s;
  1.1114 +        *err = U_TRUNCATED_CHAR_FOUND;
  1.1115 +        return 0xffff;
  1.1116 +    }
  1.1117 +
  1.1118 +    /* get one UChar */
  1.1119 +    c=((UChar32)s[1]<<8)|*s;
  1.1120 +    s+=2;
  1.1121 +
  1.1122 +    /* check for a surrogate pair */
  1.1123 +    if(U_IS_SURROGATE(c)) {
  1.1124 +        if(U16_IS_SURROGATE_LEAD(c)) {
  1.1125 +            if(s+2<=sourceLimit) {
  1.1126 +                UChar trail;
  1.1127 +
  1.1128 +                /* get a second UChar and see if it is a trail surrogate */
  1.1129 +                trail=((UChar)s[1]<<8)|*s;
  1.1130 +                if(U16_IS_TRAIL(trail)) {
  1.1131 +                    c=U16_GET_SUPPLEMENTARY(c, trail);
  1.1132 +                    s+=2;
  1.1133 +                } else {
  1.1134 +                    /* unmatched lead surrogate */
  1.1135 +                    c=-2; /* negative marker, picked up by the c<0 check below */
  1.1136 +                }
  1.1137 +            } else {
  1.1138 +                /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
  1.1139 +                uint8_t *bytes=pArgs->converter->toUBytes;
  1.1140 +                s-=2; /* step back to the lead surrogate so all remaining bytes are stored */
  1.1141 +                pArgs->converter->toULength=(int8_t)(sourceLimit-s);
  1.1142 +                do {
  1.1143 +                    *bytes++=*s++;
  1.1144 +                } while(s<sourceLimit);
  1.1145 +
  1.1146 +                c=0xffff;
  1.1147 +                *err=U_TRUNCATED_CHAR_FOUND;
  1.1148 +            }
  1.1149 +        } else {
  1.1150 +            /* unmatched trail surrogate */
  1.1151 +            c=-2;
  1.1152 +        }
  1.1153 +
  1.1154 +        if(c<0) {
  1.1155 +            /* write the unmatched surrogate */
  1.1156 +            uint8_t *bytes=pArgs->converter->toUBytes;
  1.1157 +            pArgs->converter->toULength=2;
  1.1158 +            *bytes=*(s-2);
  1.1159 +            bytes[1]=*(s-1);
  1.1160 +
  1.1161 +            c=0xffff;
  1.1162 +            *err=U_ILLEGAL_CHAR_FOUND;
  1.1163 +        }
  1.1164 +    }
  1.1165 +
  1.1166 +    pArgs->source=(const char *)s;
  1.1167 +    return c;
  1.1168 +} 
  1.1169 +
  /*
   * Resets the UTF-16LE converter state selected by choice.
   * toUnicode side (choice<=UCNV_RESET_TO_UNICODE): mode becomes 8 for version 0
   * and 0 for any other version.
   * fromUnicode side (choice!=UCNV_RESET_TO_UNICODE): for version 1 only,
   * fromUnicodeStatus is set to UCNV_NEED_TO_WRITE_BOM.
   */
  1.1170 +static void
  1.1171 +_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
  1.1172 +    if(choice<=UCNV_RESET_TO_UNICODE) {
  1.1173 +        /* reset toUnicode state */
  1.1174 +        if(UCNV_GET_VERSION(cnv)==0) {
  1.1175 +            cnv->mode=8; /* no BOM handling */
  1.1176 +        } else {
  1.1177 +            cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
  1.1178 +        }
  1.1179 +    }
  1.1180 +    if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
  1.1181 +        /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
  1.1182 +        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1.1183 +    }
  1.1184 +}
  1.1185 +
  /*
   * Open hook for the UTF-16LE converter.
   * Versions 0 and 1 are accepted and both conversion directions are reset;
   * any higher version sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR.
   * pArgs is not used by this function.
   */
  1.1186 +static void
  1.1187 +_UTF16LEOpen(UConverter *cnv,
  1.1188 +             UConverterLoadArgs *pArgs,
  1.1189 +             UErrorCode *pErrorCode) {
  1.1190 +    if(UCNV_GET_VERSION(cnv)<=1) {
  1.1191 +        _UTF16LEReset(cnv, UCNV_RESET_BOTH);
  1.1192 +    } else {
  1.1193 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1.1194 +    }
  1.1195 +}
  1.1196 +
  /*
   * Returns the converter name as a string literal: "UTF-16LE" for version 0,
   * "UTF-16LE,version=1" for every other version.
   */
  1.1197 +static const char *
  1.1198 +_UTF16LEGetName(const UConverter *cnv) {
  1.1199 +    if(UCNV_GET_VERSION(cnv)==0) {
  1.1200 +        return "UTF-16LE";
  1.1201 +    } else {
  1.1202 +        return "UTF-16LE,version=1";
  1.1203 +    }
  1.1204 +}
  1.1205 +
  /*
   * Function table for the UTF-16LE converter, referenced by _UTF16LEData.
   * It plugs in the open, reset, toUnicode, fromUnicode, getNextUChar and getName
   * functions defined above; NULL entries provide no implementation.
   * NOTE(review): the meaning of each slot comes from UConverterImpl, which is
   * declared outside this file. The to/fromUnicode functions each appear twice,
   * presumably as the plain and the with-offsets variants — confirm.
   */
  1.1206 +static const UConverterImpl _UTF16LEImpl={
  1.1207 +    UCNV_UTF16_LittleEndian,
  1.1208 +
  1.1209 +    NULL,
  1.1210 +    NULL,
  1.1211 +
  1.1212 +    _UTF16LEOpen,
  1.1213 +    NULL,
  1.1214 +    _UTF16LEReset,
  1.1215 +
  1.1216 +    _UTF16LEToUnicodeWithOffsets,
  1.1217 +    _UTF16LEToUnicodeWithOffsets,
  1.1218 +    _UTF16LEFromUnicodeWithOffsets,
  1.1219 +    _UTF16LEFromUnicodeWithOffsets,
  1.1220 +    _UTF16LEGetNextUChar,
  1.1221 +
  1.1222 +    NULL,
  1.1223 +    _UTF16LEGetName,
  1.1224 +    NULL,
  1.1225 +    NULL,
  1.1226 +    ucnv_getNonSurrogateUnicodeSet
  1.1227 +};
  1.1228 +
  1.1229 +
  /*
   * Static description of the UTF-16LE converter, referenced by _UTF16LEData.
   * NOTE(review): field meanings come from UConverterStaticData, which is
   * declared outside this file; the per-field remarks below are assumptions
   * to confirm against that declaration.
   */
  1.1230 +static const UConverterStaticData _UTF16LEStaticData={
  1.1231 +    sizeof(UConverterStaticData),
  1.1232 +    "UTF-16LE",
  1.1233 +    1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, /* presumably CCSID, platform, converter type, min/max bytes per char — confirm */
  1.1234 +    { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, /* FD FF is U+FFFD in little-endian byte order; presumably the 2-byte substitution character — confirm */
  1.1235 +    0,
  1.1236 +    0,
  1.1237 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1238 +};
  1.1239 +
  1.1240 +
  /*
   * Shared-data record for the UTF-16LE converter; it has external linkage and
   * ties together _UTF16LEStaticData and _UTF16LEImpl. IS_UTF16LE() compares a
   * converter's sharedData pointer against the address of this object.
   */
  1.1241 +const UConverterSharedData _UTF16LEData={
  1.1242 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
  1.1243 +    NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 
  1.1244 +    0
  1.1245 +};
  1.1246 +
  1.1247 +/* UTF-16 (Detect BOM) ------------------------------------------------------ */
  1.1248 +
  1.1249 +/*
  1.1250 + * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
  1.1251 + * accordingly.
  1.1252 + * This is a simpler version of the UTF-32 converter, with
  1.1253 + * fewer states for shorter BOMs.
  1.1254 + *
  1.1255 + * State values:
  1.1256 + * 0    initial state
  1.1257 + * 1    saw first byte
  1.1258 + * 2..5 - (not used by this converter)
  1.1259 + * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
  1.1260 + * 8    UTF-16BE mode
  1.1261 + * 9    UTF-16LE mode
  1.1262 + *
  1.1263 + * During detection: state==number of initial bytes seen so far.
  1.1264 + *
  1.1265 + * On output, emit U+FEFF as the first code point.
  1.1266 + *
  1.1267 + * Variants:
  1.1268 + * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
  1.1269 + * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
  1.1270 + *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
  1.1271 + */
  1.1272 +
  /*
   * Resets the BOM-detecting UTF-16 converter state selected by choice.
   * toUnicode side (choice<=UCNV_RESET_TO_UNICODE): mode returns to state 0.
   * fromUnicode side (choice!=UCNV_RESET_TO_UNICODE): fromUnicodeStatus is set
   * to UCNV_NEED_TO_WRITE_BOM.
   */
  1.1273 +static void
  1.1274 +_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
  1.1275 +    if(choice<=UCNV_RESET_TO_UNICODE) {
  1.1276 +        /* reset toUnicode: state=0 */
  1.1277 +        cnv->mode=0;
  1.1278 +    }
  1.1279 +    if(choice!=UCNV_RESET_TO_UNICODE) {
  1.1280 +        /* reset fromUnicode: prepare to output the UTF-16PE BOM */
  1.1281 +        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1.1282 +    }
  1.1283 +}
  1.1284 +
  1.1285 +static const UConverterSharedData _UTF16v2Data; /* forward declaration: initialized at the end of this file, needed earlier by _UTF16Open() and IS_UTF16() */
  1.1286 +
  /*
   * Open hook for the BOM-detecting UTF-16 converter.
   * Versions 0..2 are accepted and both conversion directions are reset; any
   * higher version sets *pErrorCode to U_ILLEGAL_ARGUMENT_ERROR.
   * For version 2, unless pArgs->onlyTestIsLoadable is set, the converter is
   * repointed at _UTF16v2Data and its subChars are overwritten with that
   * variant's subChar bytes.
   */
  1.1287 +static void
  1.1288 +_UTF16Open(UConverter *cnv,
  1.1289 +           UConverterLoadArgs *pArgs,
  1.1290 +           UErrorCode *pErrorCode) {
  1.1291 +    if(UCNV_GET_VERSION(cnv)<=2) {
  1.1292 +        if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
  1.1293 +            /*
  1.1294 +             * Switch implementation, and switch the staticData that's different
  1.1295 +             * and was copied into the UConverter.
  1.1296 +             * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
  1.1297 +             * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
  1.1298 +             */
  1.1299 +            cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; /* the cast drops const from the static table */
  1.1300 +            uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
  1.1301 +        }
  1.1302 +        _UTF16Reset(cnv, UCNV_RESET_BOTH);
  1.1303 +    } else {
  1.1304 +        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1.1305 +    }
  1.1306 +}
  1.1307 +
  /*
   * Returns the converter name as a string literal: "UTF-16" for version 0,
   * "UTF-16,version=1" for version 1, and "UTF-16,version=2" for anything else.
   */
  1.1308 +static const char *
  1.1309 +_UTF16GetName(const UConverter *cnv) {
  1.1310 +    if(UCNV_GET_VERSION(cnv)==0) {
  1.1311 +        return "UTF-16";
  1.1312 +    } else if(UCNV_GET_VERSION(cnv)==1) {
  1.1313 +        return "UTF-16,version=1";
  1.1314 +    } else {
  1.1315 +        return "UTF-16,version=2";
  1.1316 +    }
  1.1317 +}
  1.1318 +
  1.1319 +const UConverterSharedData _UTF16Data; /* declared early so IS_UTF16() can take its address; the initialized definition follows later in this file */
  1.1320 +
  /* The three macros below identify the converter flavor by comparing its sharedData pointer with the static tables. */
  1.1321 +#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
  1.1322 +#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
  1.1323 +#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
  1.1324 +
  /*
   * toUnicode entry point of the BOM-detecting "UTF-16" converter; it is also
   * reached from _UTF16LEToUnicodeWithOffsets() while that converter's mode is
   * below 8.
   *
   * Runs the state machine stored in cnv->mode: states 0 and 1 collect and
   * inspect the first two bytes for a BOM, states 8 and 9 delegate to
   * _UTF16BEToUnicodeWithOffsets() and _UTF16LEToUnicodeWithOffsets().
   * A reverse BOM, or a missing BOM for "UTF-16,version=1", stores the two bytes
   * in cnv->toUBytes[], sets *pErrorCode to U_ILLEGAL_ESCAPE_SEQUENCE and returns
   * with cnv->mode already set to 8 or 9.
   *
   * When BOM bytes were consumed from the current buffer, their count is added
   * to every offset written during this call. When pArgs->flush is set and all
   * input is consumed, the selected delegate is called once more so that it can
   * deal with truncated input.
   */
  1.1325 +static void
  1.1326 +_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  1.1327 +                           UErrorCode *pErrorCode) {
  1.1328 +    UConverter *cnv=pArgs->converter;
  1.1329 +    const char *source=pArgs->source;
  1.1330 +    const char *sourceLimit=pArgs->sourceLimit;
  1.1331 +    int32_t *offsets=pArgs->offsets; /* start of the offsets for this call; used by the offsetDelta fix-up below */
  1.1332 +
  1.1333 +    int32_t state, offsetDelta;
  1.1334 +    uint8_t b;
  1.1335 +
  1.1336 +    state=cnv->mode;
  1.1337 +
  1.1338 +    /*
  1.1339 +     * If we detect a BOM in this buffer, then we must add the BOM size to the
  1.1340 +     * offsets because the actual converter function will not see and count the BOM.
  1.1341 +     * offsetDelta will have the number of the BOM bytes that are in the current buffer.
  1.1342 +     */
  1.1343 +    offsetDelta=0;
  1.1344 +
  1.1345 +    while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
  1.1346 +        switch(state) {
  1.1347 +        case 0:
  1.1348 +            cnv->toUBytes[0]=(uint8_t)*source++;
  1.1349 +            cnv->toULength=1;
  1.1350 +            state=1;
  1.1351 +            break;
  1.1352 +        case 1:
  1.1353 +            /*
  1.1354 +             * Only inside this switch case can the state variable
  1.1355 +             * temporarily take two additional values:
  1.1356 +             * 6: BOM error, continue with BE
  1.1357 +             * 7: BOM error, continue with LE
  1.1358 +             */
  1.1359 +            b=*source; /* peek at the second byte; it is consumed only if a BOM is recognized */
  1.1360 +            if(cnv->toUBytes[0]==0xfe && b==0xff) {
  1.1361 +                if(IS_UTF16LE(cnv)) {
  1.1362 +                    state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
  1.1363 +                } else {
  1.1364 +                    state=8; /* detect UTF-16BE */
  1.1365 +                }
  1.1366 +            } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
  1.1367 +                if(IS_UTF16BE(cnv)) {
  1.1368 +                    state=6; /* illegal reverse BOM for Java "UnicodeBig" */
  1.1369 +                } else {
  1.1370 +                    state=9; /* detect UTF-16LE */
  1.1371 +                }
  1.1372 +            } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
  1.1373 +                state=6; /* illegal missing BOM for Java "Unicode" */
  1.1374 +            }
  1.1375 +            if(state>=8) {
  1.1376 +                /* BOM detected, consume it */
  1.1377 +                ++source;
  1.1378 +                cnv->toULength=0;
  1.1379 +                offsetDelta=(int32_t)(source-pArgs->source);
  1.1380 +            } else if(state<6) {
  1.1381 +                /* ok: no BOM, and not a reverse BOM */
  1.1382 +                if(source!=pArgs->source) {
  1.1383 +                    /* reset the source for a correct first offset */
  1.1384 +                    source=pArgs->source;
  1.1385 +                    cnv->toULength=0;
  1.1386 +                }
  1.1387 +                if(IS_UTF16LE(cnv)) {
  1.1388 +                    /* Make Java "UnicodeLittle" default to LE. */
  1.1389 +                    state=9;
  1.1390 +                } else {
  1.1391 +                    /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
  1.1392 +                    state=8;
  1.1393 +                }
  1.1394 +            } else {
  1.1395 +                /*
  1.1396 +                 * error: missing BOM, or reverse BOM
  1.1397 +                 * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
  1.1398 +                 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
  1.1399 +                 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
  1.1400 +                 */
  1.1401 +                /* report the non-BOM or reverse BOM as an illegal sequence */
  1.1402 +                cnv->toUBytes[1]=b;
  1.1403 +                cnv->toULength=2;
  1.1404 +                pArgs->source=source+1;
  1.1405 +                /* continue with conversion if the callback resets the error */
  1.1406 +                /*
  1.1407 +                 * Make Java "Unicode" default to BE like standard UTF-16.
  1.1408 +                 * Make Java "UnicodeBig" and "UnicodeLittle" default
  1.1409 +                 * to their normal endiannesses.
  1.1410 +                 */
  1.1411 +                cnv->mode=state+2; /* maps the temporary states 6/7 to 8 (BE) / 9 (LE) */
  1.1412 +                *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
  1.1413 +                return;
  1.1414 +            }
  1.1415 +            /* convert the rest of the stream */
  1.1416 +            cnv->mode=state;
  1.1417 +            continue;
  1.1418 +        case 8:
  1.1419 +            /* call UTF-16BE */
  1.1420 +            pArgs->source=source;
  1.1421 +            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1.1422 +            source=pArgs->source;
  1.1423 +            break;
  1.1424 +        case 9:
  1.1425 +            /* call UTF-16LE */
  1.1426 +            pArgs->source=source;
  1.1427 +            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1.1428 +            source=pArgs->source;
  1.1429 +            break;
  1.1430 +        default:
  1.1431 +            break; /* does not occur */
  1.1432 +        }
  1.1433 +    }
  1.1434 +
  1.1435 +    /* add BOM size to offsets - see comment at offsetDelta declaration */
  1.1436 +    if(offsets!=NULL && offsetDelta!=0) {
  1.1437 +        int32_t *offsetsLimit=pArgs->offsets; /* the delegate calls above wrote back the advanced offsets pointer */
  1.1438 +        while(offsets<offsetsLimit) {
  1.1439 +            *offsets++ += offsetDelta;
  1.1440 +        }
  1.1441 +    }
  1.1442 +
  1.1443 +    pArgs->source=source;
  1.1444 +
  1.1445 +    if(source==sourceLimit && pArgs->flush) {
  1.1446 +        /* handle truncated input */
  1.1447 +        switch(state) {
  1.1448 +        case 0:
  1.1449 +            break; /* no input at all, nothing to do */
  1.1450 +        case 8:
  1.1451 +            _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1.1452 +            break;
  1.1453 +        case 9:
  1.1454 +            _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1.1455 +            break;
  1.1456 +        default:
  1.1457 +            /* 0<state<8: framework will report truncation, nothing to do here */
  1.1458 +            break;
  1.1459 +        }
  1.1460 +    }
  1.1461 +
  1.1462 +    cnv->mode=state;
  1.1463 +}
  1.1464 +
  /*
   * getNextUChar for the BOM-detecting UTF-16 converter: dispatches on the
   * converter's mode to the BE (8) or LE (9) implementation. In any other mode
   * it returns UCNV_GET_NEXT_UCHAR_USE_TO_U instead of reading a code point.
   */
  1.1465 +static UChar32
  1.1466 +_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
  1.1467 +                   UErrorCode *pErrorCode) {
  1.1468 +    switch(pArgs->converter->mode) {
  1.1469 +    case 8:
  1.1470 +        return _UTF16BEGetNextUChar(pArgs, pErrorCode);
  1.1471 +    case 9:
  1.1472 +        return _UTF16LEGetNextUChar(pArgs, pErrorCode);
  1.1473 +    default:
  1.1474 +        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1.1475 +    }
  1.1476 +}
  1.1477 +
  /*
   * Function table for the BOM-detecting UTF-16 converter (versions 0 and 1),
   * referenced by _UTF16Data. fromUnicode uses _UTF16PEFromUnicodeWithOffsets,
   * a macro that resolves to the BE or LE function depending on U_IS_BIG_ENDIAN.
   * NOTE(review): the meaning of each slot comes from UConverterImpl, which is
   * declared outside this file — confirm slot order there.
   */
  1.1478 +static const UConverterImpl _UTF16Impl = {
  1.1479 +    UCNV_UTF16,
  1.1480 +
  1.1481 +    NULL,
  1.1482 +    NULL,
  1.1483 +
  1.1484 +    _UTF16Open,
  1.1485 +    NULL,
  1.1486 +    _UTF16Reset,
  1.1487 +
  1.1488 +    _UTF16ToUnicodeWithOffsets,
  1.1489 +    _UTF16ToUnicodeWithOffsets,
  1.1490 +    _UTF16PEFromUnicodeWithOffsets,
  1.1491 +    _UTF16PEFromUnicodeWithOffsets,
  1.1492 +    _UTF16GetNextUChar,
  1.1493 +
  1.1494 +    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
  1.1495 +    _UTF16GetName,
  1.1496 +    NULL,
  1.1497 +    NULL,
  1.1498 +    ucnv_getNonSurrogateUnicodeSet
  1.1499 +};
  1.1500 +
  /*
   * Static description of the BOM-detecting UTF-16 converter, referenced by
   * _UTF16Data. The byte array selected by U_IS_BIG_ENDIAN is U+FFFD written in
   * the platform's byte order.
   * NOTE(review): field meanings come from UConverterStaticData, which is
   * declared outside this file; the array is presumably the substitution
   * character followed by its length — confirm.
   */
  1.1501 +static const UConverterStaticData _UTF16StaticData = {
  1.1502 +    sizeof(UConverterStaticData),
  1.1503 +    "UTF-16",
  1.1504 +    1204, /* CCSID for BOM sensitive UTF-16 */
  1.1505 +    UCNV_IBM, UCNV_UTF16, 2, 2,
  1.1506 +#if U_IS_BIG_ENDIAN
  1.1507 +    { 0xff, 0xfd, 0, 0 }, 2,
  1.1508 +#else
  1.1509 +    { 0xfd, 0xff, 0, 0 }, 2,
  1.1510 +#endif
  1.1511 +    FALSE, FALSE,
  1.1512 +    0,
  1.1513 +    0,
  1.1514 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1515 +};
  1.1516 +
  /*
   * Shared-data record for the BOM-detecting UTF-16 converter; it has external
   * linkage and ties together _UTF16StaticData and _UTF16Impl. IS_UTF16()
   * compares a converter's sharedData pointer against this object's address.
   */
  1.1517 +const UConverterSharedData _UTF16Data = {
  1.1518 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
  1.1519 +    NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, 
  1.1520 +    0
  1.1521 +};
  1.1522 +
  /*
   * Function table for "UTF-16,version=2", referenced by _UTF16v2Data.
   * It differs from _UTF16Impl only in the fromUnicode entries, which are fixed
   * to _UTF16BEFromUnicodeWithOffsets instead of the platform-endian function.
   * NOTE(review): the meaning of each slot comes from UConverterImpl, which is
   * declared outside this file — confirm slot order there.
   */
  1.1523 +static const UConverterImpl _UTF16v2Impl = {
  1.1524 +    UCNV_UTF16,
  1.1525 +
  1.1526 +    NULL,
  1.1527 +    NULL,
  1.1528 +
  1.1529 +    _UTF16Open,
  1.1530 +    NULL,
  1.1531 +    _UTF16Reset,
  1.1532 +
  1.1533 +    _UTF16ToUnicodeWithOffsets,
  1.1534 +    _UTF16ToUnicodeWithOffsets,
  1.1535 +    _UTF16BEFromUnicodeWithOffsets,
  1.1536 +    _UTF16BEFromUnicodeWithOffsets,
  1.1537 +    _UTF16GetNextUChar,
  1.1538 +
  1.1539 +    NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
  1.1540 +    _UTF16GetName,
  1.1541 +    NULL,
  1.1542 +    NULL,
  1.1543 +    ucnv_getNonSurrogateUnicodeSet
  1.1544 +};
  1.1545 +
  /*
   * Static description of "UTF-16,version=2", referenced by _UTF16v2Data.
   * Unlike _UTF16StaticData, the byte array is always FF FD, i.e. U+FFFD in
   * big-endian byte order, regardless of the platform. _UTF16Open() copies this
   * variant's subChar bytes into the converter for version 2.
   * NOTE(review): the remaining field meanings come from UConverterStaticData,
   * which is declared outside this file — confirm there.
   */
  1.1546 +static const UConverterStaticData _UTF16v2StaticData = {
  1.1547 +    sizeof(UConverterStaticData),
  1.1548 +    "UTF-16,version=2",
  1.1549 +    1204, /* CCSID for BOM sensitive UTF-16 */
  1.1550 +    UCNV_IBM, UCNV_UTF16, 2, 2,
  1.1551 +    { 0xff, 0xfd, 0, 0 }, 2,
  1.1552 +    FALSE, FALSE,
  1.1553 +    0,
  1.1554 +    0,
  1.1555 +    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1.1556 +};
  1.1557 +
  /*
   * Shared-data record for "UTF-16,version=2"; file-local, ties together
   * _UTF16v2StaticData and _UTF16v2Impl. _UTF16Open() installs it as a
   * converter's sharedData when version 2 is requested.
   */
  1.1558 +static const UConverterSharedData _UTF16v2Data = {
  1.1559 +    sizeof(UConverterSharedData), ~((uint32_t) 0),
  1.1560 +    NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, 
  1.1561 +    0
  1.1562 +};
  1.1563 +
  1.1564 +#endif

mercurial