The Tor Browser: intl/icu/source/common/ucnv

Cloned from the upstream tor-browser origin at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /*

     2 **********************************************************************

     3 *   Copyright (C) 2002-2010, International Business Machines

     4 *   Corporation and others.  All Rights Reserved.

     5 **********************************************************************

     6 *   file name:  ucnv_u16.c

     7 *   encoding:   US-ASCII

     8 *   tab size:   8 (not used)

     9 *   indentation:4

    10 *

    11 *   created on: 2002jul01

    12 *   created by: Markus W. Scherer

    13 *

    14 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.

    15 */

    17 #include "unicode/utypes.h"

    19 #if !UCONFIG_NO_CONVERSION

    21 #include "unicode/ucnv.h"

    22 #include "ucnv_bld.h"

    23 #include "ucnv_cnv.h"

    24 #include "cmemory.h"

    26 enum {

    27     UCNV_NEED_TO_WRITE_BOM=1

    28 };

    30 /*

    31  * The UTF-16 toUnicode implementation is also used for the Java-specific

    32  * "with BOM" variants of UTF-16BE and UTF-16LE.

    33  */

    34 static void

    35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

    36                            UErrorCode *pErrorCode);

    38 /* UTF-16BE ----------------------------------------------------------------- */

    40 #if U_IS_BIG_ENDIAN

    41 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets

    42 #else

    43 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets

    44 #endif

    47 static void

    48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

    49                                UErrorCode *pErrorCode) {

    50     UConverter *cnv;

    51     const UChar *source;

    52     char *target;

    53     int32_t *offsets;

    55     uint32_t targetCapacity, length, sourceIndex;

    56     UChar c, trail;

    57     char overflow[4];

    59     source=pArgs->source;

    60     length=(int32_t)(pArgs->sourceLimit-source);

    61     if(length<=0) {

    62         /* no input, nothing to do */

    63         return;

    64     }

    66     cnv=pArgs->converter;

    68     /* write the BOM if necessary */

    69     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

    70         static const char bom[]={ (char)0xfe, (char)0xff };

    71         ucnv_fromUWriteBytes(cnv,

    72                              bom, 2,

    73                              &pArgs->target, pArgs->targetLimit,

    74                              &pArgs->offsets, -1,

    75                              pErrorCode);

    76         cnv->fromUnicodeStatus=0;

    77     }

    79     target=pArgs->target;

    80     if(target >= pArgs->targetLimit) {

    81         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

    82         return;

    83     }

    85     targetCapacity=(uint32_t)(pArgs->targetLimit-target);

    86     offsets=pArgs->offsets;

    87     sourceIndex=0;

    89     /* c!=0 indicates in several places outside the main loops that a surrogate was found */

    91     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {

    92         /* the last buffer ended with a lead surrogate, output the surrogate pair */

    93         ++source;

    94         --length;

    95         target[0]=(uint8_t)(c>>8);

    96         target[1]=(uint8_t)c;

    97         target[2]=(uint8_t)(trail>>8);

    98         target[3]=(uint8_t)trail;

    99         target+=4;

   100         targetCapacity-=4;

   101         if(offsets!=NULL) {

   102             *offsets++=-1;

   103             *offsets++=-1;

   104             *offsets++=-1;

   105             *offsets++=-1;

   106         }

   107         sourceIndex=1;

   108         cnv->fromUChar32=c=0;

   109     }

   111     if(c==0) {

   112         /* copy an even number of bytes for complete UChars */

   113         uint32_t count=2*length;

   114         if(count>targetCapacity) {

   115             count=targetCapacity&~1;

   116         }

   117         /* count is even */

   118         targetCapacity-=count;

   119         count>>=1;

   120         length-=count;

   122         if(offsets==NULL) {

   123             while(count>0) {

   124                 c=*source++;

   125                 if(U16_IS_SINGLE(c)) {

   126                     target[0]=(uint8_t)(c>>8);

   127                     target[1]=(uint8_t)c;

   128                     target+=2;

   129                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {

   130                     ++source;

   131                     --count;

   132                     target[0]=(uint8_t)(c>>8);

   133                     target[1]=(uint8_t)c;

   134                     target[2]=(uint8_t)(trail>>8);

   135                     target[3]=(uint8_t)trail;

   136                     target+=4;

   137                 } else {

   138                     break;

   139                 }

   140                 --count;

   141             }

   142         } else {

   143             while(count>0) {

   144                 c=*source++;

   145                 if(U16_IS_SINGLE(c)) {

   146                     target[0]=(uint8_t)(c>>8);

   147                     target[1]=(uint8_t)c;

   148                     target+=2;

   149                     *offsets++=sourceIndex;

   150                     *offsets++=sourceIndex++;

   151                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {

   152                     ++source;

   153                     --count;

   154                     target[0]=(uint8_t)(c>>8);

   155                     target[1]=(uint8_t)c;

   156                     target[2]=(uint8_t)(trail>>8);

   157                     target[3]=(uint8_t)trail;

   158                     target+=4;

   159                     *offsets++=sourceIndex;

   160                     *offsets++=sourceIndex;

   161                     *offsets++=sourceIndex;

   162                     *offsets++=sourceIndex;

   163                     sourceIndex+=2;

   164                 } else {

   165                     break;

   166                 }

   167                 --count;

   168             }

   169         }

   171         if(count==0) {

   172             /* done with the loop for complete UChars */

   173             if(length>0 && targetCapacity>0) {

   174                 /*

   175                  * there is more input and some target capacity -

   176                  * it must be targetCapacity==1 because otherwise

   177                  * the above would have copied more;

   178                  * prepare for overflow output

   179                  */

   180                 if(U16_IS_SINGLE(c=*source++)) {

   181                     overflow[0]=(char)(c>>8);

   182                     overflow[1]=(char)c;

   183                     length=2; /* 2 bytes to output */

   184                     c=0;

   185                 /* } else { keep c for surrogate handling, length will be set there */

   186                 }

   187             } else {

   188                 length=0;

   189                 c=0;

   190             }

   191         } else {

   192             /* keep c for surrogate handling, length will be set there */

   193             targetCapacity+=2*count;

   194         }

   195     } else {

   196         length=0; /* from here on, length counts the bytes in overflow[] */

   197     }

   199     if(c!=0) {

   200         /*

   201          * c is a surrogate, and

   202          * - source or target too short

   203          * - or the surrogate is unmatched

   204          */

   205         length=0;

   206         if(U16_IS_SURROGATE_LEAD(c)) {

   207             if(source<pArgs->sourceLimit) {

   208                 if(U16_IS_TRAIL(trail=*source)) {

   209                     /* output the surrogate pair, will overflow (see conditions comment above) */

   210                     ++source;

   211                     overflow[0]=(char)(c>>8);

   212                     overflow[1]=(char)c;

   213                     overflow[2]=(char)(trail>>8);

   214                     overflow[3]=(char)trail;

   215                     length=4; /* 4 bytes to output */

   216                     c=0;

   217                 } else {

   218                     /* unmatched lead surrogate */

   219                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   220                 }

   221             } else {

   222                 /* see if the trail surrogate is in the next buffer */

   223             }

   224         } else {

   225             /* unmatched trail surrogate */

   226             *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   227         }

   228         cnv->fromUChar32=c;

   229     }

   231     if(length>0) {

   232         /* output length bytes with overflow (length>targetCapacity>0) */

   233         ucnv_fromUWriteBytes(cnv,

   234                              overflow, length,

   235                              (char **)&target, pArgs->targetLimit,

   236                              &offsets, sourceIndex,

   237                              pErrorCode);

   238         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);

   239     }

   241     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {

   242         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   243     }

   245     /* write back the updated pointers */

   246     pArgs->source=source;

   247     pArgs->target=(char *)target;

   248     pArgs->offsets=offsets;

   249 }

   251 static void

   252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

   253                              UErrorCode *pErrorCode) {

   254     UConverter *cnv;

   255     const uint8_t *source;

   256     UChar *target;

   257     int32_t *offsets;

   259     uint32_t targetCapacity, length, count, sourceIndex;

   260     UChar c, trail;

   262     if(pArgs->converter->mode<8) {

   263         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);

   264         return;

   265     }

   267     cnv=pArgs->converter;

   268     source=(const uint8_t *)pArgs->source;

   269     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);

   270     if(length<=0 && cnv->toUnicodeStatus==0) {

   271         /* no input, nothing to do */

   272         return;

   273     }

   275     target=pArgs->target;

   276     if(target >= pArgs->targetLimit) {

   277         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   278         return;

   279     }

   281     targetCapacity=(uint32_t)(pArgs->targetLimit-target);

   282     offsets=pArgs->offsets;

   283     sourceIndex=0;

   284     c=0;

   286     /* complete a partial UChar or pair from the last call */

   287     if(cnv->toUnicodeStatus!=0) {

   288         /*

   289          * special case: single byte from a previous buffer,

   290          * where the byte turned out not to belong to a trail surrogate

   291          * and the preceding, unmatched lead surrogate was put into toUBytes[]

   292          * for error handling

   293          */

   294         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;

   295         cnv->toULength=1;

   296         cnv->toUnicodeStatus=0;

   297     }

   298     if((count=cnv->toULength)!=0) {

   299         uint8_t *p=cnv->toUBytes;

   300         do {

   301             p[count++]=*source++;

   302             ++sourceIndex;

   303             --length;

   304             if(count==2) {

   305                 c=((UChar)p[0]<<8)|p[1];

   306                 if(U16_IS_SINGLE(c)) {

   307                     /* output the BMP code point */

   308                     *target++=c;

   309                     if(offsets!=NULL) {

   310                         *offsets++=-1;

   311                     }

   312                     --targetCapacity;

   313                     count=0;

   314                     c=0;

   315                     break;

   316                 } else if(U16_IS_SURROGATE_LEAD(c)) {

   317                     /* continue collecting bytes for the trail surrogate */

   318                     c=0; /* avoid unnecessary surrogate handling below */

   319                 } else {

   320                     /* fall through to error handling for an unmatched trail surrogate */

   321                     break;

   322                 }

   323             } else if(count==4) {

   324                 c=((UChar)p[0]<<8)|p[1];

   325                 trail=((UChar)p[2]<<8)|p[3];

   326                 if(U16_IS_TRAIL(trail)) {

   327                     /* output the surrogate pair */

   328                     *target++=c;

   329                     if(targetCapacity>=2) {

   330                         *target++=trail;

   331                         if(offsets!=NULL) {

   332                             *offsets++=-1;

   333                             *offsets++=-1;

   334                         }

   335                         targetCapacity-=2;

   336                     } else /* targetCapacity==1 */ {

   337                         targetCapacity=0;

   338                         cnv->UCharErrorBuffer[0]=trail;

   339                         cnv->UCharErrorBufferLength=1;

   340                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   341                     }

   342                     count=0;

   343                     c=0;

   344                     break;

   345                 } else {

   346                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */

   347                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   349                     /* back out reading the code unit after it */

   350                     if(((const uint8_t *)pArgs->source-source)>=2) {

   351                         source-=2;

   352                     } else {

   353                         /*

   354                          * if the trail unit's first byte was in a previous buffer, then

   355                          * we need to put it into a special place because toUBytes[] will be

   356                          * used for the lead unit's bytes

   357                          */

   358                         cnv->toUnicodeStatus=0x100|p[2];

   359                         --source;

   360                     }

   361                     cnv->toULength=2;

   363                     /* write back the updated pointers */

   364                     pArgs->source=(const char *)source;

   365                     pArgs->target=target;

   366                     pArgs->offsets=offsets;

   367                     return;

   368                 }

   369             }

   370         } while(length>0);

   371         cnv->toULength=(int8_t)count;

   372     }

   374     /* copy an even number of bytes for complete UChars */

   375     count=2*targetCapacity;

   376     if(count>length) {

   377         count=length&~1;

   378     }

   379     if(c==0 && count>0) {

   380         length-=count;

   381         count>>=1;

   382         targetCapacity-=count;

   383         if(offsets==NULL) {

   384             do {

   385                 c=((UChar)source[0]<<8)|source[1];

   386                 source+=2;

   387                 if(U16_IS_SINGLE(c)) {

   388                     *target++=c;

   389                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

   390                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])

   391                 ) {

   392                     source+=2;

   393                     --count;

   394                     *target++=c;

   395                     *target++=trail;

   396                 } else {

   397                     break;

   398                 }

   399             } while(--count>0);

   400         } else {

   401             do {

   402                 c=((UChar)source[0]<<8)|source[1];

   403                 source+=2;

   404                 if(U16_IS_SINGLE(c)) {

   405                     *target++=c;

   406                     *offsets++=sourceIndex;

   407                     sourceIndex+=2;

   408                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

   409                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])

   410                 ) {

   411                     source+=2;

   412                     --count;

   413                     *target++=c;

   414                     *target++=trail;

   415                     *offsets++=sourceIndex;

   416                     *offsets++=sourceIndex;

   417                     sourceIndex+=4;

   418                 } else {

   419                     break;

   420                 }

   421             } while(--count>0);

   422         }

   424         if(count==0) {

   425             /* done with the loop for complete UChars */

   426             c=0;

   427         } else {

   428             /* keep c for surrogate handling, trail will be set there */

   429             length+=2*(count-1); /* one more byte pair was consumed than count decremented */

   430             targetCapacity+=count;

   431         }

   432     }

   434     if(c!=0) {

   435         /*

   436          * c is a surrogate, and

   437          * - source or target too short

   438          * - or the surrogate is unmatched

   439          */

   440         cnv->toUBytes[0]=(uint8_t)(c>>8);

   441         cnv->toUBytes[1]=(uint8_t)c;

   442         cnv->toULength=2;

   444         if(U16_IS_SURROGATE_LEAD(c)) {

   445             if(length>=2) {

   446                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {

   447                     /* output the surrogate pair, will overflow (see conditions comment above) */

   448                     source+=2;

   449                     length-=2;

   450                     *target++=c;

   451                     if(offsets!=NULL) {

   452                         *offsets++=sourceIndex;

   453                     }

   454                     cnv->UCharErrorBuffer[0]=trail;

   455                     cnv->UCharErrorBufferLength=1;

   456                     cnv->toULength=0;

   457                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   458                 } else {

   459                     /* unmatched lead surrogate */

   460                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   461                 }

   462             } else {

   463                 /* see if the trail surrogate is in the next buffer */

   464             }

   465         } else {

   466             /* unmatched trail surrogate */

   467             *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   468         }

   469     }

   471     if(U_SUCCESS(*pErrorCode)) {

   472         /* check for a remaining source byte */

   473         if(length>0) {

   474             if(targetCapacity==0) {

   475                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   476             } else {

   477                 /* it must be length==1 because otherwise the above would have copied more */

   478                 cnv->toUBytes[cnv->toULength++]=*source++;

   479             }

   480         }

   481     }

   483     /* write back the updated pointers */

   484     pArgs->source=(const char *)source;

   485     pArgs->target=target;

   486     pArgs->offsets=offsets;

   487 }

   489 static UChar32

   490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {

   491     const uint8_t *s, *sourceLimit;

   492     UChar32 c;

   494     if(pArgs->converter->mode<8) {

   495         return UCNV_GET_NEXT_UCHAR_USE_TO_U;

   496     }

   498     s=(const uint8_t *)pArgs->source;

   499     sourceLimit=(const uint8_t *)pArgs->sourceLimit;

   501     if(s>=sourceLimit) {

   502         /* no input */

   503         *err=U_INDEX_OUTOFBOUNDS_ERROR;

   504         return 0xffff;

   505     }

   507     if(s+2>sourceLimit) {

   508         /* only one byte: truncated UChar */

   509         pArgs->converter->toUBytes[0]=*s++;

   510         pArgs->converter->toULength=1;

   511         pArgs->source=(const char *)s;

   512         *err = U_TRUNCATED_CHAR_FOUND;

   513         return 0xffff;

   514     }

   516     /* get one UChar */

   517     c=((UChar32)*s<<8)|s[1];

   518     s+=2;

   520     /* check for a surrogate pair */

   521     if(U_IS_SURROGATE(c)) {

   522         if(U16_IS_SURROGATE_LEAD(c)) {

   523             if(s+2<=sourceLimit) {

   524                 UChar trail;

   526                 /* get a second UChar and see if it is a trail surrogate */

   527                 trail=((UChar)*s<<8)|s[1];

   528                 if(U16_IS_TRAIL(trail)) {

   529                     c=U16_GET_SUPPLEMENTARY(c, trail);

   530                     s+=2;

   531                 } else {

   532                     /* unmatched lead surrogate */

   533                     c=-2;

   534                 }

   535             } else {

   536                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */

   537                 uint8_t *bytes=pArgs->converter->toUBytes;

   538                 s-=2;

   539                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);

   540                 do {

   541                     *bytes++=*s++;

   542                 } while(s<sourceLimit);

   544                 c=0xffff;

   545                 *err=U_TRUNCATED_CHAR_FOUND;

   546             }

   547         } else {

   548             /* unmatched trail surrogate */

   549             c=-2;

   550         }

   552         if(c<0) {

   553             /* write the unmatched surrogate */

   554             uint8_t *bytes=pArgs->converter->toUBytes;

   555             pArgs->converter->toULength=2;

   556             *bytes=*(s-2);

   557             bytes[1]=*(s-1);

   559             c=0xffff;

   560             *err=U_ILLEGAL_CHAR_FOUND;

   561         }

   562     }

   564     pArgs->source=(const char *)s;

   565     return c;

   566 }

   568 static void

   569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {

   570     if(choice<=UCNV_RESET_TO_UNICODE) {

   571         /* reset toUnicode state */

   572         if(UCNV_GET_VERSION(cnv)==0) {

   573             cnv->mode=8; /* no BOM handling */

   574         } else {

   575             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */

   576         }

   577     }

   578     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {

   579         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */

   580         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

   581     }

   582 }

   584 static void

   585 _UTF16BEOpen(UConverter *cnv,

   586              UConverterLoadArgs *pArgs,

   587              UErrorCode *pErrorCode) {

   588     if(UCNV_GET_VERSION(cnv)<=1) {

   589         _UTF16BEReset(cnv, UCNV_RESET_BOTH);

   590     } else {

   591         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

   592     }

   593 }

   595 static const char *

   596 _UTF16BEGetName(const UConverter *cnv) {

   597     if(UCNV_GET_VERSION(cnv)==0) {

   598         return "UTF-16BE";

   599     } else {

   600         return "UTF-16BE,version=1";

   601     }

   602 }

   604 static const UConverterImpl _UTF16BEImpl={

   605     UCNV_UTF16_BigEndian,

   607     NULL,

   608     NULL,

   610     _UTF16BEOpen,

   611     NULL,

   612     _UTF16BEReset,

   614     _UTF16BEToUnicodeWithOffsets,

   615     _UTF16BEToUnicodeWithOffsets,

   616     _UTF16BEFromUnicodeWithOffsets,

   617     _UTF16BEFromUnicodeWithOffsets,

   618     _UTF16BEGetNextUChar,

   620     NULL,

   621     _UTF16BEGetName,

   622     NULL,

   623     NULL,

   624     ucnv_getNonSurrogateUnicodeSet

   625 };

   627 static const UConverterStaticData _UTF16BEStaticData={

   628     sizeof(UConverterStaticData),

   629     "UTF-16BE",

   630     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,

   631     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,

   632     0,

   633     0,

   634     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

   635 };

   638 const UConverterSharedData _UTF16BEData={

   639     sizeof(UConverterSharedData), ~((uint32_t) 0),

   640     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,

   641     0

   642 };

   644 /* UTF-16LE ----------------------------------------------------------------- */

   646 static void

   647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,

   648                                UErrorCode *pErrorCode) {

   649     UConverter *cnv;

   650     const UChar *source;

   651     char *target;

   652     int32_t *offsets;

   654     uint32_t targetCapacity, length, sourceIndex;

   655     UChar c, trail;

   656     char overflow[4];

   658     source=pArgs->source;

   659     length=(int32_t)(pArgs->sourceLimit-source);

   660     if(length<=0) {

   661         /* no input, nothing to do */

   662         return;

   663     }

   665     cnv=pArgs->converter;

   667     /* write the BOM if necessary */

   668     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {

   669         static const char bom[]={ (char)0xff, (char)0xfe };

   670         ucnv_fromUWriteBytes(cnv,

   671                              bom, 2,

   672                              &pArgs->target, pArgs->targetLimit,

   673                              &pArgs->offsets, -1,

   674                              pErrorCode);

   675         cnv->fromUnicodeStatus=0;

   676     }

   678     target=pArgs->target;

   679     if(target >= pArgs->targetLimit) {

   680         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   681         return;

   682     }

   684     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);

   685     offsets=pArgs->offsets;

   686     sourceIndex=0;

   688     /* c!=0 indicates in several places outside the main loops that a surrogate was found */

   690     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {

   691         /* the last buffer ended with a lead surrogate, output the surrogate pair */

   692         ++source;

   693         --length;

   694         target[0]=(uint8_t)c;

   695         target[1]=(uint8_t)(c>>8);

   696         target[2]=(uint8_t)trail;

   697         target[3]=(uint8_t)(trail>>8);

   698         target+=4;

   699         targetCapacity-=4;

   700         if(offsets!=NULL) {

   701             *offsets++=-1;

   702             *offsets++=-1;

   703             *offsets++=-1;

   704             *offsets++=-1;

   705         }

   706         sourceIndex=1;

   707         cnv->fromUChar32=c=0;

   708     }

   710     if(c==0) {

   711         /* copy an even number of bytes for complete UChars */

   712         uint32_t count=2*length;

   713         if(count>targetCapacity) {

   714             count=targetCapacity&~1;

   715         }

   716         /* count is even */

   717         targetCapacity-=count;

   718         count>>=1;

   719         length-=count;

   721         if(offsets==NULL) {

   722             while(count>0) {

   723                 c=*source++;

   724                 if(U16_IS_SINGLE(c)) {

   725                     target[0]=(uint8_t)c;

   726                     target[1]=(uint8_t)(c>>8);

   727                     target+=2;

   728                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {

   729                     ++source;

   730                     --count;

   731                     target[0]=(uint8_t)c;

   732                     target[1]=(uint8_t)(c>>8);

   733                     target[2]=(uint8_t)trail;

   734                     target[3]=(uint8_t)(trail>>8);

   735                     target+=4;

   736                 } else {

   737                     break;

   738                 }

   739                 --count;

   740             }

   741         } else {

   742             while(count>0) {

   743                 c=*source++;

   744                 if(U16_IS_SINGLE(c)) {

   745                     target[0]=(uint8_t)c;

   746                     target[1]=(uint8_t)(c>>8);

   747                     target+=2;

   748                     *offsets++=sourceIndex;

   749                     *offsets++=sourceIndex++;

   750                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {

   751                     ++source;

   752                     --count;

   753                     target[0]=(uint8_t)c;

   754                     target[1]=(uint8_t)(c>>8);

   755                     target[2]=(uint8_t)trail;

   756                     target[3]=(uint8_t)(trail>>8);

   757                     target+=4;

   758                     *offsets++=sourceIndex;

   759                     *offsets++=sourceIndex;

   760                     *offsets++=sourceIndex;

   761                     *offsets++=sourceIndex;

   762                     sourceIndex+=2;

   763                 } else {

   764                     break;

   765                 }

   766                 --count;

   767             }

   768         }

   770         if(count==0) {

   771             /* done with the loop for complete UChars */

   772             if(length>0 && targetCapacity>0) {

   773                 /*

   774                  * there is more input and some target capacity -

   775                  * it must be targetCapacity==1 because otherwise

   776                  * the above would have copied more;

   777                  * prepare for overflow output

   778                  */

   779                 if(U16_IS_SINGLE(c=*source++)) {

   780                     overflow[0]=(char)c;

   781                     overflow[1]=(char)(c>>8);

   782                     length=2; /* 2 bytes to output */

   783                     c=0;

   784                 /* } else { keep c for surrogate handling, length will be set there */

   785                 }

   786             } else {

   787                 length=0;

   788                 c=0;

   789             }

   790         } else {

   791             /* keep c for surrogate handling, length will be set there */

   792             targetCapacity+=2*count;

   793         }

   794     } else {

   795         length=0; /* from here on, length counts the bytes in overflow[] */

   796     }

   798     if(c!=0) {

   799         /*

   800          * c is a surrogate, and

   801          * - source or target too short

   802          * - or the surrogate is unmatched

   803          */

   804         length=0;

   805         if(U16_IS_SURROGATE_LEAD(c)) {

   806             if(source<pArgs->sourceLimit) {

   807                 if(U16_IS_TRAIL(trail=*source)) {

   808                     /* output the surrogate pair, will overflow (see conditions comment above) */

   809                     ++source;

   810                     overflow[0]=(char)c;

   811                     overflow[1]=(char)(c>>8);

   812                     overflow[2]=(char)trail;

   813                     overflow[3]=(char)(trail>>8);

   814                     length=4; /* 4 bytes to output */

   815                     c=0;

   816                 } else {

   817                     /* unmatched lead surrogate */

   818                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   819                 }

   820             } else {

   821                 /* see if the trail surrogate is in the next buffer */

   822             }

   823         } else {

   824             /* unmatched trail surrogate */

   825             *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   826         }

   827         cnv->fromUChar32=c;

   828     }

   830     if(length>0) {

   831         /* output length bytes with overflow (length>targetCapacity>0) */

   832         ucnv_fromUWriteBytes(cnv,

   833                              overflow, length,

   834                              &target, pArgs->targetLimit,

   835                              &offsets, sourceIndex,

   836                              pErrorCode);

   837         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);

   838     }

   840     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {

   841         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   842     }

   844     /* write back the updated pointers */

   845     pArgs->source=source;

   846     pArgs->target=target;

   847     pArgs->offsets=offsets;

   848 }

   850 static void

   851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

   852                              UErrorCode *pErrorCode) {

   853     UConverter *cnv;

   854     const uint8_t *source;

   855     UChar *target;

   856     int32_t *offsets;

   858     uint32_t targetCapacity, length, count, sourceIndex;

   859     UChar c, trail;

   861     if(pArgs->converter->mode<8) {

   862         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);

   863         return;

   864     }

   866     cnv=pArgs->converter;

   867     source=(const uint8_t *)pArgs->source;

   868     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);

   869     if(length<=0 && cnv->toUnicodeStatus==0) {

   870         /* no input, nothing to do */

   871         return;

   872     }

   874     target=pArgs->target;

   875     if(target >= pArgs->targetLimit) {

   876         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   877         return;

   878     }

   880     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);

   881     offsets=pArgs->offsets;

   882     sourceIndex=0;

   883     c=0;

   885     /* complete a partial UChar or pair from the last call */

   886     if(cnv->toUnicodeStatus!=0) {

   887         /*

   888          * special case: single byte from a previous buffer,

   889          * where the byte turned out not to belong to a trail surrogate

   890          * and the preceding, unmatched lead surrogate was put into toUBytes[]

   891          * for error handling

   892          */

   893         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;

   894         cnv->toULength=1;

   895         cnv->toUnicodeStatus=0;

   896     }

   897     if((count=cnv->toULength)!=0) {

   898         uint8_t *p=cnv->toUBytes;

   899         do {

   900             p[count++]=*source++;

   901             ++sourceIndex;

   902             --length;

   903             if(count==2) {

   904                 c=((UChar)p[1]<<8)|p[0];

   905                 if(U16_IS_SINGLE(c)) {

   906                     /* output the BMP code point */

   907                     *target++=c;

   908                     if(offsets!=NULL) {

   909                         *offsets++=-1;

   910                     }

   911                     --targetCapacity;

   912                     count=0;

   913                     c=0;

   914                     break;

   915                 } else if(U16_IS_SURROGATE_LEAD(c)) {

   916                     /* continue collecting bytes for the trail surrogate */

   917                     c=0; /* avoid unnecessary surrogate handling below */

   918                 } else {

   919                     /* fall through to error handling for an unmatched trail surrogate */

   920                     break;

   921                 }

   922             } else if(count==4) {

   923                 c=((UChar)p[1]<<8)|p[0];

   924                 trail=((UChar)p[3]<<8)|p[2];

   925                 if(U16_IS_TRAIL(trail)) {

   926                     /* output the surrogate pair */

   927                     *target++=c;

   928                     if(targetCapacity>=2) {

   929                         *target++=trail;

   930                         if(offsets!=NULL) {

   931                             *offsets++=-1;

   932                             *offsets++=-1;

   933                         }

   934                         targetCapacity-=2;

   935                     } else /* targetCapacity==1 */ {

   936                         targetCapacity=0;

   937                         cnv->UCharErrorBuffer[0]=trail;

   938                         cnv->UCharErrorBufferLength=1;

   939                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

   940                     }

   941                     count=0;

   942                     c=0;

   943                     break;

   944                 } else {

   945                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */

   946                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

   948                     /* back out reading the code unit after it */

   949                     if(((const uint8_t *)pArgs->source-source)>=2) {

   950                         source-=2;

   951                     } else {

   952                         /*

   953                          * if the trail unit's first byte was in a previous buffer, then

   954                          * we need to put it into a special place because toUBytes[] will be

   955                          * used for the lead unit's bytes

   956                          */

   957                         cnv->toUnicodeStatus=0x100|p[2];

   958                         --source;

   959                     }

   960                     cnv->toULength=2;

   962                     /* write back the updated pointers */

   963                     pArgs->source=(const char *)source;

   964                     pArgs->target=target;

   965                     pArgs->offsets=offsets;

   966                     return;

   967                 }

   968             }

   969         } while(length>0);

   970         cnv->toULength=(int8_t)count;

   971     }

   973     /* copy an even number of bytes for complete UChars */

   974     count=2*targetCapacity;

   975     if(count>length) {

   976         count=length&~1;

   977     }

   978     if(c==0 && count>0) {

   979         length-=count;

   980         count>>=1;

   981         targetCapacity-=count;

   982         if(offsets==NULL) {

   983             do {

   984                 c=((UChar)source[1]<<8)|source[0];

   985                 source+=2;

   986                 if(U16_IS_SINGLE(c)) {

   987                     *target++=c;

   988                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

   989                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])

   990                 ) {

   991                     source+=2;

   992                     --count;

   993                     *target++=c;

   994                     *target++=trail;

   995                 } else {

   996                     break;

   997                 }

   998             } while(--count>0);

   999         } else {

  1000             do {

  1001                 c=((UChar)source[1]<<8)|source[0];

  1002                 source+=2;

  1003                 if(U16_IS_SINGLE(c)) {

  1004                     *target++=c;

  1005                     *offsets++=sourceIndex;

  1006                     sourceIndex+=2;

  1007                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&

  1008                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])

  1009                 ) {

  1010                     source+=2;

  1011                     --count;

  1012                     *target++=c;

  1013                     *target++=trail;

  1014                     *offsets++=sourceIndex;

  1015                     *offsets++=sourceIndex;

  1016                     sourceIndex+=4;

  1017                 } else {

  1018                     break;

  1019                 }

  1020             } while(--count>0);

  1021         }

  1023         if(count==0) {

  1024             /* done with the loop for complete UChars */

  1025             c=0;

  1026         } else {

  1027             /* keep c for surrogate handling, trail will be set there */

  1028             length+=2*(count-1); /* one more byte pair was consumed than count decremented */

  1029             targetCapacity+=count;

  1030         }

  1031     }

  1033     if(c!=0) {

  1034         /*

  1035          * c is a surrogate, and

  1036          * - source or target too short

  1037          * - or the surrogate is unmatched

  1038          */

  1039         cnv->toUBytes[0]=(uint8_t)c;

  1040         cnv->toUBytes[1]=(uint8_t)(c>>8);

  1041         cnv->toULength=2;

  1043         if(U16_IS_SURROGATE_LEAD(c)) {

  1044             if(length>=2) {

  1045                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {

  1046                     /* output the surrogate pair, will overflow (see conditions comment above) */

  1047                     source+=2;

  1048                     length-=2;

  1049                     *target++=c;

  1050                     if(offsets!=NULL) {

  1051                         *offsets++=sourceIndex;

  1052                     }

  1053                     cnv->UCharErrorBuffer[0]=trail;

  1054                     cnv->UCharErrorBufferLength=1;

  1055                     cnv->toULength=0;

  1056                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1057                 } else {

  1058                     /* unmatched lead surrogate */

  1059                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1060                 }

  1061             } else {

  1062                 /* see if the trail surrogate is in the next buffer */

  1063             }

  1064         } else {

  1065             /* unmatched trail surrogate */

  1066             *pErrorCode=U_ILLEGAL_CHAR_FOUND;

  1067         }

  1068     }

  1070     if(U_SUCCESS(*pErrorCode)) {

  1071         /* check for a remaining source byte */

  1072         if(length>0) {

  1073             if(targetCapacity==0) {

  1074                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;

  1075             } else {

  1076                 /* it must be length==1 because otherwise the above would have copied more */

  1077                 cnv->toUBytes[cnv->toULength++]=*source++;

  1078             }

  1079         }

  1080     }

  1082     /* write back the updated pointers */

  1083     pArgs->source=(const char *)source;

  1084     pArgs->target=target;

  1085     pArgs->offsets=offsets;

  1086 }

  1088 static UChar32

  1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {

  1090     const uint8_t *s, *sourceLimit;

  1091     UChar32 c;

  1093     if(pArgs->converter->mode<8) {

  1094         return UCNV_GET_NEXT_UCHAR_USE_TO_U;

  1095     }

  1097     s=(const uint8_t *)pArgs->source;

  1098     sourceLimit=(const uint8_t *)pArgs->sourceLimit;

  1100     if(s>=sourceLimit) {

  1101         /* no input */

  1102         *err=U_INDEX_OUTOFBOUNDS_ERROR;

  1103         return 0xffff;

  1104     }

  1106     if(s+2>sourceLimit) {

  1107         /* only one byte: truncated UChar */

  1108         pArgs->converter->toUBytes[0]=*s++;

  1109         pArgs->converter->toULength=1;

  1110         pArgs->source=(const char *)s;

  1111         *err = U_TRUNCATED_CHAR_FOUND;

  1112         return 0xffff;

  1113     }

  1115     /* get one UChar */

  1116     c=((UChar32)s[1]<<8)|*s;

  1117     s+=2;

  1119     /* check for a surrogate pair */

  1120     if(U_IS_SURROGATE(c)) {

  1121         if(U16_IS_SURROGATE_LEAD(c)) {

  1122             if(s+2<=sourceLimit) {

  1123                 UChar trail;

  1125                 /* get a second UChar and see if it is a trail surrogate */

  1126                 trail=((UChar)s[1]<<8)|*s;

  1127                 if(U16_IS_TRAIL(trail)) {

  1128                     c=U16_GET_SUPPLEMENTARY(c, trail);

  1129                     s+=2;

  1130                 } else {

  1131                     /* unmatched lead surrogate */

  1132                     c=-2;

  1133                 }

  1134             } else {

  1135                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */

  1136                 uint8_t *bytes=pArgs->converter->toUBytes;

  1137                 s-=2;

  1138                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);

  1139                 do {

  1140                     *bytes++=*s++;

  1141                 } while(s<sourceLimit);

  1143                 c=0xffff;

  1144                 *err=U_TRUNCATED_CHAR_FOUND;

  1145             }

  1146         } else {

  1147             /* unmatched trail surrogate */

  1148             c=-2;

  1149         }

  1151         if(c<0) {

  1152             /* write the unmatched surrogate */

  1153             uint8_t *bytes=pArgs->converter->toUBytes;

  1154             pArgs->converter->toULength=2;

  1155             *bytes=*(s-2);

  1156             bytes[1]=*(s-1);

  1158             c=0xffff;

  1159             *err=U_ILLEGAL_CHAR_FOUND;

  1160         }

  1161     }

  1163     pArgs->source=(const char *)s;

  1164     return c;

  1165 }

  1167 static void

  1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {

  1169     if(choice<=UCNV_RESET_TO_UNICODE) {

  1170         /* reset toUnicode state */

  1171         if(UCNV_GET_VERSION(cnv)==0) {

  1172             cnv->mode=8; /* no BOM handling */

  1173         } else {

  1174             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */

  1175         }

  1176     }

  1177     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {

  1178         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */

  1179         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

  1180     }

  1181 }

  1183 static void

  1184 _UTF16LEOpen(UConverter *cnv,

  1185              UConverterLoadArgs *pArgs,

  1186              UErrorCode *pErrorCode) {

  1187     if(UCNV_GET_VERSION(cnv)<=1) {

  1188         _UTF16LEReset(cnv, UCNV_RESET_BOTH);

  1189     } else {

  1190         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

  1191     }

  1192 }

  1194 static const char *

  1195 _UTF16LEGetName(const UConverter *cnv) {

  1196     if(UCNV_GET_VERSION(cnv)==0) {

  1197         return "UTF-16LE";

  1198     } else {

  1199         return "UTF-16LE,version=1";

  1200     }

  1201 }

  1203 static const UConverterImpl _UTF16LEImpl={

  1204     UCNV_UTF16_LittleEndian,

  1206     NULL,

  1207     NULL,

  1209     _UTF16LEOpen,

  1210     NULL,

  1211     _UTF16LEReset,

  1213     _UTF16LEToUnicodeWithOffsets,

  1214     _UTF16LEToUnicodeWithOffsets,

  1215     _UTF16LEFromUnicodeWithOffsets,

  1216     _UTF16LEFromUnicodeWithOffsets,

  1217     _UTF16LEGetNextUChar,

  1219     NULL,

  1220     _UTF16LEGetName,

  1221     NULL,

  1222     NULL,

  1223     ucnv_getNonSurrogateUnicodeSet

  1224 };

  1227 static const UConverterStaticData _UTF16LEStaticData={

  1228     sizeof(UConverterStaticData),

  1229     "UTF-16LE",

  1230     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,

  1231     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,

  1232     0,

  1233     0,

  1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1235 };

  1238 const UConverterSharedData _UTF16LEData={

  1239     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1240     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,

  1241     0

  1242 };

  1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */

  1246 /*

  1247  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE

  1248  * accordingly.

  1249  * This is a simpler version of the UTF-32 converter, with

  1250  * fewer states for shorter BOMs.

  1251  *

  1252  * State values:

  1253  * 0    initial state

  1254  * 1    saw first byte

  1255  * 2..5 -

  1256  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1

  1257  * 8    UTF-16BE mode

  1258  * 9    UTF-16LE mode

  1259  *

  1260  * During detection: state==number of initial bytes seen so far.

  1261  *

  1262  * On output, emit U+FEFF as the first code point.

  1263  *

  1264  * Variants:

  1265  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.

  1266  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and

  1267  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.

  1268  */

  1270 static void

  1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {

  1272     if(choice<=UCNV_RESET_TO_UNICODE) {

  1273         /* reset toUnicode: state=0 */

  1274         cnv->mode=0;

  1275     }

  1276     if(choice!=UCNV_RESET_TO_UNICODE) {

  1277         /* reset fromUnicode: prepare to output the UTF-16PE BOM */

  1278         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;

  1279     }

  1280 }

  1282 static const UConverterSharedData _UTF16v2Data;

  1284 static void

  1285 _UTF16Open(UConverter *cnv,

  1286            UConverterLoadArgs *pArgs,

  1287            UErrorCode *pErrorCode) {

  1288     if(UCNV_GET_VERSION(cnv)<=2) {

  1289         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {

  1290             /*

  1291              * Switch implementation, and switch the staticData that's different

  1292              * and was copied into the UConverter.

  1293              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)

  1294              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.

  1295              */

  1296             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;

  1297             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);

  1298         }

  1299         _UTF16Reset(cnv, UCNV_RESET_BOTH);

  1300     } else {

  1301         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;

  1302     }

  1303 }

  1305 static const char *

  1306 _UTF16GetName(const UConverter *cnv) {

  1307     if(UCNV_GET_VERSION(cnv)==0) {

  1308         return "UTF-16";

  1309     } else if(UCNV_GET_VERSION(cnv)==1) {

  1310         return "UTF-16,version=1";

  1311     } else {

  1312         return "UTF-16,version=2";

  1313     }

  1314 }

  1316 const UConverterSharedData _UTF16Data;

  1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)

  1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)

  1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)

  1322 static void

  1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,

  1324                            UErrorCode *pErrorCode) {

  1325     UConverter *cnv=pArgs->converter;

  1326     const char *source=pArgs->source;

  1327     const char *sourceLimit=pArgs->sourceLimit;

  1328     int32_t *offsets=pArgs->offsets;

  1330     int32_t state, offsetDelta;

  1331     uint8_t b;

  1333     state=cnv->mode;

  1335     /*

  1336      * If we detect a BOM in this buffer, then we must add the BOM size to the

  1337      * offsets because the actual converter function will not see and count the BOM.

  1338      * offsetDelta will have the number of the BOM bytes that are in the current buffer.

  1339      */

  1340     offsetDelta=0;

  1342     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {

  1343         switch(state) {

  1344         case 0:

  1345             cnv->toUBytes[0]=(uint8_t)*source++;

  1346             cnv->toULength=1;

  1347             state=1;

  1348             break;

  1349         case 1:

  1350             /*

  1351              * Only inside this switch case can the state variable

  1352              * temporarily take two additional values:

  1353              * 6: BOM error, continue with BE

  1354              * 7: BOM error, continue with LE

  1355              */

  1356             b=*source;

  1357             if(cnv->toUBytes[0]==0xfe && b==0xff) {

  1358                 if(IS_UTF16LE(cnv)) {

  1359                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */

  1360                 } else {

  1361                     state=8; /* detect UTF-16BE */

  1362                 }

  1363             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {

  1364                 if(IS_UTF16BE(cnv)) {

  1365                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */

  1366                 } else {

  1367                     state=9; /* detect UTF-16LE */

  1368                 }

  1369             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {

  1370                 state=6; /* illegal missing BOM for Java "Unicode" */

  1371             }

  1372             if(state>=8) {

  1373                 /* BOM detected, consume it */

  1374                 ++source;

  1375                 cnv->toULength=0;

  1376                 offsetDelta=(int32_t)(source-pArgs->source);

  1377             } else if(state<6) {

  1378                 /* ok: no BOM, and not a reverse BOM */

  1379                 if(source!=pArgs->source) {

  1380                     /* reset the source for a correct first offset */

  1381                     source=pArgs->source;

  1382                     cnv->toULength=0;

  1383                 }

  1384                 if(IS_UTF16LE(cnv)) {

  1385                     /* Make Java "UnicodeLittle" default to LE. */

  1386                     state=9;

  1387                 } else {

  1388                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */

  1389                     state=8;

  1390                 }

  1391             } else {

  1392                 /*

  1393                  * error: missing BOM, or reverse BOM

  1394                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.

  1395                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.

  1396                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.

  1397                  */

  1398                 /* report the non-BOM or reverse BOM as an illegal sequence */

  1399                 cnv->toUBytes[1]=b;

  1400                 cnv->toULength=2;

  1401                 pArgs->source=source+1;

  1402                 /* continue with conversion if the callback resets the error */

  1403                 /*

  1404                  * Make Java "Unicode" default to BE like standard UTF-16.

  1405                  * Make Java "UnicodeBig" and "UnicodeLittle" default

  1406                  * to their normal endiannesses.

  1407                  */

  1408                 cnv->mode=state+2;

  1409                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;

  1410                 return;

  1411             }

  1412             /* convert the rest of the stream */

  1413             cnv->mode=state;

  1414             continue;

  1415         case 8:

  1416             /* call UTF-16BE */

  1417             pArgs->source=source;

  1418             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);

  1419             source=pArgs->source;

  1420             break;

  1421         case 9:

  1422             /* call UTF-16LE */

  1423             pArgs->source=source;

  1424             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);

  1425             source=pArgs->source;

  1426             break;

  1427         default:

  1428             break; /* does not occur */

  1429         }

  1430     }

  1432     /* add BOM size to offsets - see comment at offsetDelta declaration */

  1433     if(offsets!=NULL && offsetDelta!=0) {

  1434         int32_t *offsetsLimit=pArgs->offsets;

  1435         while(offsets<offsetsLimit) {

  1436             *offsets++ += offsetDelta;

  1437         }

  1438     }

  1440     pArgs->source=source;

  1442     if(source==sourceLimit && pArgs->flush) {

  1443         /* handle truncated input */

  1444         switch(state) {

  1445         case 0:

  1446             break; /* no input at all, nothing to do */

  1447         case 8:

  1448             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);

  1449             break;

  1450         case 9:

  1451             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);

  1452             break;

  1453         default:

  1454             /* 0<state<8: framework will report truncation, nothing to do here */

  1455             break;

  1456         }

  1457     }

  1459     cnv->mode=state;

  1460 }

  1462 static UChar32

  1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,

  1464                    UErrorCode *pErrorCode) {

  1465     switch(pArgs->converter->mode) {

  1466     case 8:

  1467         return _UTF16BEGetNextUChar(pArgs, pErrorCode);

  1468     case 9:

  1469         return _UTF16LEGetNextUChar(pArgs, pErrorCode);

  1470     default:

  1471         return UCNV_GET_NEXT_UCHAR_USE_TO_U;

  1472     }

  1473 }

  1475 static const UConverterImpl _UTF16Impl = {

  1476     UCNV_UTF16,

  1478     NULL,

  1479     NULL,

  1481     _UTF16Open,

  1482     NULL,

  1483     _UTF16Reset,

  1485     _UTF16ToUnicodeWithOffsets,

  1486     _UTF16ToUnicodeWithOffsets,

  1487     _UTF16PEFromUnicodeWithOffsets,

  1488     _UTF16PEFromUnicodeWithOffsets,

  1489     _UTF16GetNextUChar,

  1491     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

  1492     _UTF16GetName,

  1493     NULL,

  1494     NULL,

  1495     ucnv_getNonSurrogateUnicodeSet

  1496 };

  1498 static const UConverterStaticData _UTF16StaticData = {

  1499     sizeof(UConverterStaticData),

  1500     "UTF-16",

  1501     1204, /* CCSID for BOM sensitive UTF-16 */

  1502     UCNV_IBM, UCNV_UTF16, 2, 2,

  1503 #if U_IS_BIG_ENDIAN

  1504     { 0xff, 0xfd, 0, 0 }, 2,

  1505 #else

  1506     { 0xfd, 0xff, 0, 0 }, 2,

  1507 #endif

  1508     FALSE, FALSE,

  1509     0,

  1510     0,

  1511     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1512 };

  1514 const UConverterSharedData _UTF16Data = {

  1515     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1516     NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,

  1517     0

  1518 };

  1520 static const UConverterImpl _UTF16v2Impl = {

  1521     UCNV_UTF16,

  1523     NULL,

  1524     NULL,

  1526     _UTF16Open,

  1527     NULL,

  1528     _UTF16Reset,

  1530     _UTF16ToUnicodeWithOffsets,

  1531     _UTF16ToUnicodeWithOffsets,

  1532     _UTF16BEFromUnicodeWithOffsets,

  1533     _UTF16BEFromUnicodeWithOffsets,

  1534     _UTF16GetNextUChar,

  1536     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */

  1537     _UTF16GetName,

  1538     NULL,

  1539     NULL,

  1540     ucnv_getNonSurrogateUnicodeSet

  1541 };

  1543 static const UConverterStaticData _UTF16v2StaticData = {

  1544     sizeof(UConverterStaticData),

  1545     "UTF-16,version=2",

  1546     1204, /* CCSID for BOM sensitive UTF-16 */

  1547     UCNV_IBM, UCNV_UTF16, 2, 2,

  1548     { 0xff, 0xfd, 0, 0 }, 2,

  1549     FALSE, FALSE,

  1550     0,

  1551     0,

  1552     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */

  1553 };

  1555 static const UConverterSharedData _UTF16v2Data = {

  1556     sizeof(UConverterSharedData), ~((uint32_t) 0),

  1557     NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,

  1558     0

  1559 };

  1561 #endif

The Tor Browser / file revision

intl/icu/source/common/ucnv_u16.c@6474c204b198

intl/icu/source/common/ucnv_u16.c