intl/icu/source/common/ucnv_u16.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /*  
     2 **********************************************************************
     3 *   Copyright (C) 2002-2010, International Business Machines
     4 *   Corporation and others.  All Rights Reserved.
     5 **********************************************************************
     6 *   file name:  ucnv_u16.c
     7 *   encoding:   US-ASCII
     8 *   tab size:   8 (not used)
     9 *   indentation:4
    10 *
    11 *   created on: 2002jul01
    12 *   created by: Markus W. Scherer
    13 *
    14 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
    15 */
    17 #include "unicode/utypes.h"
    19 #if !UCONFIG_NO_CONVERSION
    21 #include "unicode/ucnv.h"
    22 #include "ucnv_bld.h"
    23 #include "ucnv_cnv.h"
    24 #include "cmemory.h"
    /*
     * Value stored in UConverter.fromUnicodeStatus to request that the
     * fromUnicode functions in this file emit a byte order mark before any
     * other output. Those functions set the status back to 0 once they have
     * written the BOM; the reset function sets it for converter version 1.
     */
    26 enum {
    27     UCNV_NEED_TO_WRITE_BOM=1
    28 };
    30 /*
    31  * The UTF-16 toUnicode implementation is also used for the Java-specific
    32  * "with BOM" variants of UTF-16BE and UTF-16LE.
        *
        * Forward declaration: the UTF-16BE and UTF-16LE toUnicode functions
        * below hand the whole call over to this function while
        * converter->mode<8. The definition is not in this part of the file.
    33  */
    34 static void
    35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    36                            UErrorCode *pErrorCode);
    38 /* UTF-16BE ----------------------------------------------------------------- */
    /*
     * "PE" alias: resolves to the big-endian fromUnicode function when
     * U_IS_BIG_ENDIAN is non-zero and to the little-endian one otherwise.
     * The alias is not referenced in the part of the file shown here;
     * presumably the platform-endian UTF-16 converter further down uses it
     * (TODO confirm).
     */
    40 #if U_IS_BIG_ENDIAN
    41 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
    42 #else
    43 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
    44 #endif
    /*
     * Convert UChars from pArgs->source into big-endian UTF-16 bytes at
     * pArgs->target, writing one source index per output byte to
     * pArgs->offsets when that pointer is not NULL.
     *
     * Behavior visible in this function:
     * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FE FF are
     *   written first and the status is cleared.
     * - A lead surrogate that ends the input is kept in cnv->fromUChar32 and is
     *   combined with the first unit of the next call.
     * - An unmatched surrogate sets *pErrorCode=U_ILLEGAL_CHAR_FOUND and is
     *   left in cnv->fromUChar32.
     * - When a character does not fit completely, its bytes are collected in
     *   overflow[] and passed to ucnv_fromUWriteBytes() (presumably that helper
     *   writes what fits and keeps the rest in the converter - confirm).
     * - U_BUFFER_OVERFLOW_ERROR is set when the target is full while input
     *   remains.
     *
     * pArgs->source, pArgs->target and pArgs->offsets are updated on the normal
     * exit path. The two early returns skip the write-back; the local pointers
     * have not moved at that point.
     */
    47 static void
    48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    49                                UErrorCode *pErrorCode) {
    50     UConverter *cnv;
    51     const UChar *source;
    52     char *target;
    53     int32_t *offsets;
    55     uint32_t targetCapacity, length, sourceIndex;
    56     UChar c, trail;
    57     char overflow[4];
    59     source=pArgs->source;
           /*
            * NOTE(review): length is uint32_t, so the "<=0" test below is true
            * only for 0; a negative pointer difference would wrap to a large
            * value instead of being rejected - confirm callers never pass
            * sourceLimit<source.
            */
    60     length=(int32_t)(pArgs->sourceLimit-source);
    61     if(length<=0) {
    62         /* no input, nothing to do */
    63         return;
    64     }
    66     cnv=pArgs->converter;
    68     /* write the BOM if necessary */
    69     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    70         static const char bom[]={ (char)0xfe, (char)0xff };
               /* this call advances pArgs->target/offsets directly; offset value -1 is passed for the BOM bytes */
    71         ucnv_fromUWriteBytes(cnv,
    72                              bom, 2,
    73                              &pArgs->target, pArgs->targetLimit,
    74                              &pArgs->offsets, -1,
    75                              pErrorCode);
    76         cnv->fromUnicodeStatus=0;
    77     }
    79     target=pArgs->target;
    80     if(target >= pArgs->targetLimit) {
    81         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    82         return;
    83     }
    85     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    86     offsets=pArgs->offsets;
    87     sourceIndex=0;
    89     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
    91     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
    92         /* the last buffer ended with a lead surrogate, output the surrogate pair */
    93         ++source;
    94         --length;
    95         target[0]=(uint8_t)(c>>8);
    96         target[1]=(uint8_t)c;
    97         target[2]=(uint8_t)(trail>>8);
    98         target[3]=(uint8_t)trail;
    99         target+=4;
   100         targetCapacity-=4;
   101         if(offsets!=NULL) {
                   /* all four bytes get offset -1: the pair started in a previous buffer */
   102             *offsets++=-1;
   103             *offsets++=-1;
   104             *offsets++=-1;
   105             *offsets++=-1;
   106         }
               /* one UChar (the trail) of this buffer has been consumed */
   107         sourceIndex=1;
   108         cnv->fromUChar32=c=0;
   109     }
   111     if(c==0) {
   112         /* copy an even number of bytes for complete UChars */
   113         uint32_t count=2*length;
   114         if(count>targetCapacity) {
                   /* round the available capacity down to an even byte count */
   115             count=targetCapacity&~1;
   116         }
   117         /* count is even */
   118         targetCapacity-=count;
               /* from here on count is the number of UChars the loop may copy; length is what remains after them */
   119         count>>=1;
   120         length-=count;
   122         if(offsets==NULL) {
   123             while(count>0) {
   124                 c=*source++;
   125                 if(U16_IS_SINGLE(c)) {
   126                     target[0]=(uint8_t)(c>>8);
   127                     target[1]=(uint8_t)c;
   128                     target+=2;
                       /* count>=2: the trail must also be within the span that is both available and fits */
   129                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   130                     ++source;
   131                     --count;
   132                     target[0]=(uint8_t)(c>>8);
   133                     target[1]=(uint8_t)c;
   134                     target[2]=(uint8_t)(trail>>8);
   135                     target[3]=(uint8_t)trail;
   136                     target+=4;
   137                 } else {
                           /* surrogate that cannot be completed here: leave the loop with c!=0 and count>0 */
   138                     break;
   139                 }
   140                 --count;
   141             }
   142         } else {
                   /* same loop as above, additionally recording one source index per output byte */
   143             while(count>0) {
   144                 c=*source++;
   145                 if(U16_IS_SINGLE(c)) {
   146                     target[0]=(uint8_t)(c>>8);
   147                     target[1]=(uint8_t)c;
   148                     target+=2;
   149                     *offsets++=sourceIndex;
   150                     *offsets++=sourceIndex++;
   151                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   152                     ++source;
   153                     --count;
   154                     target[0]=(uint8_t)(c>>8);
   155                     target[1]=(uint8_t)c;
   156                     target[2]=(uint8_t)(trail>>8);
   157                     target[3]=(uint8_t)trail;
   158                     target+=4;
   159                     *offsets++=sourceIndex;
   160                     *offsets++=sourceIndex;
   161                     *offsets++=sourceIndex;
   162                     *offsets++=sourceIndex;
   163                     sourceIndex+=2;
   164                 } else {
   165                     break;
   166                 }
   167                 --count;
   168             }
   169         }
   171         if(count==0) {
   172             /* done with the loop for complete UChars */
   173             if(length>0 && targetCapacity>0) {
   174                 /*
   175                  * there is more input and some target capacity -
   176                  * it must be targetCapacity==1 because otherwise
   177                  * the above would have copied more;
   178                  * prepare for overflow output
   179                  */
   180                 if(U16_IS_SINGLE(c=*source++)) {
   181                     overflow[0]=(char)(c>>8);
   182                     overflow[1]=(char)c;
   183                     length=2; /* 2 bytes to output */
   184                     c=0;
   185                 /* } else { keep c for surrogate handling, length will be set there */
   186                 }
   187             } else {
   188                 length=0;
   189                 c=0;
   190             }
   191         } else {
   192             /* keep c for surrogate handling, length will be set there */
                   /* give back the capacity reserved for the UChars that were not written */
   193             targetCapacity+=2*count;
   194         }
   195     } else {
   196         length=0; /* from here on, length counts the bytes in overflow[] */
   197     }
   199     if(c!=0) {
   200         /*
   201          * c is a surrogate, and
   202          * - source or target too short
   203          * - or the surrogate is unmatched
   204          */
   205         length=0;
   206         if(U16_IS_SURROGATE_LEAD(c)) {
   207             if(source<pArgs->sourceLimit) {
   208                 if(U16_IS_TRAIL(trail=*source)) {
   209                     /* output the surrogate pair, will overflow (see conditions comment above) */
   210                     ++source;
   211                     overflow[0]=(char)(c>>8);
   212                     overflow[1]=(char)c;
   213                     overflow[2]=(char)(trail>>8);
   214                     overflow[3]=(char)trail;
   215                     length=4; /* 4 bytes to output */
   216                     c=0;
   217                 } else {
   218                     /* unmatched lead surrogate */
   219                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   220                 }
   221             } else {
   222                 /* see if the trail surrogate is in the next buffer */
   223             }
   224         } else {
   225             /* unmatched trail surrogate */
   226             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   227         }
               /* stores 0 after a completed pair, otherwise the pending or offending surrogate */
   228         cnv->fromUChar32=c;
   229     }
   231     if(length>0) {
   232         /* output length bytes with overflow (length>targetCapacity>0) */
   233         ucnv_fromUWriteBytes(cnv,
   234                              overflow, length,
   235                              (char **)&target, pArgs->targetLimit,
   236                              &offsets, sourceIndex,
   237                              pErrorCode);
   238         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
   239     }
   241     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
   242         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   243     }
   245     /* write back the updated pointers */
   246     pArgs->source=source;
   247     pArgs->target=(char *)target;
   248     pArgs->offsets=offsets;
   249 }
   /*
    * Convert big-endian UTF-16 bytes from pArgs->source into UChars at
    * pArgs->target, writing one source byte index per output UChar to
    * pArgs->offsets when that pointer is not NULL.
    *
    * Behavior visible in this function:
    * - While converter->mode<8 the whole call is delegated to
    *   _UTF16ToUnicodeWithOffsets() (the reset function in this file labels
    *   mode 8 as "no BOM handling").
    * - Bytes of an incomplete code unit or surrogate pair are carried between
    *   calls in cnv->toUBytes[] / cnv->toULength.
    * - cnv->toUnicodeStatus, when non-zero, holds 0x100|byte for one pending
    *   byte that could not stay in toUBytes[] because toUBytes[] was needed to
    *   report an unmatched lead surrogate.
    * - Unmatched surrogates set U_ILLEGAL_CHAR_FOUND with the offending bytes
    *   in toUBytes[].
    * - When only the lead of a pair fits, the trail goes to
    *   cnv->UCharErrorBuffer and U_BUFFER_OVERFLOW_ERROR is set; the same error
    *   is set when the target is full while input remains.
    *
    * pArgs->source, pArgs->target and pArgs->offsets are updated before
    * returning, except on the early returns at the top.
    */
   251 static void
   252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   253                              UErrorCode *pErrorCode) {
   254     UConverter *cnv;
   255     const uint8_t *source;
   256     UChar *target;
   257     int32_t *offsets;
   259     uint32_t targetCapacity, length, count, sourceIndex;
   260     UChar c, trail;
   262     if(pArgs->converter->mode<8) {
   263         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
   264         return;
   265     }
   267     cnv=pArgs->converter;
   268     source=(const uint8_t *)pArgs->source;
           /* NOTE(review): length is uint32_t, so "length<=0" below is true only for 0 */
   269     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
   270     if(length<=0 && cnv->toUnicodeStatus==0) {
   271         /* no input, nothing to do */
   272         return;
   273     }
   275     target=pArgs->target;
   276     if(target >= pArgs->targetLimit) {
   277         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   278         return;
   279     }
   281     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
   282     offsets=pArgs->offsets;
   283     sourceIndex=0;
   284     c=0;
   286     /* complete a partial UChar or pair from the last call */
   287     if(cnv->toUnicodeStatus!=0) {
   288         /*
   289          * special case: single byte from a previous buffer,
   290          * where the byte turned out not to belong to a trail surrogate
   291          * and the preceding, unmatched lead surrogate was put into toUBytes[]
   292          * for error handling
   293          */
   294         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
   295         cnv->toULength=1;
   296         cnv->toUnicodeStatus=0;
   297     }
   298     if((count=cnv->toULength)!=0) {
   299         uint8_t *p=cnv->toUBytes;
               /*
                * NOTE(review): the first iteration reads *source before any
                * check of length. The early return above lets a call with
                * length==0 reach this point when toUnicodeStatus!=0 - confirm
                * that callers never do that, since --length would then wrap.
                */
   300         do {
   301             p[count++]=*source++;
   302             ++sourceIndex;
   303             --length;
   304             if(count==2) {
   305                 c=((UChar)p[0]<<8)|p[1];
   306                 if(U16_IS_SINGLE(c)) {
   307                     /* output the BMP code point */
   308                     *target++=c;
   309                     if(offsets!=NULL) {
   310                         *offsets++=-1;
   311                     }
   312                     --targetCapacity;
   313                     count=0;
   314                     c=0;
   315                     break;
   316                 } else if(U16_IS_SURROGATE_LEAD(c)) {
   317                     /* continue collecting bytes for the trail surrogate */
   318                     c=0; /* avoid unnecessary surrogate handling below */
   319                 } else {
   320                     /* fall through to error handling for an unmatched trail surrogate */
   321                     break;
   322                 }
   323             } else if(count==4) {
   324                 c=((UChar)p[0]<<8)|p[1];
   325                 trail=((UChar)p[2]<<8)|p[3];
   326                 if(U16_IS_TRAIL(trail)) {
   327                     /* output the surrogate pair */
   328                     *target++=c;
   329                     if(targetCapacity>=2) {
   330                         *target++=trail;
   331                         if(offsets!=NULL) {
   332                             *offsets++=-1;
   333                             *offsets++=-1;
   334                         }
   335                         targetCapacity-=2;
   336                     } else /* targetCapacity==1 */ {
   337                         targetCapacity=0;
   338                         cnv->UCharErrorBuffer[0]=trail;
   339                         cnv->UCharErrorBufferLength=1;
   340                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   341                     }
   342                     count=0;
   343                     c=0;
   344                     break;
   345                 } else {
   346                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
   347                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   349                     /* back out reading the code unit after it */
                           /*
                            * NOTE(review): source starts at pArgs->source and is
                            * only incremented in this function, so
                            * pArgs->source-source is never positive and the
                            * first branch looks unreachable; the operands may be
                            * reversed. The else branch appears to cope with both
                            * situations (it stashes p[2] and un-reads p[3]) -
                            * confirm before changing.
                            */
   350                     if(((const uint8_t *)pArgs->source-source)>=2) {
   351                         source-=2;
   352                     } else {
   353                         /*
   354                          * if the trail unit's first byte was in a previous buffer, then
   355                          * we need to put it into a special place because toUBytes[] will be
   356                          * used for the lead unit's bytes
   357                          */
   358                         cnv->toUnicodeStatus=0x100|p[2];
   359                         --source;
   360                     }
   361                     cnv->toULength=2;
   363                     /* write back the updated pointers */
   364                     pArgs->source=(const char *)source;
   365                     pArgs->target=target;
   366                     pArgs->offsets=offsets;
   367                     return;
   368                 }
   369             }
   370         } while(length>0);
   371         cnv->toULength=(int8_t)count;
   372     }
   374     /* copy an even number of bytes for complete UChars */
           /* count is a byte count here: as many bytes as fit the target, limited to the even part of the input */
   375     count=2*targetCapacity;
   376     if(count>length) {
   377         count=length&~1;
   378     }
   379     if(c==0 && count>0) {
   380         length-=count;
               /* from here on count is the number of UChars the loop may produce */
   381         count>>=1;
   382         targetCapacity-=count;
   383         if(offsets==NULL) {
   384             do {
   385                 c=((UChar)source[0]<<8)|source[1];
   386                 source+=2;
   387                 if(U16_IS_SINGLE(c)) {
   388                     *target++=c;
   389                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   390                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
   391                 ) {
   392                     source+=2;
   393                     --count;
   394                     *target++=c;
   395                     *target++=trail;
   396                 } else {
                           /* surrogate that cannot be completed here: leave with c!=0 and count>0 */
   397                     break;
   398                 }
   399             } while(--count>0);
   400         } else {
                   /* same loop as above, additionally recording the byte index of each output UChar */
   401             do {
   402                 c=((UChar)source[0]<<8)|source[1];
   403                 source+=2;
   404                 if(U16_IS_SINGLE(c)) {
   405                     *target++=c;
   406                     *offsets++=sourceIndex;
   407                     sourceIndex+=2;
   408                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   409                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
   410                 ) {
   411                     source+=2;
   412                     --count;
   413                     *target++=c;
   414                     *target++=trail;
   415                     *offsets++=sourceIndex;
   416                     *offsets++=sourceIndex;
   417                     sourceIndex+=4;
   418                 } else {
   419                     break;
   420                 }
   421             } while(--count>0);
   422         }
   424         if(count==0) {
   425             /* done with the loop for complete UChars */
   426             c=0;
   427         } else {
   428             /* keep c for surrogate handling, trail will be set there */
   429             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
                   /* give back the target slots reserved for the UChars that were not written */
   430             targetCapacity+=count;
   431         }
   432     }
   434     if(c!=0) {
   435         /*
   436          * c is a surrogate, and
   437          * - source or target too short
   438          * - or the surrogate is unmatched
   439          */
               /* keep the surrogate's bytes so they can be reported or completed by the next call */
   440         cnv->toUBytes[0]=(uint8_t)(c>>8);
   441         cnv->toUBytes[1]=(uint8_t)c;
   442         cnv->toULength=2;
   444         if(U16_IS_SURROGATE_LEAD(c)) {
   445             if(length>=2) {
   446                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
   447                     /* output the surrogate pair, will overflow (see conditions comment above) */
   448                     source+=2;
   449                     length-=2;
   450                     *target++=c;
   451                     if(offsets!=NULL) {
   452                         *offsets++=sourceIndex;
   453                     }
   454                     cnv->UCharErrorBuffer[0]=trail;
   455                     cnv->UCharErrorBufferLength=1;
   456                     cnv->toULength=0;
   457                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   458                 } else {
   459                     /* unmatched lead surrogate */
   460                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   461                 }
   462             } else {
   463                 /* see if the trail surrogate is in the next buffer */
   464             }
   465         } else {
   466             /* unmatched trail surrogate */
   467             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   468         }
   469     }
   471     if(U_SUCCESS(*pErrorCode)) {
   472         /* check for a remaining source byte */
   473         if(length>0) {
   474             if(targetCapacity==0) {
   475                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   476             } else {
   477                 /* it must be length==1 because otherwise the above would have copied more */
   478                 cnv->toUBytes[cnv->toULength++]=*source++;
   479             }
   480         }
   481     }
   483     /* write back the updated pointers */
   484     pArgs->source=(const char *)source;
   485     pArgs->target=target;
   486     pArgs->offsets=offsets;
   487 }
   /*
    * Read one code point from big-endian UTF-16 bytes at pArgs->source.
    *
    * Returns the code point and advances pArgs->source past it.
    * Special cases visible here:
    * - converter->mode<8: returns UCNV_GET_NEXT_UCHAR_USE_TO_U without touching
    *   the input (presumably this tells the caller to use the toUnicode
    *   function instead - confirm against ucnv_cnv.h).
    * - no input: *err=U_INDEX_OUTOFBOUNDS_ERROR, returns 0xffff.
    * - input ends inside a code unit or inside a surrogate pair: the available
    *   bytes are copied to converter->toUBytes[], *err=U_TRUNCATED_CHAR_FOUND,
    *   returns 0xffff.
    * - unmatched lead or trail surrogate: its two bytes are copied to
    *   converter->toUBytes[], *err=U_ILLEGAL_CHAR_FOUND, returns 0xffff.
    */
   489 static UChar32
   490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
   491     const uint8_t *s, *sourceLimit;
   492     UChar32 c;
   494     if(pArgs->converter->mode<8) {
   495         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   496     }
   498     s=(const uint8_t *)pArgs->source;
   499     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   501     if(s>=sourceLimit) {
   502         /* no input */
   503         *err=U_INDEX_OUTOFBOUNDS_ERROR;
   504         return 0xffff;
   505     }
   507     if(s+2>sourceLimit) {
   508         /* only one byte: truncated UChar */
   509         pArgs->converter->toUBytes[0]=*s++;
   510         pArgs->converter->toULength=1;
   511         pArgs->source=(const char *)s;
   512         *err = U_TRUNCATED_CHAR_FOUND;
   513         return 0xffff;
   514     }
   516     /* get one UChar */
   517     c=((UChar32)*s<<8)|s[1];
   518     s+=2;
   520     /* check for a surrogate pair */
   521     if(U_IS_SURROGATE(c)) {
   522         if(U16_IS_SURROGATE_LEAD(c)) {
   523             if(s+2<=sourceLimit) {
   524                 UChar trail;
   526                 /* get a second UChar and see if it is a trail surrogate */
   527                 trail=((UChar)*s<<8)|s[1];
   528                 if(U16_IS_TRAIL(trail)) {
   529                     c=U16_GET_SUPPLEMENTARY(c, trail);
   530                     s+=2;
   531                 } else {
   532                     /* unmatched lead surrogate */
                           /* -2 is a local marker picked up by the c<0 check below; the following unit is not consumed */
   533                     c=-2;
   534                 }
   535             } else {
   536                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
   537                 uint8_t *bytes=pArgs->converter->toUBytes;
                       /* step back to the lead surrogate so all remaining bytes are copied */
   538                 s-=2;
   539                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
   540                 do {
   541                     *bytes++=*s++;
   542                 } while(s<sourceLimit);
   544                 c=0xffff;
   545                 *err=U_TRUNCATED_CHAR_FOUND;
   546             }
   547         } else {
   548             /* unmatched trail surrogate */
   549             c=-2;
   550         }
   552         if(c<0) {
   553             /* write the unmatched surrogate */
                   /* s points just past the offending unit, so s-2 and s-1 are its two bytes */
   554             uint8_t *bytes=pArgs->converter->toUBytes;
   555             pArgs->converter->toULength=2;
   556             *bytes=*(s-2);
   557             bytes[1]=*(s-1);
   559             c=0xffff;
   560             *err=U_ILLEGAL_CHAR_FOUND;
   561         }
   562     }
   564     pArgs->source=(const char *)s;
   565     return c;
   566 } 
   /*
    * Reset the converter state selected by choice.
    *
    * toUnicode side (choice<=UCNV_RESET_TO_UNICODE; presumably that covers
    * "both" and "to Unicode" - confirm the enum order): cnv->mode becomes 8 for
    * converter version 0 and 0 for any other version.
    * fromUnicode side (choice!=UCNV_RESET_TO_UNICODE): for version 1 only,
    * fromUnicodeStatus is set so that the next fromUnicode call writes a BOM.
    */
   568 static void
   569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
   570     if(choice<=UCNV_RESET_TO_UNICODE) {
   571         /* reset toUnicode state */
   572         if(UCNV_GET_VERSION(cnv)==0) {
   573             cnv->mode=8; /* no BOM handling */
   574         } else {
   575             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
   576         }
   577     }
   578     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
   579         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
   580         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   581     }
   582 }
   /*
    * Open callback for the UTF-16BE converter.
    *
    * Converter versions 0 and 1 are accepted and initialized through
    * _UTF16BEReset(cnv, UCNV_RESET_BOTH); any higher version sets
    * *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR. pArgs is not used.
    */
   584 static void
   585 _UTF16BEOpen(UConverter *cnv,
   586              UConverterLoadArgs *pArgs,
   587              UErrorCode *pErrorCode) {
   588     if(UCNV_GET_VERSION(cnv)<=1) {
   589         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
   590     } else {
   591         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   592     }
   593 }
   /*
    * Return the converter name: "UTF-16BE" for version 0, otherwise the name
    * with the ",version=1" option appended. The strings are literals, so the
    * caller must not modify or free them.
    */
   595 static const char *
   596 _UTF16BEGetName(const UConverter *cnv) {
   597     if(UCNV_GET_VERSION(cnv)==0) {
   598         return "UTF-16BE";
   599     } else {
   600         return "UTF-16BE,version=1";
   601     }
   602 }
   /*
    * Function table for the UTF-16BE converter. The initializer is positional;
    * the slot meanings come from the UConverterImpl declaration, which is not
    * visible in this file (presumably ucnv_cnv.h - confirm before reordering).
    * The toUnicode and fromUnicode functions each appear twice in a row,
    * presumably filling the plain and the with-offsets slot with the same
    * implementation. NULL entries leave a slot unset.
    */
   604 static const UConverterImpl _UTF16BEImpl={
   605     UCNV_UTF16_BigEndian,
   607     NULL,
   608     NULL,
   610     _UTF16BEOpen,
   611     NULL,
   612     _UTF16BEReset,
   614     _UTF16BEToUnicodeWithOffsets,
   615     _UTF16BEToUnicodeWithOffsets,
   616     _UTF16BEFromUnicodeWithOffsets,
   617     _UTF16BEFromUnicodeWithOffsets,
   618     _UTF16BEGetNextUChar,
   620     NULL,
   621     _UTF16BEGetName,
   622     NULL,
   623     NULL,
   624     ucnv_getNonSurrogateUnicodeSet
   625 };
   /*
    * Static description of the UTF-16BE converter. The initializer is
    * positional; field meanings come from the UConverterStaticData declaration,
    * which is not visible here. Presumably: structure size, name, codepage
    * 1200, platform, converter type, min/max bytes per character (2, 2), and a
    * 2-byte substitution sequence FF FD (U+FFFD in big-endian order) -
    * TODO confirm against ucnv_bld.h.
    */
   627 static const UConverterStaticData _UTF16BEStaticData={
   628     sizeof(UConverterStaticData),
   629     "UTF-16BE",
   630     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
   631     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
   632     0,
   633     0,
   634     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   635 };
   /*
    * Shared data object for UTF-16BE; the only non-static symbol in this part
    * of the file. It ties together _UTF16BEStaticData and _UTF16BEImpl. The
    * remaining positional fields are defined by UConverterSharedData, which is
    * not visible here (the ~0 value is presumably a reference counter marking
    * the object as permanent - confirm).
    */
   638 const UConverterSharedData _UTF16BEData={
   639     sizeof(UConverterSharedData), ~((uint32_t) 0),
   640     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, 
   641     0
   642 };
   644 /* UTF-16LE ----------------------------------------------------------------- */
   /*
    * Convert UChars from pArgs->source into little-endian UTF-16 bytes at
    * pArgs->target, writing one source index per output byte to
    * pArgs->offsets when that pointer is not NULL. The low byte of each code
    * unit is written first.
    *
    * Behavior visible in this function:
    * - If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM, the bytes FF FE are
    *   written first and the status is cleared.
    * - A lead surrogate that ends the input is kept in cnv->fromUChar32 and is
    *   combined with the first unit of the next call.
    * - An unmatched surrogate sets *pErrorCode=U_ILLEGAL_CHAR_FOUND and is
    *   left in cnv->fromUChar32.
    * - When a character does not fit completely, its bytes are collected in
    *   overflow[] and passed to ucnv_fromUWriteBytes() (presumably that helper
    *   writes what fits and keeps the rest in the converter - confirm).
    * - U_BUFFER_OVERFLOW_ERROR is set when the target is full while input
    *   remains.
    *
    * pArgs->source, pArgs->target and pArgs->offsets are updated on the normal
    * exit path. The two early returns skip the write-back; the local pointers
    * have not moved at that point.
    */
   646 static void
   647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
   648                                UErrorCode *pErrorCode) {
   649     UConverter *cnv;
   650     const UChar *source;
   651     char *target;
   652     int32_t *offsets;
   654     uint32_t targetCapacity, length, sourceIndex;
   655     UChar c, trail;
   656     char overflow[4];
   658     source=pArgs->source;
           /*
            * NOTE(review): length is uint32_t, so the "<=0" test below is true
            * only for 0; a negative pointer difference would wrap to a large
            * value instead of being rejected - confirm callers never pass
            * sourceLimit<source.
            */
   659     length=(int32_t)(pArgs->sourceLimit-source);
   660     if(length<=0) {
   661         /* no input, nothing to do */
   662         return;
   663     }
   665     cnv=pArgs->converter;
   667     /* write the BOM if necessary */
   668     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
   669         static const char bom[]={ (char)0xff, (char)0xfe };
               /* this call advances pArgs->target/offsets directly; offset value -1 is passed for the BOM bytes */
   670         ucnv_fromUWriteBytes(cnv,
   671                              bom, 2,
   672                              &pArgs->target, pArgs->targetLimit,
   673                              &pArgs->offsets, -1,
   674                              pErrorCode);
   675         cnv->fromUnicodeStatus=0;
   676     }
   678     target=pArgs->target;
   679     if(target >= pArgs->targetLimit) {
   680         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   681         return;
   682     }
   684     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
   685     offsets=pArgs->offsets;
   686     sourceIndex=0;
   688     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
   690     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
   691         /* the last buffer ended with a lead surrogate, output the surrogate pair */
   692         ++source;
   693         --length;
   694         target[0]=(uint8_t)c;
   695         target[1]=(uint8_t)(c>>8);
   696         target[2]=(uint8_t)trail;
   697         target[3]=(uint8_t)(trail>>8);
   698         target+=4;
   699         targetCapacity-=4;
   700         if(offsets!=NULL) {
                   /* all four bytes get offset -1: the pair started in a previous buffer */
   701             *offsets++=-1;
   702             *offsets++=-1;
   703             *offsets++=-1;
   704             *offsets++=-1;
   705         }
               /* one UChar (the trail) of this buffer has been consumed */
   706         sourceIndex=1;
   707         cnv->fromUChar32=c=0;
   708     }
   710     if(c==0) {
   711         /* copy an even number of bytes for complete UChars */
   712         uint32_t count=2*length;
   713         if(count>targetCapacity) {
                   /* round the available capacity down to an even byte count */
   714             count=targetCapacity&~1;
   715         }
   716         /* count is even */
   717         targetCapacity-=count;
               /* from here on count is the number of UChars the loop may copy; length is what remains after them */
   718         count>>=1;
   719         length-=count;
   721         if(offsets==NULL) {
   722             while(count>0) {
   723                 c=*source++;
   724                 if(U16_IS_SINGLE(c)) {
   725                     target[0]=(uint8_t)c;
   726                     target[1]=(uint8_t)(c>>8);
   727                     target+=2;
                       /* count>=2: the trail must also be within the span that is both available and fits */
   728                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   729                     ++source;
   730                     --count;
   731                     target[0]=(uint8_t)c;
   732                     target[1]=(uint8_t)(c>>8);
   733                     target[2]=(uint8_t)trail;
   734                     target[3]=(uint8_t)(trail>>8);
   735                     target+=4;
   736                 } else {
                           /* surrogate that cannot be completed here: leave the loop with c!=0 and count>0 */
   737                     break;
   738                 }
   739                 --count;
   740             }
   741         } else {
                   /* same loop as above, additionally recording one source index per output byte */
   742             while(count>0) {
   743                 c=*source++;
   744                 if(U16_IS_SINGLE(c)) {
   745                     target[0]=(uint8_t)c;
   746                     target[1]=(uint8_t)(c>>8);
   747                     target+=2;
   748                     *offsets++=sourceIndex;
   749                     *offsets++=sourceIndex++;
   750                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
   751                     ++source;
   752                     --count;
   753                     target[0]=(uint8_t)c;
   754                     target[1]=(uint8_t)(c>>8);
   755                     target[2]=(uint8_t)trail;
   756                     target[3]=(uint8_t)(trail>>8);
   757                     target+=4;
   758                     *offsets++=sourceIndex;
   759                     *offsets++=sourceIndex;
   760                     *offsets++=sourceIndex;
   761                     *offsets++=sourceIndex;
   762                     sourceIndex+=2;
   763                 } else {
   764                     break;
   765                 }
   766                 --count;
   767             }
   768         }
   770         if(count==0) {
   771             /* done with the loop for complete UChars */
   772             if(length>0 && targetCapacity>0) {
   773                 /*
   774                  * there is more input and some target capacity -
   775                  * it must be targetCapacity==1 because otherwise
   776                  * the above would have copied more;
   777                  * prepare for overflow output
   778                  */
   779                 if(U16_IS_SINGLE(c=*source++)) {
   780                     overflow[0]=(char)c;
   781                     overflow[1]=(char)(c>>8);
   782                     length=2; /* 2 bytes to output */
   783                     c=0;
   784                 /* } else { keep c for surrogate handling, length will be set there */
   785                 }
   786             } else {
   787                 length=0;
   788                 c=0;
   789             }
   790         } else {
   791             /* keep c for surrogate handling, length will be set there */
                   /* give back the capacity reserved for the UChars that were not written */
   792             targetCapacity+=2*count;
   793         }
   794     } else {
   795         length=0; /* from here on, length counts the bytes in overflow[] */
   796     }
   798     if(c!=0) {
   799         /*
   800          * c is a surrogate, and
   801          * - source or target too short
   802          * - or the surrogate is unmatched
   803          */
   804         length=0;
   805         if(U16_IS_SURROGATE_LEAD(c)) {
   806             if(source<pArgs->sourceLimit) {
   807                 if(U16_IS_TRAIL(trail=*source)) {
   808                     /* output the surrogate pair, will overflow (see conditions comment above) */
   809                     ++source;
   810                     overflow[0]=(char)c;
   811                     overflow[1]=(char)(c>>8);
   812                     overflow[2]=(char)trail;
   813                     overflow[3]=(char)(trail>>8);
   814                     length=4; /* 4 bytes to output */
   815                     c=0;
   816                 } else {
   817                     /* unmatched lead surrogate */
   818                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   819                 }
   820             } else {
   821                 /* see if the trail surrogate is in the next buffer */
   822             }
   823         } else {
   824             /* unmatched trail surrogate */
   825             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   826         }
               /* stores 0 after a completed pair, otherwise the pending or offending surrogate */
   827         cnv->fromUChar32=c;
   828     }
   830     if(length>0) {
   831         /* output length bytes with overflow (length>targetCapacity>0) */
   832         ucnv_fromUWriteBytes(cnv,
   833                              overflow, length,
   834                              &target, pArgs->targetLimit,
   835                              &offsets, sourceIndex,
   836                              pErrorCode);
   837         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
   838     }
   840     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
   841         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   842     }
   844     /* write back the updated pointers */
   845     pArgs->source=source;
   846     pArgs->target=target;
   847     pArgs->offsets=offsets;
   848 }
   850 static void
   851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   852                              UErrorCode *pErrorCode) {
   853     UConverter *cnv;
   854     const uint8_t *source;
   855     UChar *target;
   856     int32_t *offsets;
   858     uint32_t targetCapacity, length, count, sourceIndex;
   859     UChar c, trail;
   861     if(pArgs->converter->mode<8) {
   862         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
   863         return;
   864     }
   866     cnv=pArgs->converter;
   867     source=(const uint8_t *)pArgs->source;
   868     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
   869     if(length<=0 && cnv->toUnicodeStatus==0) {
   870         /* no input, nothing to do */
   871         return;
   872     }
   874     target=pArgs->target;
   875     if(target >= pArgs->targetLimit) {
   876         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   877         return;
   878     }
   880     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
   881     offsets=pArgs->offsets;
   882     sourceIndex=0;
   883     c=0;
   885     /* complete a partial UChar or pair from the last call */
   886     if(cnv->toUnicodeStatus!=0) {
   887         /*
   888          * special case: single byte from a previous buffer,
   889          * where the byte turned out not to belong to a trail surrogate
   890          * and the preceding, unmatched lead surrogate was put into toUBytes[]
   891          * for error handling
   892          */
   893         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
   894         cnv->toULength=1;
   895         cnv->toUnicodeStatus=0;
   896     }
   897     if((count=cnv->toULength)!=0) {
   898         uint8_t *p=cnv->toUBytes;
   899         do {
   900             p[count++]=*source++;
   901             ++sourceIndex;
   902             --length;
   903             if(count==2) {
   904                 c=((UChar)p[1]<<8)|p[0];
   905                 if(U16_IS_SINGLE(c)) {
   906                     /* output the BMP code point */
   907                     *target++=c;
   908                     if(offsets!=NULL) {
   909                         *offsets++=-1;
   910                     }
   911                     --targetCapacity;
   912                     count=0;
   913                     c=0;
   914                     break;
   915                 } else if(U16_IS_SURROGATE_LEAD(c)) {
   916                     /* continue collecting bytes for the trail surrogate */
   917                     c=0; /* avoid unnecessary surrogate handling below */
   918                 } else {
   919                     /* fall through to error handling for an unmatched trail surrogate */
   920                     break;
   921                 }
   922             } else if(count==4) {
   923                 c=((UChar)p[1]<<8)|p[0];
   924                 trail=((UChar)p[3]<<8)|p[2];
   925                 if(U16_IS_TRAIL(trail)) {
   926                     /* output the surrogate pair */
   927                     *target++=c;
   928                     if(targetCapacity>=2) {
   929                         *target++=trail;
   930                         if(offsets!=NULL) {
   931                             *offsets++=-1;
   932                             *offsets++=-1;
   933                         }
   934                         targetCapacity-=2;
   935                     } else /* targetCapacity==1 */ {
   936                         targetCapacity=0;
   937                         cnv->UCharErrorBuffer[0]=trail;
   938                         cnv->UCharErrorBufferLength=1;
   939                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   940                     }
   941                     count=0;
   942                     c=0;
   943                     break;
   944                 } else {
   945                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
   946                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   948                     /* back out reading the code unit after it */
   949                     if(((const uint8_t *)pArgs->source-source)>=2) {
   950                         source-=2;
   951                     } else {
   952                         /*
   953                          * if the trail unit's first byte was in a previous buffer, then
   954                          * we need to put it into a special place because toUBytes[] will be
   955                          * used for the lead unit's bytes
   956                          */
   957                         cnv->toUnicodeStatus=0x100|p[2];
   958                         --source;
   959                     }
   960                     cnv->toULength=2;
   962                     /* write back the updated pointers */
   963                     pArgs->source=(const char *)source;
   964                     pArgs->target=target;
   965                     pArgs->offsets=offsets;
   966                     return;
   967                 }
   968             }
   969         } while(length>0);
   970         cnv->toULength=(int8_t)count;
   971     }
   973     /* copy an even number of bytes for complete UChars */
   974     count=2*targetCapacity;
   975     if(count>length) {
   976         count=length&~1;
   977     }
   978     if(c==0 && count>0) {
   979         length-=count;
   980         count>>=1;
   981         targetCapacity-=count;
   982         if(offsets==NULL) {
   983             do {
   984                 c=((UChar)source[1]<<8)|source[0];
   985                 source+=2;
   986                 if(U16_IS_SINGLE(c)) {
   987                     *target++=c;
   988                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   989                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
   990                 ) {
   991                     source+=2;
   992                     --count;
   993                     *target++=c;
   994                     *target++=trail;
   995                 } else {
   996                     break;
   997                 }
   998             } while(--count>0);
   999         } else {
  1000             do {
  1001                 c=((UChar)source[1]<<8)|source[0];
  1002                 source+=2;
  1003                 if(U16_IS_SINGLE(c)) {
  1004                     *target++=c;
  1005                     *offsets++=sourceIndex;
  1006                     sourceIndex+=2;
  1007                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
  1008                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
  1009                 ) {
  1010                     source+=2;
  1011                     --count;
  1012                     *target++=c;
  1013                     *target++=trail;
  1014                     *offsets++=sourceIndex;
  1015                     *offsets++=sourceIndex;
  1016                     sourceIndex+=4;
  1017                 } else {
  1018                     break;
  1020             } while(--count>0);
  1023         if(count==0) {
  1024             /* done with the loop for complete UChars */
  1025             c=0;
  1026         } else {
  1027             /* keep c for surrogate handling, trail will be set there */
  1028             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
  1029             targetCapacity+=count;
  1033     if(c!=0) {
  1034         /*
  1035          * c is a surrogate, and
  1036          * - source or target too short
  1037          * - or the surrogate is unmatched
  1038          */
  1039         cnv->toUBytes[0]=(uint8_t)c;
  1040         cnv->toUBytes[1]=(uint8_t)(c>>8);
  1041         cnv->toULength=2;
  1043         if(U16_IS_SURROGATE_LEAD(c)) {
  1044             if(length>=2) {
  1045                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
  1046                     /* output the surrogate pair, will overflow (see conditions comment above) */
  1047                     source+=2;
  1048                     length-=2;
  1049                     *target++=c;
  1050                     if(offsets!=NULL) {
  1051                         *offsets++=sourceIndex;
  1053                     cnv->UCharErrorBuffer[0]=trail;
  1054                     cnv->UCharErrorBufferLength=1;
  1055                     cnv->toULength=0;
  1056                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1057                 } else {
  1058                     /* unmatched lead surrogate */
  1059                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1061             } else {
  1062                 /* see if the trail surrogate is in the next buffer */
  1064         } else {
  1065             /* unmatched trail surrogate */
  1066             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
  1070     if(U_SUCCESS(*pErrorCode)) {
  1071         /* check for a remaining source byte */
  1072         if(length>0) {
  1073             if(targetCapacity==0) {
  1074                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
  1075             } else {
  1076                 /* it must be length==1 because otherwise the above would have copied more */
  1077                 cnv->toUBytes[cnv->toULength++]=*source++;
  1082     /* write back the updated pointers */
  1083     pArgs->source=(const char *)source;
  1084     pArgs->target=target;
  1085     pArgs->offsets=offsets;
  1088 static UChar32
  1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
  1090     const uint8_t *s, *sourceLimit;
  1091     UChar32 c;
  1093     if(pArgs->converter->mode<8) {
  1094         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1097     s=(const uint8_t *)pArgs->source;
  1098     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
  1100     if(s>=sourceLimit) {
  1101         /* no input */
  1102         *err=U_INDEX_OUTOFBOUNDS_ERROR;
  1103         return 0xffff;
  1106     if(s+2>sourceLimit) {
  1107         /* only one byte: truncated UChar */
  1108         pArgs->converter->toUBytes[0]=*s++;
  1109         pArgs->converter->toULength=1;
  1110         pArgs->source=(const char *)s;
  1111         *err = U_TRUNCATED_CHAR_FOUND;
  1112         return 0xffff;
  1115     /* get one UChar */
  1116     c=((UChar32)s[1]<<8)|*s;
  1117     s+=2;
  1119     /* check for a surrogate pair */
  1120     if(U_IS_SURROGATE(c)) {
  1121         if(U16_IS_SURROGATE_LEAD(c)) {
  1122             if(s+2<=sourceLimit) {
  1123                 UChar trail;
  1125                 /* get a second UChar and see if it is a trail surrogate */
  1126                 trail=((UChar)s[1]<<8)|*s;
  1127                 if(U16_IS_TRAIL(trail)) {
  1128                     c=U16_GET_SUPPLEMENTARY(c, trail);
  1129                     s+=2;
  1130                 } else {
  1131                     /* unmatched lead surrogate */
  1132                     c=-2;
  1134             } else {
  1135                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
  1136                 uint8_t *bytes=pArgs->converter->toUBytes;
  1137                 s-=2;
  1138                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
  1139                 do {
  1140                     *bytes++=*s++;
  1141                 } while(s<sourceLimit);
  1143                 c=0xffff;
  1144                 *err=U_TRUNCATED_CHAR_FOUND;
  1146         } else {
  1147             /* unmatched trail surrogate */
  1148             c=-2;
  1151         if(c<0) {
  1152             /* write the unmatched surrogate */
  1153             uint8_t *bytes=pArgs->converter->toUBytes;
  1154             pArgs->converter->toULength=2;
  1155             *bytes=*(s-2);
  1156             bytes[1]=*(s-1);
  1158             c=0xffff;
  1159             *err=U_ILLEGAL_CHAR_FOUND;
  1163     pArgs->source=(const char *)s;
  1164     return c;
  1167 static void
  1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
  1169     if(choice<=UCNV_RESET_TO_UNICODE) {
  1170         /* reset toUnicode state */
  1171         if(UCNV_GET_VERSION(cnv)==0) {
  1172             cnv->mode=8; /* no BOM handling */
  1173         } else {
  1174             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
  1177     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
  1178         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
  1179         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1183 static void
  1184 _UTF16LEOpen(UConverter *cnv,
  1185              UConverterLoadArgs *pArgs,
  1186              UErrorCode *pErrorCode) {
  1187     if(UCNV_GET_VERSION(cnv)<=1) {
  1188         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
  1189     } else {
  1190         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1194 static const char *
  1195 _UTF16LEGetName(const UConverter *cnv) {
  1196     if(UCNV_GET_VERSION(cnv)==0) {
  1197         return "UTF-16LE";
  1198     } else {
  1199         return "UTF-16LE,version=1";
  1203 static const UConverterImpl _UTF16LEImpl={
  1204     UCNV_UTF16_LittleEndian,
  1206     NULL,
  1207     NULL,
  1209     _UTF16LEOpen,
  1210     NULL,
  1211     _UTF16LEReset,
  1213     _UTF16LEToUnicodeWithOffsets,
  1214     _UTF16LEToUnicodeWithOffsets,
  1215     _UTF16LEFromUnicodeWithOffsets,
  1216     _UTF16LEFromUnicodeWithOffsets,
  1217     _UTF16LEGetNextUChar,
  1219     NULL,
  1220     _UTF16LEGetName,
  1221     NULL,
  1222     NULL,
  1223     ucnv_getNonSurrogateUnicodeSet
  1224 };
  1227 static const UConverterStaticData _UTF16LEStaticData={
  1228     sizeof(UConverterStaticData),
  1229     "UTF-16LE",
  1230     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
  1231     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
  1232     0,
  1233     0,
  1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1235 };
  1238 const UConverterSharedData _UTF16LEData={
  1239     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1240     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 
  1242 };
/* UTF-16 (Detect BOM) ------------------------------------------------------ */

/*
 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
 * accordingly.
 * This is a simpler version of the UTF-32 converter, with
 * fewer states for shorter BOMs.
 *
 * State values:
 * 0    initial state
 * 1    saw first byte
 * 2..5 -
 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
 * 8    UTF-16BE mode
 * 9    UTF-16LE mode
 *
 * During detection: state==number of initial bytes seen so far.
 *
 * On output, emit U+FEFF as the first code point.
 *
 * Variants:
 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
 *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
 */
  1270 static void
  1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
  1272     if(choice<=UCNV_RESET_TO_UNICODE) {
  1273         /* reset toUnicode: state=0 */
  1274         cnv->mode=0;
  1276     if(choice!=UCNV_RESET_TO_UNICODE) {
  1277         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
  1278         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
  1282 static const UConverterSharedData _UTF16v2Data;
  1284 static void
  1285 _UTF16Open(UConverter *cnv,
  1286            UConverterLoadArgs *pArgs,
  1287            UErrorCode *pErrorCode) {
  1288     if(UCNV_GET_VERSION(cnv)<=2) {
  1289         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
  1290             /*
  1291              * Switch implementation, and switch the staticData that's different
  1292              * and was copied into the UConverter.
  1293              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
  1294              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
  1295              */
  1296             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
  1297             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
  1299         _UTF16Reset(cnv, UCNV_RESET_BOTH);
  1300     } else {
  1301         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
  1305 static const char *
  1306 _UTF16GetName(const UConverter *cnv) {
  1307     if(UCNV_GET_VERSION(cnv)==0) {
  1308         return "UTF-16";
  1309     } else if(UCNV_GET_VERSION(cnv)==1) {
  1310         return "UTF-16,version=1";
  1311     } else {
  1312         return "UTF-16,version=2";
  1316 const UConverterSharedData _UTF16Data;
  1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
  1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
  1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
  1322 static void
  1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
  1324                            UErrorCode *pErrorCode) {
  1325     UConverter *cnv=pArgs->converter;
  1326     const char *source=pArgs->source;
  1327     const char *sourceLimit=pArgs->sourceLimit;
  1328     int32_t *offsets=pArgs->offsets;
  1330     int32_t state, offsetDelta;
  1331     uint8_t b;
  1333     state=cnv->mode;
  1335     /*
  1336      * If we detect a BOM in this buffer, then we must add the BOM size to the
  1337      * offsets because the actual converter function will not see and count the BOM.
  1338      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
  1339      */
  1340     offsetDelta=0;
  1342     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
  1343         switch(state) {
  1344         case 0:
  1345             cnv->toUBytes[0]=(uint8_t)*source++;
  1346             cnv->toULength=1;
  1347             state=1;
  1348             break;
  1349         case 1:
  1350             /*
  1351              * Only inside this switch case can the state variable
  1352              * temporarily take two additional values:
  1353              * 6: BOM error, continue with BE
  1354              * 7: BOM error, continue with LE
  1355              */
  1356             b=*source;
  1357             if(cnv->toUBytes[0]==0xfe && b==0xff) {
  1358                 if(IS_UTF16LE(cnv)) {
  1359                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
  1360                 } else {
  1361                     state=8; /* detect UTF-16BE */
  1363             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
  1364                 if(IS_UTF16BE(cnv)) {
  1365                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
  1366                 } else {
  1367                     state=9; /* detect UTF-16LE */
  1369             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
  1370                 state=6; /* illegal missing BOM for Java "Unicode" */
  1372             if(state>=8) {
  1373                 /* BOM detected, consume it */
  1374                 ++source;
  1375                 cnv->toULength=0;
  1376                 offsetDelta=(int32_t)(source-pArgs->source);
  1377             } else if(state<6) {
  1378                 /* ok: no BOM, and not a reverse BOM */
  1379                 if(source!=pArgs->source) {
  1380                     /* reset the source for a correct first offset */
  1381                     source=pArgs->source;
  1382                     cnv->toULength=0;
  1384                 if(IS_UTF16LE(cnv)) {
  1385                     /* Make Java "UnicodeLittle" default to LE. */
  1386                     state=9;
  1387                 } else {
  1388                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
  1389                     state=8;
  1391             } else {
  1392                 /*
  1393                  * error: missing BOM, or reverse BOM
  1394                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
  1395                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
  1396                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
  1397                  */
  1398                 /* report the non-BOM or reverse BOM as an illegal sequence */
  1399                 cnv->toUBytes[1]=b;
  1400                 cnv->toULength=2;
  1401                 pArgs->source=source+1;
  1402                 /* continue with conversion if the callback resets the error */
  1403                 /*
  1404                  * Make Java "Unicode" default to BE like standard UTF-16.
  1405                  * Make Java "UnicodeBig" and "UnicodeLittle" default
  1406                  * to their normal endiannesses.
  1407                  */
  1408                 cnv->mode=state+2;
  1409                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
  1410                 return;
  1412             /* convert the rest of the stream */
  1413             cnv->mode=state;
  1414             continue;
  1415         case 8:
  1416             /* call UTF-16BE */
  1417             pArgs->source=source;
  1418             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1419             source=pArgs->source;
  1420             break;
  1421         case 9:
  1422             /* call UTF-16LE */
  1423             pArgs->source=source;
  1424             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1425             source=pArgs->source;
  1426             break;
  1427         default:
  1428             break; /* does not occur */
  1432     /* add BOM size to offsets - see comment at offsetDelta declaration */
  1433     if(offsets!=NULL && offsetDelta!=0) {
  1434         int32_t *offsetsLimit=pArgs->offsets;
  1435         while(offsets<offsetsLimit) {
  1436             *offsets++ += offsetDelta;
  1440     pArgs->source=source;
  1442     if(source==sourceLimit && pArgs->flush) {
  1443         /* handle truncated input */
  1444         switch(state) {
  1445         case 0:
  1446             break; /* no input at all, nothing to do */
  1447         case 8:
  1448             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
  1449             break;
  1450         case 9:
  1451             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
  1452             break;
  1453         default:
  1454             /* 0<state<8: framework will report truncation, nothing to do here */
  1455             break;
  1459     cnv->mode=state;
  1462 static UChar32
  1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
  1464                    UErrorCode *pErrorCode) {
  1465     switch(pArgs->converter->mode) {
  1466     case 8:
  1467         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
  1468     case 9:
  1469         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
  1470     default:
  1471         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
  1475 static const UConverterImpl _UTF16Impl = {
  1476     UCNV_UTF16,
  1478     NULL,
  1479     NULL,
  1481     _UTF16Open,
  1482     NULL,
  1483     _UTF16Reset,
  1485     _UTF16ToUnicodeWithOffsets,
  1486     _UTF16ToUnicodeWithOffsets,
  1487     _UTF16PEFromUnicodeWithOffsets,
  1488     _UTF16PEFromUnicodeWithOffsets,
  1489     _UTF16GetNextUChar,
  1491     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
  1492     _UTF16GetName,
  1493     NULL,
  1494     NULL,
  1495     ucnv_getNonSurrogateUnicodeSet
  1496 };
  1498 static const UConverterStaticData _UTF16StaticData = {
  1499     sizeof(UConverterStaticData),
  1500     "UTF-16",
  1501     1204, /* CCSID for BOM sensitive UTF-16 */
  1502     UCNV_IBM, UCNV_UTF16, 2, 2,
  1503 #if U_IS_BIG_ENDIAN
  1504     { 0xff, 0xfd, 0, 0 }, 2,
  1505 #else
  1506     { 0xfd, 0xff, 0, 0 }, 2,
  1507 #endif
  1508     FALSE, FALSE,
  1509     0,
  1510     0,
  1511     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1512 };
  1514 const UConverterSharedData _UTF16Data = {
  1515     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1516     NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, 
  1518 };
  1520 static const UConverterImpl _UTF16v2Impl = {
  1521     UCNV_UTF16,
  1523     NULL,
  1524     NULL,
  1526     _UTF16Open,
  1527     NULL,
  1528     _UTF16Reset,
  1530     _UTF16ToUnicodeWithOffsets,
  1531     _UTF16ToUnicodeWithOffsets,
  1532     _UTF16BEFromUnicodeWithOffsets,
  1533     _UTF16BEFromUnicodeWithOffsets,
  1534     _UTF16GetNextUChar,
  1536     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
  1537     _UTF16GetName,
  1538     NULL,
  1539     NULL,
  1540     ucnv_getNonSurrogateUnicodeSet
  1541 };
  1543 static const UConverterStaticData _UTF16v2StaticData = {
  1544     sizeof(UConverterStaticData),
  1545     "UTF-16,version=2",
  1546     1204, /* CCSID for BOM sensitive UTF-16 */
  1547     UCNV_IBM, UCNV_UTF16, 2, 2,
  1548     { 0xff, 0xfd, 0, 0 }, 2,
  1549     FALSE, FALSE,
  1550     0,
  1551     0,
  1552     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
  1553 };
  1555 static const UConverterSharedData _UTF16v2Data = {
  1556     sizeof(UConverterSharedData), ~((uint32_t) 0),
  1557     NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, 
  1559 };
  1561 #endif

mercurial