The Tor Browser: comparison intl/icu/source/common/ucnv

--1:000000000000
+:7cf82970fd74
+/*
+**********************************************************************
+*   Copyright (C) 2002-2010, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  ucnv_u16.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2002jul01
+*   created by: Markus W. Scherer
+*
+*   UTF-16 converter implementation. Used to be in ucnv_utf.c.
+*/
+#include "unicode/utypes.h"
+#if !UCONFIG_NO_CONVERSION
+#include "unicode/ucnv.h"
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+#include "cmemory.h"
+enum {
+UCNV_NEED_TO_WRITE_BOM=1 /* value of cnv->fromUnicodeStatus while the BOM still has to be written; the fromUnicode functions below write the BOM once and then store 0 */
+};
+/*
+* The UTF-16 toUnicode implementation is also used for the Java-specific
+* "with BOM" variants of UTF-16BE and UTF-16LE.
+* It is declared here because the UTF-16BE and UTF-16LE toUnicode functions
+* forward to it while converter->mode<8, and it is defined after them.
+*/
+static void
+_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+UErrorCode *pErrorCode);
+/* UTF-16BE ----------------------------------------------------------------- */
+#if U_IS_BIG_ENDIAN /* _UTF16PEFromUnicodeWithOffsets names the BE or LE fromUnicode function depending on U_IS_BIG_ENDIAN; it is used in _UTF16Impl */
+#   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
+#else
+#   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
+#endif /* U_IS_BIG_ENDIAN */
+static void /*
+* Convert UTF-16 code units from pArgs->source into big-endian byte pairs at
+* pArgs->target; when pArgs->offsets is not NULL, one source index is recorded
+* per output byte.
+* If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM the bytes FE FF are written first.
+* A lead surrogate at the very end of the input is remembered in cnv->fromUChar32
+* and combined with the first unit of the next call.
+* *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when input remains but the target
+* is full, and to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate (which is left
+* in cnv->fromUChar32).
+* On the normal path the advanced source/target/offsets pointers are stored back
+* into pArgs at the end of the function.
+*/
+_UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+UConverter *cnv;
+const UChar *source;
+char *target;
+int32_t *offsets;
+uint32_t targetCapacity, length, sourceIndex; /* length: UChars left in the source; later reused as the byte count in overflow[] */
+UChar c, trail;
+char overflow[4]; /* bytes of one code point that may not fit completely into the target */
+source=pArgs->source;
+length=(int32_t)(pArgs->sourceLimit-source);
+if(length<=0) { /* length is unsigned, so this is effectively length==0 */
+/* no input, nothing to do */
+return;
+}
+cnv=pArgs->converter;
+/* write the BOM if necessary */
+if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ (char)0xfe, (char)0xff }; /* U+FEFF, high byte first */
+ucnv_fromUWriteBytes(cnv,
+bom, 2,
+&pArgs->target, pArgs->targetLimit,
+&pArgs->offsets, -1,
+pErrorCode);
+cnv->fromUnicodeStatus=0; /* the BOM is written only once */
+}
+target=pArgs->target; /* read only now: the BOM write above may have advanced pArgs->target */
+if(target >= pArgs->targetLimit) {
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+return;
+}
+targetCapacity=(uint32_t)(pArgs->targetLimit-target); /* free bytes in the target */
+offsets=pArgs->offsets;
+sourceIndex=0;
+/* c!=0 indicates in several places outside the main loops that a surrogate was found */
+if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
+/* the last buffer ended with a lead surrogate, output the surrogate pair */
+++source;
+--length;
+target[0]=(uint8_t)(c>>8);
+target[1]=(uint8_t)c;
+target[2]=(uint8_t)(trail>>8);
+target[3]=(uint8_t)trail;
+target+=4;
+targetCapacity-=4;
+if(offsets!=NULL) {
+*offsets++=-1; /* -1: the lead surrogate came from a previous buffer */
+*offsets++=-1;
+*offsets++=-1;
+*offsets++=-1;
+}
+sourceIndex=1;
+cnv->fromUChar32=c=0;
+}
+if(c==0) {
+/* copy an even number of bytes for complete UChars */
+uint32_t count=2*length;
+if(count>targetCapacity) {
+count=targetCapacity&~1;
+}
+/* count is even */
+targetCapacity-=count;
+count>>=1; /* from here on count is the number of UChars the loops may convert */
+length-=count;
+if(offsets==NULL) {
+while(count>0) {
+c=*source++;
+if(U16_IS_SINGLE(c)) {
+target[0]=(uint8_t)(c>>8);
+target[1]=(uint8_t)c;
+target+=2;
+} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+++source;
+--count; /* the pair uses two UChars; the second decrement is at the end of the loop body */
+target[0]=(uint8_t)(c>>8);
+target[1]=(uint8_t)c;
+target[2]=(uint8_t)(trail>>8);
+target[3]=(uint8_t)trail;
+target+=4;
+} else {
+break; /* c keeps the surrogate for the handling after the loops */
+}
+--count;
+}
+} else {
+while(count>0) { /* same loop as above, additionally recording one offset per output byte */
+c=*source++;
+if(U16_IS_SINGLE(c)) {
+target[0]=(uint8_t)(c>>8);
+target[1]=(uint8_t)c;
+target+=2;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex++;
+} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+++source;
+--count;
+target[0]=(uint8_t)(c>>8);
+target[1]=(uint8_t)c;
+target[2]=(uint8_t)(trail>>8);
+target[3]=(uint8_t)trail;
+target+=4;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+sourceIndex+=2;
+} else {
+break;
+}
+--count;
+}
+}
+if(count==0) {
+/* done with the loop for complete UChars */
+if(length>0 && targetCapacity>0) {
+/*
+* there is more input and some target capacity -
+* it must be targetCapacity==1 because otherwise
+* the above would have copied more;
+* prepare for overflow output
+*/
+if(U16_IS_SINGLE(c=*source++)) {
+overflow[0]=(char)(c>>8);
+overflow[1]=(char)c;
+length=2; /* 2 bytes to output */
+c=0;
+/* } else { keep c for surrogate handling, length will be set there */
+}
+} else {
+length=0;
+c=0;
+}
+} else {
+/* keep c for surrogate handling, length will be set there */
+targetCapacity+=2*count; /* give back the capacity reserved for the UChars that were not converted */
+}
+} else {
+length=0; /* from here on, length counts the bytes in overflow[] */
+}
+if(c!=0) {
+/*
+* c is a surrogate, and
+* - source or target too short
+* - or the surrogate is unmatched
+*/
+length=0;
+if(U16_IS_SURROGATE_LEAD(c)) {
+if(source<pArgs->sourceLimit) {
+if(U16_IS_TRAIL(trail=*source)) {
+/* output the surrogate pair, will overflow (see conditions comment above) */
+++source;
+overflow[0]=(char)(c>>8);
+overflow[1]=(char)c;
+overflow[2]=(char)(trail>>8);
+overflow[3]=(char)trail;
+length=4; /* 4 bytes to output */
+c=0;
+} else {
+/* unmatched lead surrogate */
+*pErrorCode=U_ILLEGAL_CHAR_FOUND;
+}
+} else {
+/* see if the trail surrogate is in the next buffer */
+}
+} else {
+/* unmatched trail surrogate */
+*pErrorCode=U_ILLEGAL_CHAR_FOUND;
+}
+cnv->fromUChar32=c; /* 0 after a completed pair; otherwise the pending or unmatched surrogate */
+}
+if(length>0) {
+/* output length bytes with overflow (length>targetCapacity>0) */
+ucnv_fromUWriteBytes(cnv,
+overflow, length,
+(char **)&target, pArgs->targetLimit,
+&offsets, sourceIndex,
+pErrorCode);
+targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
+}
+if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR; /* input is left but the target is full */
+}
+/* write back the updated pointers */
+pArgs->source=source;
+pArgs->target=(char *)target;
+pArgs->offsets=offsets;
+}
+/*
+* Convert big-endian UTF-16 bytes from pArgs->source into UChars at pArgs->target;
+* when pArgs->offsets is not NULL, one source index is recorded per output UChar.
+*
+* While converter->mode<8 (BOM handling not finished, see the state table in the
+* "UTF-16 (Detect BOM)" section) the call is forwarded to _UTF16ToUnicodeWithOffsets().
+*
+* State carried between calls:
+* - cnv->toUBytes[]/cnv->toULength: bytes of an incomplete code unit or surrogate pair
+* - cnv->toUnicodeStatus: 0x100|byte for one pending byte that could not stay in
+*   toUBytes[] because toUBytes[] was needed to report an unmatched lead surrogate
+* - cnv->UCharErrorBuffer[]: a trail surrogate that did not fit into the target
+*
+* *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
+* U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, whose bytes are left in toUBytes[].
+* The advanced source/target/offsets pointers are stored back into pArgs, except on
+* the two early returns for "no input" and "no target space".
+*/
+static void
+_UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+                             UErrorCode *pErrorCode) {
+    UConverter *cnv;
+    const uint8_t *source;
+    UChar *target;
+    int32_t *offsets;
+    uint32_t targetCapacity, length, count, sourceIndex;
+    UChar c, trail;
+    if(pArgs->converter->mode<8) {
+        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
+        return;
+    }
+    cnv=pArgs->converter;
+    source=(const uint8_t *)pArgs->source;
+    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
+    /* length is unsigned, so length<=0 is effectively length==0 */
+    if(length<=0 && cnv->toUnicodeStatus==0) {
+        /* no input, nothing to do */
+        return;
+    }
+    target=pArgs->target;
+    if(target >= pArgs->targetLimit) {
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        return;
+    }
+    targetCapacity=(uint32_t)(pArgs->targetLimit-target);
+    offsets=pArgs->offsets;
+    sourceIndex=0;
+    c=0;
+    /* complete a partial UChar or pair from the last call */
+    if(cnv->toUnicodeStatus!=0) {
+        /*
+        * special case: single byte from a previous buffer,
+        * where the byte turned out not to belong to a trail surrogate
+        * and the preceding, unmatched lead surrogate was put into toUBytes[]
+        * for error handling
+        */
+        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
+        cnv->toULength=1;
+        cnv->toUnicodeStatus=0;
+    }
+    if((count=cnv->toULength)!=0) {
+        uint8_t *p=cnv->toUBytes;
+        /*
+        * Test length before reading: this function can be entered with an empty
+        * source when only toUnicodeStatus is pending (see the check above).
+        * Reading first would access memory at sourceLimit and wrap the unsigned
+        * length counter. With no input the pending bytes simply stay in toUBytes[].
+        */
+        while(length>0) {
+            p[count++]=*source++;
+            ++sourceIndex;
+            --length;
+            if(count==2) {
+                c=((UChar)p[0]<<8)|p[1];
+                if(U16_IS_SINGLE(c)) {
+                    /* output the BMP code point */
+                    *target++=c;
+                    if(offsets!=NULL) {
+                        *offsets++=-1;
+                    }
+                    --targetCapacity;
+                    count=0;
+                    c=0;
+                    break;
+                } else if(U16_IS_SURROGATE_LEAD(c)) {
+                    /* continue collecting bytes for the trail surrogate */
+                    c=0; /* avoid unnecessary surrogate handling below */
+                } else {
+                    /* fall through to error handling for an unmatched trail surrogate */
+                    break;
+                }
+            } else if(count==4) {
+                c=((UChar)p[0]<<8)|p[1];
+                trail=((UChar)p[2]<<8)|p[3];
+                if(U16_IS_TRAIL(trail)) {
+                    /* output the surrogate pair */
+                    *target++=c;
+                    if(targetCapacity>=2) {
+                        *target++=trail;
+                        if(offsets!=NULL) {
+                            *offsets++=-1;
+                            *offsets++=-1;
+                        }
+                        targetCapacity-=2;
+                    } else /* targetCapacity==1 */ {
+                        targetCapacity=0;
+                        cnv->UCharErrorBuffer[0]=trail;
+                        cnv->UCharErrorBufferLength=1;
+                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                    }
+                    count=0;
+                    c=0;
+                    break;
+                } else {
+                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    /*
+                    * back out reading the code unit after it:
+                    * if both of its bytes were read from the current buffer
+                    * (at least 2 bytes consumed since pArgs->source), just rewind
+                    */
+                    if((source-(const uint8_t *)pArgs->source)>=2) {
+                        source-=2;
+                    } else {
+                        /*
+                        * if the trail unit's first byte was in a previous buffer, then
+                        * we need to put it into a special place because toUBytes[] will be
+                        * used for the lead unit's bytes
+                        */
+                        cnv->toUnicodeStatus=0x100|p[2];
+                        --source;
+                    }
+                    cnv->toULength=2;
+                    /* write back the updated pointers */
+                    pArgs->source=(const char *)source;
+                    pArgs->target=target;
+                    pArgs->offsets=offsets;
+                    return;
+                }
+            }
+        }
+        cnv->toULength=(int8_t)count;
+    }
+    /* copy an even number of bytes for complete UChars */
+    count=2*targetCapacity;
+    if(count>length) {
+        count=length&~1;
+    }
+    if(c==0 && count>0) {
+        length-=count;
+        count>>=1; /* from here on count is the number of UChars the loops may produce */
+        targetCapacity-=count;
+        if(offsets==NULL) {
+            do {
+                c=((UChar)source[0]<<8)|source[1];
+                source+=2;
+                if(U16_IS_SINGLE(c)) {
+                    *target++=c;
+                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
+                ) {
+                    source+=2;
+                    --count;
+                    *target++=c;
+                    *target++=trail;
+                } else {
+                    break;
+                }
+            } while(--count>0);
+        } else {
+            /* same loop, additionally recording one offset per output UChar */
+            do {
+                c=((UChar)source[0]<<8)|source[1];
+                source+=2;
+                if(U16_IS_SINGLE(c)) {
+                    *target++=c;
+                    *offsets++=sourceIndex;
+                    sourceIndex+=2;
+                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+                          U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
+                ) {
+                    source+=2;
+                    --count;
+                    *target++=c;
+                    *target++=trail;
+                    *offsets++=sourceIndex;
+                    *offsets++=sourceIndex;
+                    sourceIndex+=4;
+                } else {
+                    break;
+                }
+            } while(--count>0);
+        }
+        if(count==0) {
+            /* done with the loop for complete UChars */
+            c=0;
+        } else {
+            /* keep c for surrogate handling, trail will be set there */
+            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
+            targetCapacity+=count;
+        }
+    }
+    if(c!=0) {
+        /*
+        * c is a surrogate, and
+        * - source or target too short
+        * - or the surrogate is unmatched
+        */
+        cnv->toUBytes[0]=(uint8_t)(c>>8);
+        cnv->toUBytes[1]=(uint8_t)c;
+        cnv->toULength=2;
+        if(U16_IS_SURROGATE_LEAD(c)) {
+            if(length>=2) {
+                if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
+                    /* output the surrogate pair, will overflow (see conditions comment above) */
+                    source+=2;
+                    length-=2;
+                    *target++=c;
+                    if(offsets!=NULL) {
+                        *offsets++=sourceIndex;
+                    }
+                    cnv->UCharErrorBuffer[0]=trail;
+                    cnv->UCharErrorBufferLength=1;
+                    cnv->toULength=0;
+                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                } else {
+                    /* unmatched lead surrogate */
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                }
+            } else {
+                /* see if the trail surrogate is in the next buffer */
+            }
+        } else {
+            /* unmatched trail surrogate */
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+        }
+    }
+    if(U_SUCCESS(*pErrorCode)) {
+        /* check for a remaining source byte */
+        if(length>0) {
+            if(targetCapacity==0) {
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            } else {
+                /* it must be length==1 because otherwise the above would have copied more */
+                cnv->toUBytes[cnv->toULength++]=*source++;
+            }
+        }
+    }
+    /* write back the updated pointers */
+    pArgs->source=(const char *)source;
+    pArgs->target=target;
+    pArgs->offsets=offsets;
+}
+/*
+* Read a single code point from big-endian UTF-16 input.
+*
+* Returns UCNV_GET_NEXT_UCHAR_USE_TO_U while converter->mode<8, i.e. while the
+* BOM handling of the toUnicode function has not finished.
+* Otherwise returns the code point and advances pArgs->source, or returns 0xffff
+* and sets *err:
+* - U_INDEX_OUTOFBOUNDS_ERROR: no input at all
+* - U_TRUNCATED_CHAR_FOUND: the input ends inside a code unit or surrogate pair;
+*   the remaining bytes are copied to converter->toUBytes[]
+* - U_ILLEGAL_CHAR_FOUND: unmatched surrogate; its two bytes are copied to
+*   converter->toUBytes[]
+*/
+static UChar32
+_UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
+    UConverter *converter=pArgs->converter;
+    const uint8_t *cursor, *limit;
+    UChar32 unit;
+    if(converter->mode<8) {
+        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+    }
+    cursor=(const uint8_t *)pArgs->source;
+    limit=(const uint8_t *)pArgs->sourceLimit;
+    if(cursor>=limit) {
+        /* no input */
+        *err=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
+    if(cursor+2>limit) {
+        /* only one byte: truncated UChar */
+        converter->toUBytes[0]=*cursor++;
+        converter->toULength=1;
+        pArgs->source=(const char *)cursor;
+        *err=U_TRUNCATED_CHAR_FOUND;
+        return 0xffff;
+    }
+    /* get one UChar, high byte first */
+    unit=((UChar32)cursor[0]<<8)|cursor[1];
+    cursor+=2;
+    if(!U_IS_SURROGATE(unit)) {
+        /* the common case: a BMP code point */
+        pArgs->source=(const char *)cursor;
+        return unit;
+    }
+    if(U16_IS_SURROGATE_LEAD(unit)) {
+        if(cursor+2>limit) {
+            /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
+            uint8_t *pending=converter->toUBytes;
+            cursor-=2;
+            converter->toULength=(int8_t)(limit-cursor);
+            while(cursor<limit) {
+                *pending++=*cursor++;
+            }
+            pArgs->source=(const char *)cursor;
+            *err=U_TRUNCATED_CHAR_FOUND;
+            return 0xffff;
+        } else {
+            /* get a second UChar and see if it is a trail surrogate */
+            UChar trail=(UChar)(((UChar)cursor[0]<<8)|cursor[1]);
+            if(U16_IS_TRAIL(trail)) {
+                unit=U16_GET_SUPPLEMENTARY(unit, trail);
+                pArgs->source=(const char *)(cursor+2);
+                return unit;
+            }
+        }
+    }
+    /* unmatched lead or trail surrogate: report its two bytes */
+    converter->toULength=2;
+    converter->toUBytes[0]=cursor[-2];
+    converter->toUBytes[1]=cursor[-1];
+    pArgs->source=(const char *)cursor;
+    *err=U_ILLEGAL_CHAR_FOUND;
+    return 0xffff;
+}
+/*
+* Reset the UTF-16BE converter state selected by choice.
+* toUnicode: mode 8 (no BOM handling) for version 0, mode 0 (BOM detection) otherwise.
+* fromUnicode: version 1 (Java "UnicodeBig") is set up to write a BOM again.
+*/
+static void
+_UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
+    const int resetToUnicode=(choice<=UCNV_RESET_TO_UNICODE);
+    const int resetFromUnicode=(choice!=UCNV_RESET_TO_UNICODE);
+    if(resetToUnicode) {
+        /* version 0: no BOM handling; otherwise Java-specific "UnicodeBig" requires BE BOM or no BOM */
+        cnv->mode=(UCNV_GET_VERSION(cnv)==0) ? 8 : 0;
+    }
+    if(resetFromUnicode && UCNV_GET_VERSION(cnv)==1) {
+        /* "UnicodeBig": prepare to output the UTF-16BE BOM */
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
+    }
+}
+/*
+* Open a UTF-16BE converter: versions 0 and 1 are accepted and fully reset,
+* any other version yields U_ILLEGAL_ARGUMENT_ERROR. pArgs is not used.
+*/
+static void
+_UTF16BEOpen(UConverter *cnv,
+             UConverterLoadArgs *pArgs,
+             UErrorCode *pErrorCode) {
+    if(UCNV_GET_VERSION(cnv)>1) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    _UTF16BEReset(cnv, UCNV_RESET_BOTH);
+}
+/* Return the converter name, including the version option for versions other than 0. */
+static const char *
+_UTF16BEGetName(const UConverter *cnv) {
+    return (UCNV_GET_VERSION(cnv)==0) ? "UTF-16BE" : "UTF-16BE,version=1";
+}
+static const UConverterImpl _UTF16BEImpl={ /* function table for UTF-16BE; entries are positional, the slot layout is defined by UConverterImpl (declared outside this file) */
+UCNV_UTF16_BigEndian,
+NULL,
+NULL,
+_UTF16BEOpen,
+NULL,
+_UTF16BEReset,
+_UTF16BEToUnicodeWithOffsets, /* the same toUnicode function fills two consecutive slots */
+_UTF16BEToUnicodeWithOffsets,
+_UTF16BEFromUnicodeWithOffsets, /* likewise for fromUnicode */
+_UTF16BEFromUnicodeWithOffsets,
+_UTF16BEGetNextUChar,
+NULL,
+_UTF16BEGetName,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+static const UConverterStaticData _UTF16BEStaticData={ /* constant descriptive data for UTF-16BE; entries are positional */
+sizeof(UConverterStaticData),
+"UTF-16BE",
+1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
+{ 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, /* first entry: bytes FF FD, i.e. U+FFFD high byte first - presumably the subChar field that _UTF16Open copies; TODO confirm against the struct declaration */
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF16BEData={ /* non-static: also compared against in IS_UTF16BE() further down */
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
+0
+};
+/* UTF-16LE ----------------------------------------------------------------- */
+static void /*
+* Convert UTF-16 code units from pArgs->source into little-endian byte pairs at
+* pArgs->target; when pArgs->offsets is not NULL, one source index is recorded
+* per output byte.
+* If cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM the bytes FF FE are written first.
+* A lead surrogate at the very end of the input is remembered in cnv->fromUChar32
+* and combined with the first unit of the next call.
+* *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when input remains but the target
+* is full, and to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate (which is left
+* in cnv->fromUChar32).
+* On the normal path the advanced source/target/offsets pointers are stored back
+* into pArgs at the end of the function.
+*/
+_UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+UConverter *cnv;
+const UChar *source;
+char *target;
+int32_t *offsets;
+uint32_t targetCapacity, length, sourceIndex; /* length: UChars left in the source; later reused as the byte count in overflow[] */
+UChar c, trail;
+char overflow[4]; /* bytes of one code point that may not fit completely into the target */
+source=pArgs->source;
+length=(int32_t)(pArgs->sourceLimit-source);
+if(length<=0) { /* length is unsigned, so this is effectively length==0 */
+/* no input, nothing to do */
+return;
+}
+cnv=pArgs->converter;
+/* write the BOM if necessary */
+if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ (char)0xff, (char)0xfe }; /* U+FEFF, low byte first */
+ucnv_fromUWriteBytes(cnv,
+bom, 2,
+&pArgs->target, pArgs->targetLimit,
+&pArgs->offsets, -1,
+pErrorCode);
+cnv->fromUnicodeStatus=0; /* the BOM is written only once */
+}
+target=pArgs->target; /* read only now: the BOM write above may have advanced pArgs->target */
+if(target >= pArgs->targetLimit) {
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+return;
+}
+targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); /* free bytes in the target */
+offsets=pArgs->offsets;
+sourceIndex=0;
+/* c!=0 indicates in several places outside the main loops that a surrogate was found */
+if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
+/* the last buffer ended with a lead surrogate, output the surrogate pair */
+++source;
+--length;
+target[0]=(uint8_t)c;
+target[1]=(uint8_t)(c>>8);
+target[2]=(uint8_t)trail;
+target[3]=(uint8_t)(trail>>8);
+target+=4;
+targetCapacity-=4;
+if(offsets!=NULL) {
+*offsets++=-1; /* -1: the lead surrogate came from a previous buffer */
+*offsets++=-1;
+*offsets++=-1;
+*offsets++=-1;
+}
+sourceIndex=1;
+cnv->fromUChar32=c=0;
+}
+if(c==0) {
+/* copy an even number of bytes for complete UChars */
+uint32_t count=2*length;
+if(count>targetCapacity) {
+count=targetCapacity&~1;
+}
+/* count is even */
+targetCapacity-=count;
+count>>=1; /* from here on count is the number of UChars the loops may convert */
+length-=count;
+if(offsets==NULL) {
+while(count>0) {
+c=*source++;
+if(U16_IS_SINGLE(c)) {
+target[0]=(uint8_t)c;
+target[1]=(uint8_t)(c>>8);
+target+=2;
+} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+++source;
+--count; /* the pair uses two UChars; the second decrement is at the end of the loop body */
+target[0]=(uint8_t)c;
+target[1]=(uint8_t)(c>>8);
+target[2]=(uint8_t)trail;
+target[3]=(uint8_t)(trail>>8);
+target+=4;
+} else {
+break; /* c keeps the surrogate for the handling after the loops */
+}
+--count;
+}
+} else {
+while(count>0) { /* same loop as above, additionally recording one offset per output byte */
+c=*source++;
+if(U16_IS_SINGLE(c)) {
+target[0]=(uint8_t)c;
+target[1]=(uint8_t)(c>>8);
+target+=2;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex++;
+} else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
+++source;
+--count;
+target[0]=(uint8_t)c;
+target[1]=(uint8_t)(c>>8);
+target[2]=(uint8_t)trail;
+target[3]=(uint8_t)(trail>>8);
+target+=4;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+*offsets++=sourceIndex;
+sourceIndex+=2;
+} else {
+break;
+}
+--count;
+}
+}
+if(count==0) {
+/* done with the loop for complete UChars */
+if(length>0 && targetCapacity>0) {
+/*
+* there is more input and some target capacity -
+* it must be targetCapacity==1 because otherwise
+* the above would have copied more;
+* prepare for overflow output
+*/
+if(U16_IS_SINGLE(c=*source++)) {
+overflow[0]=(char)c;
+overflow[1]=(char)(c>>8);
+length=2; /* 2 bytes to output */
+c=0;
+/* } else { keep c for surrogate handling, length will be set there */
+}
+} else {
+length=0;
+c=0;
+}
+} else {
+/* keep c for surrogate handling, length will be set there */
+targetCapacity+=2*count; /* give back the capacity reserved for the UChars that were not converted */
+}
+} else {
+length=0; /* from here on, length counts the bytes in overflow[] */
+}
+if(c!=0) {
+/*
+* c is a surrogate, and
+* - source or target too short
+* - or the surrogate is unmatched
+*/
+length=0;
+if(U16_IS_SURROGATE_LEAD(c)) {
+if(source<pArgs->sourceLimit) {
+if(U16_IS_TRAIL(trail=*source)) {
+/* output the surrogate pair, will overflow (see conditions comment above) */
+++source;
+overflow[0]=(char)c;
+overflow[1]=(char)(c>>8);
+overflow[2]=(char)trail;
+overflow[3]=(char)(trail>>8);
+length=4; /* 4 bytes to output */
+c=0;
+} else {
+/* unmatched lead surrogate */
+*pErrorCode=U_ILLEGAL_CHAR_FOUND;
+}
+} else {
+/* see if the trail surrogate is in the next buffer */
+}
+} else {
+/* unmatched trail surrogate */
+*pErrorCode=U_ILLEGAL_CHAR_FOUND;
+}
+cnv->fromUChar32=c; /* 0 after a completed pair; otherwise the pending or unmatched surrogate */
+}
+if(length>0) {
+/* output length bytes with overflow (length>targetCapacity>0) */
+ucnv_fromUWriteBytes(cnv,
+overflow, length,
+&target, pArgs->targetLimit,
+&offsets, sourceIndex,
+pErrorCode);
+targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
+}
+if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR; /* input is left but the target is full */
+}
+/* write back the updated pointers */
+pArgs->source=source;
+pArgs->target=target;
+pArgs->offsets=offsets;
+}
+/*
+* Convert little-endian UTF-16 bytes from pArgs->source into UChars at pArgs->target;
+* when pArgs->offsets is not NULL, one source index is recorded per output UChar.
+*
+* While converter->mode<8 (BOM handling not finished, see the state table in the
+* "UTF-16 (Detect BOM)" section) the call is forwarded to _UTF16ToUnicodeWithOffsets().
+*
+* State carried between calls:
+* - cnv->toUBytes[]/cnv->toULength: bytes of an incomplete code unit or surrogate pair
+* - cnv->toUnicodeStatus: 0x100|byte for one pending byte that could not stay in
+*   toUBytes[] because toUBytes[] was needed to report an unmatched lead surrogate
+* - cnv->UCharErrorBuffer[]: a trail surrogate that did not fit into the target
+*
+* *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full and to
+* U_ILLEGAL_CHAR_FOUND for an unmatched surrogate, whose bytes are left in toUBytes[].
+* The advanced source/target/offsets pointers are stored back into pArgs, except on
+* the two early returns for "no input" and "no target space".
+*/
+static void
+_UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+                             UErrorCode *pErrorCode) {
+    UConverter *cnv;
+    const uint8_t *source;
+    UChar *target;
+    int32_t *offsets;
+    uint32_t targetCapacity, length, count, sourceIndex;
+    UChar c, trail;
+    if(pArgs->converter->mode<8) {
+        _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
+        return;
+    }
+    cnv=pArgs->converter;
+    source=(const uint8_t *)pArgs->source;
+    length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
+    /* length is unsigned, so length<=0 is effectively length==0 */
+    if(length<=0 && cnv->toUnicodeStatus==0) {
+        /* no input, nothing to do */
+        return;
+    }
+    target=pArgs->target;
+    if(target >= pArgs->targetLimit) {
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        return;
+    }
+    targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
+    offsets=pArgs->offsets;
+    sourceIndex=0;
+    c=0;
+    /* complete a partial UChar or pair from the last call */
+    if(cnv->toUnicodeStatus!=0) {
+        /*
+        * special case: single byte from a previous buffer,
+        * where the byte turned out not to belong to a trail surrogate
+        * and the preceding, unmatched lead surrogate was put into toUBytes[]
+        * for error handling
+        */
+        cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
+        cnv->toULength=1;
+        cnv->toUnicodeStatus=0;
+    }
+    if((count=cnv->toULength)!=0) {
+        uint8_t *p=cnv->toUBytes;
+        /*
+        * Test length before reading: this function can be entered with an empty
+        * source when only toUnicodeStatus is pending (see the check above).
+        * Reading first would access memory at sourceLimit and wrap the unsigned
+        * length counter. With no input the pending bytes simply stay in toUBytes[].
+        */
+        while(length>0) {
+            p[count++]=*source++;
+            ++sourceIndex;
+            --length;
+            if(count==2) {
+                c=((UChar)p[1]<<8)|p[0];
+                if(U16_IS_SINGLE(c)) {
+                    /* output the BMP code point */
+                    *target++=c;
+                    if(offsets!=NULL) {
+                        *offsets++=-1;
+                    }
+                    --targetCapacity;
+                    count=0;
+                    c=0;
+                    break;
+                } else if(U16_IS_SURROGATE_LEAD(c)) {
+                    /* continue collecting bytes for the trail surrogate */
+                    c=0; /* avoid unnecessary surrogate handling below */
+                } else {
+                    /* fall through to error handling for an unmatched trail surrogate */
+                    break;
+                }
+            } else if(count==4) {
+                c=((UChar)p[1]<<8)|p[0];
+                trail=((UChar)p[3]<<8)|p[2];
+                if(U16_IS_TRAIL(trail)) {
+                    /* output the surrogate pair */
+                    *target++=c;
+                    if(targetCapacity>=2) {
+                        *target++=trail;
+                        if(offsets!=NULL) {
+                            *offsets++=-1;
+                            *offsets++=-1;
+                        }
+                        targetCapacity-=2;
+                    } else /* targetCapacity==1 */ {
+                        targetCapacity=0;
+                        cnv->UCharErrorBuffer[0]=trail;
+                        cnv->UCharErrorBufferLength=1;
+                        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                    }
+                    count=0;
+                    c=0;
+                    break;
+                } else {
+                    /* unmatched lead surrogate, handle here for consistent toUBytes[] */
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                    /*
+                    * back out reading the code unit after it:
+                    * if both of its bytes were read from the current buffer
+                    * (at least 2 bytes consumed since pArgs->source), just rewind
+                    */
+                    if((source-(const uint8_t *)pArgs->source)>=2) {
+                        source-=2;
+                    } else {
+                        /*
+                        * if the trail unit's first byte was in a previous buffer, then
+                        * we need to put it into a special place because toUBytes[] will be
+                        * used for the lead unit's bytes
+                        */
+                        cnv->toUnicodeStatus=0x100|p[2];
+                        --source;
+                    }
+                    cnv->toULength=2;
+                    /* write back the updated pointers */
+                    pArgs->source=(const char *)source;
+                    pArgs->target=target;
+                    pArgs->offsets=offsets;
+                    return;
+                }
+            }
+        }
+        cnv->toULength=(int8_t)count;
+    }
+    /* copy an even number of bytes for complete UChars */
+    count=2*targetCapacity;
+    if(count>length) {
+        count=length&~1;
+    }
+    if(c==0 && count>0) {
+        length-=count;
+        count>>=1; /* from here on count is the number of UChars the loops may produce */
+        targetCapacity-=count;
+        if(offsets==NULL) {
+            do {
+                c=((UChar)source[1]<<8)|source[0];
+                source+=2;
+                if(U16_IS_SINGLE(c)) {
+                    *target++=c;
+                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
+                ) {
+                    source+=2;
+                    --count;
+                    *target++=c;
+                    *target++=trail;
+                } else {
+                    break;
+                }
+            } while(--count>0);
+        } else {
+            /* same loop, additionally recording one offset per output UChar */
+            do {
+                c=((UChar)source[1]<<8)|source[0];
+                source+=2;
+                if(U16_IS_SINGLE(c)) {
+                    *target++=c;
+                    *offsets++=sourceIndex;
+                    sourceIndex+=2;
+                } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
+                          U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
+                ) {
+                    source+=2;
+                    --count;
+                    *target++=c;
+                    *target++=trail;
+                    *offsets++=sourceIndex;
+                    *offsets++=sourceIndex;
+                    sourceIndex+=4;
+                } else {
+                    break;
+                }
+            } while(--count>0);
+        }
+        if(count==0) {
+            /* done with the loop for complete UChars */
+            c=0;
+        } else {
+            /* keep c for surrogate handling, trail will be set there */
+            length+=2*(count-1); /* one more byte pair was consumed than count decremented */
+            targetCapacity+=count;
+        }
+    }
+    if(c!=0) {
+        /*
+        * c is a surrogate, and
+        * - source or target too short
+        * - or the surrogate is unmatched
+        */
+        cnv->toUBytes[0]=(uint8_t)c;
+        cnv->toUBytes[1]=(uint8_t)(c>>8);
+        cnv->toULength=2;
+        if(U16_IS_SURROGATE_LEAD(c)) {
+            if(length>=2) {
+                if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
+                    /* output the surrogate pair, will overflow (see conditions comment above) */
+                    source+=2;
+                    length-=2;
+                    *target++=c;
+                    if(offsets!=NULL) {
+                        *offsets++=sourceIndex;
+                    }
+                    cnv->UCharErrorBuffer[0]=trail;
+                    cnv->UCharErrorBufferLength=1;
+                    cnv->toULength=0;
+                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+                } else {
+                    /* unmatched lead surrogate */
+                    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+                }
+            } else {
+                /* see if the trail surrogate is in the next buffer */
+            }
+        } else {
+            /* unmatched trail surrogate */
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+        }
+    }
+    if(U_SUCCESS(*pErrorCode)) {
+        /* check for a remaining source byte */
+        if(length>0) {
+            if(targetCapacity==0) {
+                *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+            } else {
+                /* it must be length==1 because otherwise the above would have copied more */
+                cnv->toUBytes[cnv->toULength++]=*source++;
+            }
+        }
+    }
+    /* write back the updated pointers */
+    pArgs->source=(const char *)source;
+    pArgs->target=target;
+    pArgs->offsets=offsets;
+}
+/*
+* Read a single code point from little-endian UTF-16 input.
+*
+* Returns UCNV_GET_NEXT_UCHAR_USE_TO_U while converter->mode<8, i.e. while the
+* BOM handling of the toUnicode function has not finished.
+* Otherwise returns the code point and advances pArgs->source, or returns 0xffff
+* and sets *err:
+* - U_INDEX_OUTOFBOUNDS_ERROR: no input at all
+* - U_TRUNCATED_CHAR_FOUND: the input ends inside a code unit or surrogate pair;
+*   the remaining bytes are copied to converter->toUBytes[]
+* - U_ILLEGAL_CHAR_FOUND: unmatched surrogate; its two bytes are copied to
+*   converter->toUBytes[]
+*/
+static UChar32
+_UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
+    UConverter *converter=pArgs->converter;
+    const uint8_t *cursor, *limit;
+    UChar32 unit;
+    if(converter->mode<8) {
+        return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+    }
+    cursor=(const uint8_t *)pArgs->source;
+    limit=(const uint8_t *)pArgs->sourceLimit;
+    if(cursor>=limit) {
+        /* no input */
+        *err=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
+    if(cursor+2>limit) {
+        /* only one byte: truncated UChar */
+        converter->toUBytes[0]=*cursor++;
+        converter->toULength=1;
+        pArgs->source=(const char *)cursor;
+        *err=U_TRUNCATED_CHAR_FOUND;
+        return 0xffff;
+    }
+    /* get one UChar, low byte first */
+    unit=((UChar32)cursor[1]<<8)|cursor[0];
+    cursor+=2;
+    if(!U_IS_SURROGATE(unit)) {
+        /* the common case: a BMP code point */
+        pArgs->source=(const char *)cursor;
+        return unit;
+    }
+    if(U16_IS_SURROGATE_LEAD(unit)) {
+        if(cursor+2>limit) {
+            /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
+            uint8_t *pending=converter->toUBytes;
+            cursor-=2;
+            converter->toULength=(int8_t)(limit-cursor);
+            while(cursor<limit) {
+                *pending++=*cursor++;
+            }
+            pArgs->source=(const char *)cursor;
+            *err=U_TRUNCATED_CHAR_FOUND;
+            return 0xffff;
+        } else {
+            /* get a second UChar and see if it is a trail surrogate */
+            UChar trail=(UChar)(((UChar)cursor[1]<<8)|cursor[0]);
+            if(U16_IS_TRAIL(trail)) {
+                unit=U16_GET_SUPPLEMENTARY(unit, trail);
+                pArgs->source=(const char *)(cursor+2);
+                return unit;
+            }
+        }
+    }
+    /* unmatched lead or trail surrogate: report its two bytes */
+    converter->toULength=2;
+    converter->toUBytes[0]=cursor[-2];
+    converter->toUBytes[1]=cursor[-1];
+    pArgs->source=(const char *)cursor;
+    *err=U_ILLEGAL_CHAR_FOUND;
+    return 0xffff;
+}
+/*
+* Reset the UTF-16LE converter state selected by choice.
+* toUnicode: mode 8 (no BOM handling) for version 0, mode 0 (BOM detection) otherwise.
+* fromUnicode: version 1 (Java "UnicodeLittle") is set up to write a BOM again.
+*/
+static void
+_UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
+    const int resetToUnicode=(choice<=UCNV_RESET_TO_UNICODE);
+    const int resetFromUnicode=(choice!=UCNV_RESET_TO_UNICODE);
+    if(resetToUnicode) {
+        /* version 0: no BOM handling; otherwise Java-specific "UnicodeLittle" requires LE BOM or no BOM */
+        cnv->mode=(UCNV_GET_VERSION(cnv)==0) ? 8 : 0;
+    }
+    if(resetFromUnicode && UCNV_GET_VERSION(cnv)==1) {
+        /* "UnicodeLittle": prepare to output the UTF-16LE BOM */
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
+    }
+}
+/*
+* Open a UTF-16LE converter: versions 0 and 1 are accepted and fully reset,
+* any other version yields U_ILLEGAL_ARGUMENT_ERROR. pArgs is not used.
+*/
+static void
+_UTF16LEOpen(UConverter *cnv,
+             UConverterLoadArgs *pArgs,
+             UErrorCode *pErrorCode) {
+    if(UCNV_GET_VERSION(cnv)>1) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    _UTF16LEReset(cnv, UCNV_RESET_BOTH);
+}
+/* Return the converter name, including the version option for versions other than 0. */
+static const char *
+_UTF16LEGetName(const UConverter *cnv) {
+    return (UCNV_GET_VERSION(cnv)==0) ? "UTF-16LE" : "UTF-16LE,version=1";
+}
+static const UConverterImpl _UTF16LEImpl={ /* function table for UTF-16LE; entries are positional, the slot layout is defined by UConverterImpl (declared outside this file) */
+UCNV_UTF16_LittleEndian,
+NULL,
+NULL,
+_UTF16LEOpen,
+NULL,
+_UTF16LEReset,
+_UTF16LEToUnicodeWithOffsets, /* the same toUnicode function fills two consecutive slots */
+_UTF16LEToUnicodeWithOffsets,
+_UTF16LEFromUnicodeWithOffsets, /* likewise for fromUnicode */
+_UTF16LEFromUnicodeWithOffsets,
+_UTF16LEGetNextUChar,
+NULL,
+_UTF16LEGetName,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+static const UConverterStaticData _UTF16LEStaticData={ /* constant descriptive data for UTF-16LE; entries are positional */
+sizeof(UConverterStaticData),
+"UTF-16LE",
+1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
+{ 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, /* first entry: bytes FD FF, i.e. U+FFFD low byte first - presumably the subChar field that _UTF16Open copies; TODO confirm against the struct declaration */
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF16LEData={ /* non-static: also compared against in IS_UTF16LE() further down */
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
+0
+};
+/* UTF-16 (Detect BOM) ------------------------------------------------------ */
+/*
+* Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
+* accordingly.
+* This is a simpler version of the UTF-32 converter, with
+* fewer states for shorter BOMs.
+*
+* State values:
+* 0    initial state
+* 1    saw first byte
+* 2..5 -
+* 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
+* 8    UTF-16BE mode
+* 9    UTF-16LE mode
+*
+* During detection: state==number of initial bytes seen so far.
+*
+* On output, emit U+FEFF as the first code point.
+*
+* Variants:
+* - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
+* - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
+*   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
+*/
+/*
+* Reset the BOM-detecting UTF-16 converter state selected by choice:
+* toUnicode goes back to detection state 0, fromUnicode will write a BOM again.
+*/
+static void
+_UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
+    const int resetToUnicode=(choice<=UCNV_RESET_TO_UNICODE);
+    const int resetFromUnicode=(choice!=UCNV_RESET_TO_UNICODE);
+    if(resetToUnicode) {
+        /* state=0: nothing of a BOM has been seen yet */
+        cnv->mode=0;
+    }
+    if(resetFromUnicode) {
+        /* prepare to output the UTF-16PE BOM */
+        cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
+    }
+}
+/* declared ahead of its initializer (at the end of this section) because _UTF16Open() refers to it */
+static const UConverterSharedData _UTF16v2Data;
+/*
+* Open a BOM-detecting UTF-16 converter.
+* Versions 0..2 are accepted; any other version yields U_ILLEGAL_ARGUMENT_ERROR.
+* For version 2 (unless the caller only tests loadability) the converter is
+* switched to the _UTF16v2Data implementation before the state is reset.
+*/
+static void
+_UTF16Open(UConverter *cnv,
+           UConverterLoadArgs *pArgs,
+           UErrorCode *pErrorCode) {
+    if(UCNV_GET_VERSION(cnv)>2) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return;
+    }
+    if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
+        /*
+        * Switch implementation, and switch the staticData that's different
+        * and was copied into the UConverter.
+        * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
+        * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
+        */
+        cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
+        uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
+    }
+    _UTF16Reset(cnv, UCNV_RESET_BOTH);
+}
+/* Return the converter name, including the version option for versions other than 0. */
+static const char *
+_UTF16GetName(const UConverter *cnv) {
+    switch(UCNV_GET_VERSION(cnv)) {
+    case 0:
+        return "UTF-16";
+    case 1:
+        return "UTF-16,version=1";
+    default:
+        return "UTF-16,version=2";
+    }
+}
+const UConverterSharedData _UTF16Data; /* declared ahead of its initializer (further down) so that IS_UTF16() can take its address */
+#define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) /* the three macros identify the converter flavor by comparing its sharedData pointer */
+#define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
+#define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) /* both the standard and the version=2 data count as "UTF-16" */
+static void /*
+* toUnicode entry point with BOM detection. It is the toUnicode function of the
+* "UTF-16" converters and is also entered from the UTF-16BE/LE toUnicode functions
+* while converter->mode<8.
+* cnv->mode holds the detection state (see the state table above _UTF16Reset):
+* states 0 and 1 look at the first two bytes, states 8 and 9 hand the input to
+* the UTF-16BE or UTF-16LE converter.
+* A missing or reversed BOM that the converter variant does not allow is reported
+* as U_ILLEGAL_ESCAPE_SEQUENCE with the two bytes in cnv->toUBytes[].
+*/
+_UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+UConverter *cnv=pArgs->converter;
+const char *source=pArgs->source;
+const char *sourceLimit=pArgs->sourceLimit;
+int32_t *offsets=pArgs->offsets; /* start of the offsets written during this call, used for the adjustment after the loop */
+int32_t state, offsetDelta;
+uint8_t b;
+state=cnv->mode;
+/*
+* If we detect a BOM in this buffer, then we must add the BOM size to the
+* offsets because the actual converter function will not see and count the BOM.
+* offsetDelta will have the number of the BOM bytes that are in the current buffer.
+*/
+offsetDelta=0;
+while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
+switch(state) {
+case 0:
+cnv->toUBytes[0]=(uint8_t)*source++; /* remember the first byte; it may arrive in an earlier buffer than the second one */
+cnv->toULength=1;
+state=1;
+break;
+case 1:
+/*
+* Only inside this switch case can the state variable
+* temporarily take two additional values:
+* 6: BOM error, continue with BE
+* 7: BOM error, continue with LE
+*/
+b=*source; /* second byte; only peeked at here, consumed below if it completes a BOM */
+if(cnv->toUBytes[0]==0xfe && b==0xff) {
+if(IS_UTF16LE(cnv)) {
+state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
+} else {
+state=8; /* detect UTF-16BE */
+}
+} else if(cnv->toUBytes[0]==0xff && b==0xfe) {
+if(IS_UTF16BE(cnv)) {
+state=6; /* illegal reverse BOM for Java "UnicodeBig" */
+} else {
+state=9; /* detect UTF-16LE */
+}
+} else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
+state=6; /* illegal missing BOM for Java "Unicode" */
+}
+if(state>=8) {
+/* BOM detected, consume it */
+++source;
+cnv->toULength=0;
+offsetDelta=(int32_t)(source-pArgs->source);
+} else if(state<6) { /* state is still 1 here: none of the branches above matched */
+/* ok: no BOM, and not a reverse BOM */
+if(source!=pArgs->source) {
+/* reset the source for a correct first offset */
+source=pArgs->source;
+cnv->toULength=0;
+} /* otherwise the first byte came from an earlier buffer and stays in toUBytes[] for the BE/LE converter */
+if(IS_UTF16LE(cnv)) {
+/* Make Java "UnicodeLittle" default to LE. */
+state=9;
+} else {
+/* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
+state=8;
+}
+} else {
+/*
+* error: missing BOM, or reverse BOM
+* UTF-16,version=1: Java-specific "Unicode" requires a BOM.
+* UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
+* UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
+*/
+/* report the non-BOM or reverse BOM as an illegal sequence */
+cnv->toUBytes[1]=b;
+cnv->toULength=2;
+pArgs->source=source+1;
+/* continue with conversion if the callback resets the error */
+/*
+* Make Java "Unicode" default to BE like standard UTF-16.
+* Make Java "UnicodeBig" and "UnicodeLittle" default
+* to their normal endiannesses.
+*/
+cnv->mode=state+2; /* 6 becomes 8 (BE), 7 becomes 9 (LE) */
+*pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
+return;
+}
+/* convert the rest of the stream */
+cnv->mode=state; /* must be stored before the BE/LE converter runs: it forwards back to this function while mode<8 */
+continue;
+case 8:
+/* call UTF-16BE */
+pArgs->source=source;
+_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
+source=pArgs->source;
+break;
+case 9:
+/* call UTF-16LE */
+pArgs->source=source;
+_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
+source=pArgs->source;
+break;
+default:
+break; /* does not occur */
+}
+}
+/* add BOM size to offsets - see comment at offsetDelta declaration */
+if(offsets!=NULL && offsetDelta!=0) {
+int32_t *offsetsLimit=pArgs->offsets; /* the BE/LE converter advanced pArgs->offsets past the entries it wrote */
+while(offsets<offsetsLimit) {
+*offsets++ += offsetDelta;
+}
+}
+pArgs->source=source;
+if(source==sourceLimit && pArgs->flush) {
+/* handle truncated input */
+switch(state) {
+case 0:
+break; /* no input at all, nothing to do */
+case 8:
+_UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); /* called with an empty source */
+break;
+case 9:
+_UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); /* called with an empty source */
+break;
+default:
+/* 0<state<8: framework will report truncation, nothing to do here */
+break;
+}
+}
+cnv->mode=state;
+}
+/*
+* getNextUChar for the BOM-detecting converter: once the byte order is known
+* (mode 8 = BE, mode 9 = LE) use the matching fast path; before that, tell the
+* caller to go through the toUnicode function.
+*/
+static UChar32
+_UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
+                   UErrorCode *pErrorCode) {
+    if(pArgs->converter->mode==8) {
+        return _UTF16BEGetNextUChar(pArgs, pErrorCode);
+    }
+    if(pArgs->converter->mode==9) {
+        return _UTF16LEGetNextUChar(pArgs, pErrorCode);
+    }
+    return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+}
+static const UConverterImpl _UTF16Impl = { /* function table for BOM-detecting UTF-16; entries are positional, the slot layout is defined by UConverterImpl (declared outside this file) */
+UCNV_UTF16,
+NULL,
+NULL,
+_UTF16Open,
+NULL,
+_UTF16Reset,
+_UTF16ToUnicodeWithOffsets,
+_UTF16ToUnicodeWithOffsets,
+_UTF16PEFromUnicodeWithOffsets, /* macro: the BE or LE fromUnicode function, chosen by U_IS_BIG_ENDIAN */
+_UTF16PEFromUnicodeWithOffsets,
+_UTF16GetNextUChar,
+NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
+_UTF16GetName,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+static const UConverterStaticData _UTF16StaticData = { /* constant descriptive data for "UTF-16"; entries are positional */
+sizeof(UConverterStaticData),
+"UTF-16",
+1204, /* CCSID for BOM sensitive UTF-16 */
+UCNV_IBM, UCNV_UTF16, 2, 2,
+#if U_IS_BIG_ENDIAN /* the byte order of the FF/FD pair follows U_IS_BIG_ENDIAN, like the fromUnicode function above */
+{ 0xff, 0xfd, 0, 0 }, 2,
+#else
+{ 0xfd, 0xff, 0, 0 }, 2,
+#endif
+FALSE, FALSE,
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF16Data = { /* initializer for the object declared before the IS_UTF16() macro */
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
+0
+};
+static const UConverterImpl _UTF16v2Impl = { /* function table for "UTF-16,version=2"; same entries as _UTF16Impl except that fromUnicode is always the big-endian function */
+UCNV_UTF16,
+NULL,
+NULL,
+_UTF16Open,
+NULL,
+_UTF16Reset,
+_UTF16ToUnicodeWithOffsets,
+_UTF16ToUnicodeWithOffsets,
+_UTF16BEFromUnicodeWithOffsets, /* always big-endian output, independent of U_IS_BIG_ENDIAN */
+_UTF16BEFromUnicodeWithOffsets,
+_UTF16GetNextUChar,
+NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
+_UTF16GetName,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+static const UConverterStaticData _UTF16v2StaticData = { /* constant descriptive data for "UTF-16,version=2"; entries are positional */
+sizeof(UConverterStaticData),
+"UTF-16,version=2",
+1204, /* CCSID for BOM sensitive UTF-16 */
+UCNV_IBM, UCNV_UTF16, 2, 2,
+{ 0xff, 0xfd, 0, 0 }, 2, /* bytes FF FD in big-endian order; _UTF16Open copies staticData->subChar from this object into cnv->subChars */
+FALSE, FALSE,
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+static const UConverterSharedData _UTF16v2Data = { /* initializer for the object declared before _UTF16Open() */
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
+0
+};
+#endif

The Tor Browser / file comparison

comparison: intl/icu/source/common/ucnv_u16.c

intl/icu/source/common/ucnv_u16.c