michael@0: /* michael@0: ********************************************************************** michael@0: * Copyright (C) 2002-2010, International Business Machines michael@0: * Corporation and others. All Rights Reserved. michael@0: ********************************************************************** michael@0: * file name: ucnv_u16.c michael@0: * encoding: US-ASCII michael@0: * tab size: 8 (not used) michael@0: * indentation:4 michael@0: * michael@0: * created on: 2002jul01 michael@0: * created by: Markus W. Scherer michael@0: * michael@0: * UTF-16 converter implementation. Used to be in ucnv_utf.c. michael@0: */ michael@0: michael@0: #include "unicode/utypes.h" michael@0: michael@0: #if !UCONFIG_NO_CONVERSION michael@0: michael@0: #include "unicode/ucnv.h" michael@0: #include "ucnv_bld.h" michael@0: #include "ucnv_cnv.h" michael@0: #include "cmemory.h" michael@0: michael@0: enum { michael@0: UCNV_NEED_TO_WRITE_BOM=1 michael@0: }; michael@0: michael@0: /* michael@0: * The UTF-16 toUnicode implementation is also used for the Java-specific michael@0: * "with BOM" variants of UTF-16BE and UTF-16LE. michael@0: */ michael@0: static void michael@0: _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode); michael@0: michael@0: /* UTF-16BE ----------------------------------------------------------------- */ michael@0: michael@0: #if U_IS_BIG_ENDIAN michael@0: # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets michael@0: #else michael@0: # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets michael@0: #endif michael@0: michael@0: michael@0: static void michael@0: _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: const UChar *source; michael@0: char *target; michael@0: int32_t *offsets; michael@0: michael@0: uint32_t targetCapacity, length, sourceIndex; michael@0: UChar c, trail; michael@0: char overflow[4]; michael@0: michael@0: source=pArgs->source; michael@0: length=(int32_t)(pArgs->sourceLimit-source); michael@0: if(length<=0) { michael@0: /* no input, nothing to do */ michael@0: return; michael@0: } michael@0: michael@0: cnv=pArgs->converter; michael@0: michael@0: /* write the BOM if necessary */ michael@0: if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { michael@0: static const char bom[]={ (char)0xfe, (char)0xff }; michael@0: ucnv_fromUWriteBytes(cnv, michael@0: bom, 2, michael@0: &pArgs->target, pArgs->targetLimit, michael@0: &pArgs->offsets, -1, michael@0: pErrorCode); michael@0: cnv->fromUnicodeStatus=0; michael@0: } michael@0: michael@0: target=pArgs->target; michael@0: if(target >= pArgs->targetLimit) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: return; michael@0: } michael@0: michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-target); michael@0: offsets=pArgs->offsets; michael@0: sourceIndex=0; michael@0: michael@0: /* c!=0 indicates in several places outside the main loops that a surrogate was found */ michael@0: michael@0: if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { michael@0: /* the last buffer ended with a lead surrogate, output the surrogate pair */ michael@0: ++source; michael@0: --length; michael@0: target[0]=(uint8_t)(c>>8); michael@0: target[1]=(uint8_t)c; michael@0: target[2]=(uint8_t)(trail>>8); michael@0: target[3]=(uint8_t)trail; michael@0: target+=4; michael@0: targetCapacity-=4; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: } michael@0: sourceIndex=1; michael@0: cnv->fromUChar32=c=0; michael@0: } michael@0: michael@0: if(c==0) { michael@0: /* copy an even number of bytes for complete UChars */ michael@0: uint32_t count=2*length; michael@0: if(count>targetCapacity) { michael@0: count=targetCapacity&~1; michael@0: } michael@0: /* count is even */ michael@0: targetCapacity-=count; michael@0: count>>=1; michael@0: length-=count; michael@0: michael@0: if(offsets==NULL) { michael@0: while(count>0) { michael@0: c=*source++; michael@0: if(U16_IS_SINGLE(c)) { michael@0: target[0]=(uint8_t)(c>>8); michael@0: target[1]=(uint8_t)c; michael@0: target+=2; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { michael@0: ++source; michael@0: --count; michael@0: target[0]=(uint8_t)(c>>8); michael@0: target[1]=(uint8_t)c; michael@0: target[2]=(uint8_t)(trail>>8); michael@0: target[3]=(uint8_t)trail; michael@0: target+=4; michael@0: } else { michael@0: break; michael@0: } michael@0: --count; michael@0: } michael@0: } else { michael@0: while(count>0) { michael@0: c=*source++; michael@0: if(U16_IS_SINGLE(c)) { michael@0: target[0]=(uint8_t)(c>>8); michael@0: target[1]=(uint8_t)c; michael@0: target+=2; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex++; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { michael@0: ++source; michael@0: --count; michael@0: target[0]=(uint8_t)(c>>8); michael@0: target[1]=(uint8_t)c; michael@0: target[2]=(uint8_t)(trail>>8); michael@0: target[3]=(uint8_t)trail; michael@0: target+=4; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=2; michael@0: } else { michael@0: break; michael@0: } michael@0: --count; michael@0: } michael@0: } michael@0: michael@0: if(count==0) { michael@0: /* done with the loop for complete UChars */ michael@0: if(length>0 && targetCapacity>0) { michael@0: /* michael@0: * there is more input and some target capacity - michael@0: * it must be targetCapacity==1 because otherwise michael@0: * the above would have copied more; michael@0: * prepare for overflow output michael@0: */ michael@0: if(U16_IS_SINGLE(c=*source++)) { michael@0: overflow[0]=(char)(c>>8); michael@0: overflow[1]=(char)c; michael@0: length=2; /* 2 bytes to output */ michael@0: c=0; michael@0: /* } else { keep c for surrogate handling, length will be set there */ michael@0: } michael@0: } else { michael@0: length=0; michael@0: c=0; michael@0: } michael@0: } else { michael@0: /* keep c for surrogate handling, length will be set there */ michael@0: targetCapacity+=2*count; michael@0: } michael@0: } else { michael@0: length=0; /* from here on, length counts the bytes in overflow[] */ michael@0: } michael@0: michael@0: if(c!=0) { michael@0: /* michael@0: * c is a surrogate, and michael@0: * - source or target too short michael@0: * - or the surrogate is unmatched michael@0: */ michael@0: length=0; michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(sourcesourceLimit) { michael@0: if(U16_IS_TRAIL(trail=*source)) { michael@0: /* output the surrogate pair, will overflow (see conditions comment above) */ michael@0: ++source; michael@0: overflow[0]=(char)(c>>8); michael@0: overflow[1]=(char)c; michael@0: overflow[2]=(char)(trail>>8); michael@0: overflow[3]=(char)trail; michael@0: length=4; /* 4 bytes to output */ michael@0: c=0; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* see if the trail surrogate is in the next buffer */ michael@0: } michael@0: } else { michael@0: /* unmatched trail surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: cnv->fromUChar32=c; michael@0: } michael@0: michael@0: if(length>0) { michael@0: /* output length bytes with overflow (length>targetCapacity>0) */ michael@0: ucnv_fromUWriteBytes(cnv, michael@0: overflow, length, michael@0: (char **)&target, pArgs->targetLimit, michael@0: &offsets, sourceIndex, michael@0: pErrorCode); michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); michael@0: } michael@0: michael@0: if(U_SUCCESS(*pErrorCode) && sourcesourceLimit && targetCapacity==0) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=source; michael@0: pArgs->target=(char *)target; michael@0: pArgs->offsets=offsets; michael@0: } michael@0: michael@0: static void michael@0: _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: const uint8_t *source; michael@0: UChar *target; michael@0: int32_t *offsets; michael@0: michael@0: uint32_t targetCapacity, length, count, sourceIndex; michael@0: UChar c, trail; michael@0: michael@0: if(pArgs->converter->mode<8) { michael@0: _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: return; michael@0: } michael@0: michael@0: cnv=pArgs->converter; michael@0: source=(const uint8_t *)pArgs->source; michael@0: length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); michael@0: if(length<=0 && cnv->toUnicodeStatus==0) { michael@0: /* no input, nothing to do */ michael@0: return; michael@0: } michael@0: michael@0: target=pArgs->target; michael@0: if(target >= pArgs->targetLimit) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: return; michael@0: } michael@0: michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-target); michael@0: offsets=pArgs->offsets; michael@0: sourceIndex=0; michael@0: c=0; michael@0: michael@0: /* complete a partial UChar or pair from the last call */ michael@0: if(cnv->toUnicodeStatus!=0) { michael@0: /* michael@0: * special case: single byte from a previous buffer, michael@0: * where the byte turned out not to belong to a trail surrogate michael@0: * and the preceding, unmatched lead surrogate was put into toUBytes[] michael@0: * for error handling michael@0: */ michael@0: cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; michael@0: cnv->toULength=1; michael@0: cnv->toUnicodeStatus=0; michael@0: } michael@0: if((count=cnv->toULength)!=0) { michael@0: uint8_t *p=cnv->toUBytes; michael@0: do { michael@0: p[count++]=*source++; michael@0: ++sourceIndex; michael@0: --length; michael@0: if(count==2) { michael@0: c=((UChar)p[0]<<8)|p[1]; michael@0: if(U16_IS_SINGLE(c)) { michael@0: /* output the BMP code point */ michael@0: *target++=c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: } michael@0: --targetCapacity; michael@0: count=0; michael@0: c=0; michael@0: break; michael@0: } else if(U16_IS_SURROGATE_LEAD(c)) { michael@0: /* continue collecting bytes for the trail surrogate */ michael@0: c=0; /* avoid unnecessary surrogate handling below */ michael@0: } else { michael@0: /* fall through to error handling for an unmatched trail surrogate */ michael@0: break; michael@0: } michael@0: } else if(count==4) { michael@0: c=((UChar)p[0]<<8)|p[1]; michael@0: trail=((UChar)p[2]<<8)|p[3]; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: /* output the surrogate pair */ michael@0: *target++=c; michael@0: if(targetCapacity>=2) { michael@0: *target++=trail; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: } michael@0: targetCapacity-=2; michael@0: } else /* targetCapacity==1 */ { michael@0: targetCapacity=0; michael@0: cnv->UCharErrorBuffer[0]=trail; michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: count=0; michael@0: c=0; michael@0: break; michael@0: } else { michael@0: /* unmatched lead surrogate, handle here for consistent toUBytes[] */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: michael@0: /* back out reading the code unit after it */ michael@0: if(((const uint8_t *)pArgs->source-source)>=2) { michael@0: source-=2; michael@0: } else { michael@0: /* michael@0: * if the trail unit's first byte was in a previous buffer, then michael@0: * we need to put it into a special place because toUBytes[] will be michael@0: * used for the lead unit's bytes michael@0: */ michael@0: cnv->toUnicodeStatus=0x100|p[2]; michael@0: --source; michael@0: } michael@0: cnv->toULength=2; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: return; michael@0: } michael@0: } michael@0: } while(length>0); michael@0: cnv->toULength=(int8_t)count; michael@0: } michael@0: michael@0: /* copy an even number of bytes for complete UChars */ michael@0: count=2*targetCapacity; michael@0: if(count>length) { michael@0: count=length&~1; michael@0: } michael@0: if(c==0 && count>0) { michael@0: length-=count; michael@0: count>>=1; michael@0: targetCapacity-=count; michael@0: if(offsets==NULL) { michael@0: do { michael@0: c=((UChar)source[0]<<8)|source[1]; michael@0: source+=2; michael@0: if(U16_IS_SINGLE(c)) { michael@0: *target++=c; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && michael@0: U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) michael@0: ) { michael@0: source+=2; michael@0: --count; michael@0: *target++=c; michael@0: *target++=trail; michael@0: } else { michael@0: break; michael@0: } michael@0: } while(--count>0); michael@0: } else { michael@0: do { michael@0: c=((UChar)source[0]<<8)|source[1]; michael@0: source+=2; michael@0: if(U16_IS_SINGLE(c)) { michael@0: *target++=c; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=2; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && michael@0: U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) michael@0: ) { michael@0: source+=2; michael@0: --count; michael@0: *target++=c; michael@0: *target++=trail; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=4; michael@0: } else { michael@0: break; michael@0: } michael@0: } while(--count>0); michael@0: } michael@0: michael@0: if(count==0) { michael@0: /* done with the loop for complete UChars */ michael@0: c=0; michael@0: } else { michael@0: /* keep c for surrogate handling, trail will be set there */ michael@0: length+=2*(count-1); /* one more byte pair was consumed than count decremented */ michael@0: targetCapacity+=count; michael@0: } michael@0: } michael@0: michael@0: if(c!=0) { michael@0: /* michael@0: * c is a surrogate, and michael@0: * - source or target too short michael@0: * - or the surrogate is unmatched michael@0: */ michael@0: cnv->toUBytes[0]=(uint8_t)(c>>8); michael@0: cnv->toUBytes[1]=(uint8_t)c; michael@0: cnv->toULength=2; michael@0: michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(length>=2) { michael@0: if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { michael@0: /* output the surrogate pair, will overflow (see conditions comment above) */ michael@0: source+=2; michael@0: length-=2; michael@0: *target++=c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: cnv->UCharErrorBuffer[0]=trail; michael@0: cnv->UCharErrorBufferLength=1; michael@0: cnv->toULength=0; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* see if the trail surrogate is in the next buffer */ michael@0: } michael@0: } else { michael@0: /* unmatched trail surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } michael@0: michael@0: if(U_SUCCESS(*pErrorCode)) { michael@0: /* check for a remaining source byte */ michael@0: if(length>0) { michael@0: if(targetCapacity==0) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } else { michael@0: /* it must be length==1 because otherwise the above would have copied more */ michael@0: cnv->toUBytes[cnv->toULength++]=*source++; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: } michael@0: michael@0: static UChar32 michael@0: _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { michael@0: const uint8_t *s, *sourceLimit; michael@0: UChar32 c; michael@0: michael@0: if(pArgs->converter->mode<8) { michael@0: return UCNV_GET_NEXT_UCHAR_USE_TO_U; michael@0: } michael@0: michael@0: s=(const uint8_t *)pArgs->source; michael@0: sourceLimit=(const uint8_t *)pArgs->sourceLimit; michael@0: michael@0: if(s>=sourceLimit) { michael@0: /* no input */ michael@0: *err=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0xffff; michael@0: } michael@0: michael@0: if(s+2>sourceLimit) { michael@0: /* only one byte: truncated UChar */ michael@0: pArgs->converter->toUBytes[0]=*s++; michael@0: pArgs->converter->toULength=1; michael@0: pArgs->source=(const char *)s; michael@0: *err = U_TRUNCATED_CHAR_FOUND; michael@0: return 0xffff; michael@0: } michael@0: michael@0: /* get one UChar */ michael@0: c=((UChar32)*s<<8)|s[1]; michael@0: s+=2; michael@0: michael@0: /* check for a surrogate pair */ michael@0: if(U_IS_SURROGATE(c)) { michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(s+2<=sourceLimit) { michael@0: UChar trail; michael@0: michael@0: /* get a second UChar and see if it is a trail surrogate */ michael@0: trail=((UChar)*s<<8)|s[1]; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: c=U16_GET_SUPPLEMENTARY(c, trail); michael@0: s+=2; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: c=-2; michael@0: } michael@0: } else { michael@0: /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ michael@0: uint8_t *bytes=pArgs->converter->toUBytes; michael@0: s-=2; michael@0: pArgs->converter->toULength=(int8_t)(sourceLimit-s); michael@0: do { michael@0: *bytes++=*s++; michael@0: } while(sconverter->toUBytes; michael@0: pArgs->converter->toULength=2; michael@0: *bytes=*(s-2); michael@0: bytes[1]=*(s-1); michael@0: michael@0: c=0xffff; michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } michael@0: michael@0: pArgs->source=(const char *)s; michael@0: return c; michael@0: } michael@0: michael@0: static void michael@0: _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: /* reset toUnicode state */ michael@0: if(UCNV_GET_VERSION(cnv)==0) { michael@0: cnv->mode=8; /* no BOM handling */ michael@0: } else { michael@0: cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ michael@0: } michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { michael@0: /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ michael@0: cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: _UTF16BEOpen(UConverter *cnv, michael@0: UConverterLoadArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: if(UCNV_GET_VERSION(cnv)<=1) { michael@0: _UTF16BEReset(cnv, UCNV_RESET_BOTH); michael@0: } else { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } michael@0: michael@0: static const char * michael@0: _UTF16BEGetName(const UConverter *cnv) { michael@0: if(UCNV_GET_VERSION(cnv)==0) { michael@0: return "UTF-16BE"; michael@0: } else { michael@0: return "UTF-16BE,version=1"; michael@0: } michael@0: } michael@0: michael@0: static const UConverterImpl _UTF16BEImpl={ michael@0: UCNV_UTF16_BigEndian, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _UTF16BEOpen, michael@0: NULL, michael@0: _UTF16BEReset, michael@0: michael@0: _UTF16BEToUnicodeWithOffsets, michael@0: _UTF16BEToUnicodeWithOffsets, michael@0: _UTF16BEFromUnicodeWithOffsets, michael@0: _UTF16BEFromUnicodeWithOffsets, michael@0: _UTF16BEGetNextUChar, michael@0: michael@0: NULL, michael@0: _UTF16BEGetName, michael@0: NULL, michael@0: NULL, michael@0: ucnv_getNonSurrogateUnicodeSet michael@0: }; michael@0: michael@0: static const UConverterStaticData _UTF16BEStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "UTF-16BE", michael@0: 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, michael@0: { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: michael@0: const UConverterSharedData _UTF16BEData={ michael@0: sizeof(UConverterSharedData), ~((uint32_t) 0), michael@0: NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, michael@0: 0 michael@0: }; michael@0: michael@0: /* UTF-16LE ----------------------------------------------------------------- */ michael@0: michael@0: static void michael@0: _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: const UChar *source; michael@0: char *target; michael@0: int32_t *offsets; michael@0: michael@0: uint32_t targetCapacity, length, sourceIndex; michael@0: UChar c, trail; michael@0: char overflow[4]; michael@0: michael@0: source=pArgs->source; michael@0: length=(int32_t)(pArgs->sourceLimit-source); michael@0: if(length<=0) { michael@0: /* no input, nothing to do */ michael@0: return; michael@0: } michael@0: michael@0: cnv=pArgs->converter; michael@0: michael@0: /* write the BOM if necessary */ michael@0: if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { michael@0: static const char bom[]={ (char)0xff, (char)0xfe }; michael@0: ucnv_fromUWriteBytes(cnv, michael@0: bom, 2, michael@0: &pArgs->target, pArgs->targetLimit, michael@0: &pArgs->offsets, -1, michael@0: pErrorCode); michael@0: cnv->fromUnicodeStatus=0; michael@0: } michael@0: michael@0: target=pArgs->target; michael@0: if(target >= pArgs->targetLimit) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: return; michael@0: } michael@0: michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); michael@0: offsets=pArgs->offsets; michael@0: sourceIndex=0; michael@0: michael@0: /* c!=0 indicates in several places outside the main loops that a surrogate was found */ michael@0: michael@0: if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { michael@0: /* the last buffer ended with a lead surrogate, output the surrogate pair */ michael@0: ++source; michael@0: --length; michael@0: target[0]=(uint8_t)c; michael@0: target[1]=(uint8_t)(c>>8); michael@0: target[2]=(uint8_t)trail; michael@0: target[3]=(uint8_t)(trail>>8); michael@0: target+=4; michael@0: targetCapacity-=4; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: } michael@0: sourceIndex=1; michael@0: cnv->fromUChar32=c=0; michael@0: } michael@0: michael@0: if(c==0) { michael@0: /* copy an even number of bytes for complete UChars */ michael@0: uint32_t count=2*length; michael@0: if(count>targetCapacity) { michael@0: count=targetCapacity&~1; michael@0: } michael@0: /* count is even */ michael@0: targetCapacity-=count; michael@0: count>>=1; michael@0: length-=count; michael@0: michael@0: if(offsets==NULL) { michael@0: while(count>0) { michael@0: c=*source++; michael@0: if(U16_IS_SINGLE(c)) { michael@0: target[0]=(uint8_t)c; michael@0: target[1]=(uint8_t)(c>>8); michael@0: target+=2; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { michael@0: ++source; michael@0: --count; michael@0: target[0]=(uint8_t)c; michael@0: target[1]=(uint8_t)(c>>8); michael@0: target[2]=(uint8_t)trail; michael@0: target[3]=(uint8_t)(trail>>8); michael@0: target+=4; michael@0: } else { michael@0: break; michael@0: } michael@0: --count; michael@0: } michael@0: } else { michael@0: while(count>0) { michael@0: c=*source++; michael@0: if(U16_IS_SINGLE(c)) { michael@0: target[0]=(uint8_t)c; michael@0: target[1]=(uint8_t)(c>>8); michael@0: target+=2; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex++; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { michael@0: ++source; michael@0: --count; michael@0: target[0]=(uint8_t)c; michael@0: target[1]=(uint8_t)(c>>8); michael@0: target[2]=(uint8_t)trail; michael@0: target[3]=(uint8_t)(trail>>8); michael@0: target+=4; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=2; michael@0: } else { michael@0: break; michael@0: } michael@0: --count; michael@0: } michael@0: } michael@0: michael@0: if(count==0) { michael@0: /* done with the loop for complete UChars */ michael@0: if(length>0 && targetCapacity>0) { michael@0: /* michael@0: * there is more input and some target capacity - michael@0: * it must be targetCapacity==1 because otherwise michael@0: * the above would have copied more; michael@0: * prepare for overflow output michael@0: */ michael@0: if(U16_IS_SINGLE(c=*source++)) { michael@0: overflow[0]=(char)c; michael@0: overflow[1]=(char)(c>>8); michael@0: length=2; /* 2 bytes to output */ michael@0: c=0; michael@0: /* } else { keep c for surrogate handling, length will be set there */ michael@0: } michael@0: } else { michael@0: length=0; michael@0: c=0; michael@0: } michael@0: } else { michael@0: /* keep c for surrogate handling, length will be set there */ michael@0: targetCapacity+=2*count; michael@0: } michael@0: } else { michael@0: length=0; /* from here on, length counts the bytes in overflow[] */ michael@0: } michael@0: michael@0: if(c!=0) { michael@0: /* michael@0: * c is a surrogate, and michael@0: * - source or target too short michael@0: * - or the surrogate is unmatched michael@0: */ michael@0: length=0; michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(sourcesourceLimit) { michael@0: if(U16_IS_TRAIL(trail=*source)) { michael@0: /* output the surrogate pair, will overflow (see conditions comment above) */ michael@0: ++source; michael@0: overflow[0]=(char)c; michael@0: overflow[1]=(char)(c>>8); michael@0: overflow[2]=(char)trail; michael@0: overflow[3]=(char)(trail>>8); michael@0: length=4; /* 4 bytes to output */ michael@0: c=0; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* see if the trail surrogate is in the next buffer */ michael@0: } michael@0: } else { michael@0: /* unmatched trail surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: cnv->fromUChar32=c; michael@0: } michael@0: michael@0: if(length>0) { michael@0: /* output length bytes with overflow (length>targetCapacity>0) */ michael@0: ucnv_fromUWriteBytes(cnv, michael@0: overflow, length, michael@0: &target, pArgs->targetLimit, michael@0: &offsets, sourceIndex, michael@0: pErrorCode); michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); michael@0: } michael@0: michael@0: if(U_SUCCESS(*pErrorCode) && sourcesourceLimit && targetCapacity==0) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: } michael@0: michael@0: static void michael@0: _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv; michael@0: const uint8_t *source; michael@0: UChar *target; michael@0: int32_t *offsets; michael@0: michael@0: uint32_t targetCapacity, length, count, sourceIndex; michael@0: UChar c, trail; michael@0: michael@0: if(pArgs->converter->mode<8) { michael@0: _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: return; michael@0: } michael@0: michael@0: cnv=pArgs->converter; michael@0: source=(const uint8_t *)pArgs->source; michael@0: length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); michael@0: if(length<=0 && cnv->toUnicodeStatus==0) { michael@0: /* no input, nothing to do */ michael@0: return; michael@0: } michael@0: michael@0: target=pArgs->target; michael@0: if(target >= pArgs->targetLimit) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: return; michael@0: } michael@0: michael@0: targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); michael@0: offsets=pArgs->offsets; michael@0: sourceIndex=0; michael@0: c=0; michael@0: michael@0: /* complete a partial UChar or pair from the last call */ michael@0: if(cnv->toUnicodeStatus!=0) { michael@0: /* michael@0: * special case: single byte from a previous buffer, michael@0: * where the byte turned out not to belong to a trail surrogate michael@0: * and the preceding, unmatched lead surrogate was put into toUBytes[] michael@0: * for error handling michael@0: */ michael@0: cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; michael@0: cnv->toULength=1; michael@0: cnv->toUnicodeStatus=0; michael@0: } michael@0: if((count=cnv->toULength)!=0) { michael@0: uint8_t *p=cnv->toUBytes; michael@0: do { michael@0: p[count++]=*source++; michael@0: ++sourceIndex; michael@0: --length; michael@0: if(count==2) { michael@0: c=((UChar)p[1]<<8)|p[0]; michael@0: if(U16_IS_SINGLE(c)) { michael@0: /* output the BMP code point */ michael@0: *target++=c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: } michael@0: --targetCapacity; michael@0: count=0; michael@0: c=0; michael@0: break; michael@0: } else if(U16_IS_SURROGATE_LEAD(c)) { michael@0: /* continue collecting bytes for the trail surrogate */ michael@0: c=0; /* avoid unnecessary surrogate handling below */ michael@0: } else { michael@0: /* fall through to error handling for an unmatched trail surrogate */ michael@0: break; michael@0: } michael@0: } else if(count==4) { michael@0: c=((UChar)p[1]<<8)|p[0]; michael@0: trail=((UChar)p[3]<<8)|p[2]; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: /* output the surrogate pair */ michael@0: *target++=c; michael@0: if(targetCapacity>=2) { michael@0: *target++=trail; michael@0: if(offsets!=NULL) { michael@0: *offsets++=-1; michael@0: *offsets++=-1; michael@0: } michael@0: targetCapacity-=2; michael@0: } else /* targetCapacity==1 */ { michael@0: targetCapacity=0; michael@0: cnv->UCharErrorBuffer[0]=trail; michael@0: cnv->UCharErrorBufferLength=1; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } michael@0: count=0; michael@0: c=0; michael@0: break; michael@0: } else { michael@0: /* unmatched lead surrogate, handle here for consistent toUBytes[] */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: michael@0: /* back out reading the code unit after it */ michael@0: if(((const uint8_t *)pArgs->source-source)>=2) { michael@0: source-=2; michael@0: } else { michael@0: /* michael@0: * if the trail unit's first byte was in a previous buffer, then michael@0: * we need to put it into a special place because toUBytes[] will be michael@0: * used for the lead unit's bytes michael@0: */ michael@0: cnv->toUnicodeStatus=0x100|p[2]; michael@0: --source; michael@0: } michael@0: cnv->toULength=2; michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: return; michael@0: } michael@0: } michael@0: } while(length>0); michael@0: cnv->toULength=(int8_t)count; michael@0: } michael@0: michael@0: /* copy an even number of bytes for complete UChars */ michael@0: count=2*targetCapacity; michael@0: if(count>length) { michael@0: count=length&~1; michael@0: } michael@0: if(c==0 && count>0) { michael@0: length-=count; michael@0: count>>=1; michael@0: targetCapacity-=count; michael@0: if(offsets==NULL) { michael@0: do { michael@0: c=((UChar)source[1]<<8)|source[0]; michael@0: source+=2; michael@0: if(U16_IS_SINGLE(c)) { michael@0: *target++=c; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && michael@0: U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) michael@0: ) { michael@0: source+=2; michael@0: --count; michael@0: *target++=c; michael@0: *target++=trail; michael@0: } else { michael@0: break; michael@0: } michael@0: } while(--count>0); michael@0: } else { michael@0: do { michael@0: c=((UChar)source[1]<<8)|source[0]; michael@0: source+=2; michael@0: if(U16_IS_SINGLE(c)) { michael@0: *target++=c; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=2; michael@0: } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && michael@0: U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) michael@0: ) { michael@0: source+=2; michael@0: --count; michael@0: *target++=c; michael@0: *target++=trail; michael@0: *offsets++=sourceIndex; michael@0: *offsets++=sourceIndex; michael@0: sourceIndex+=4; michael@0: } else { michael@0: break; michael@0: } michael@0: } while(--count>0); michael@0: } michael@0: michael@0: if(count==0) { michael@0: /* done with the loop for complete UChars */ michael@0: c=0; michael@0: } else { michael@0: /* keep c for surrogate handling, trail will be set there */ michael@0: length+=2*(count-1); /* one more byte pair was consumed than count decremented */ michael@0: targetCapacity+=count; michael@0: } michael@0: } michael@0: michael@0: if(c!=0) { michael@0: /* michael@0: * c is a surrogate, and michael@0: * - source or target too short michael@0: * - or the surrogate is unmatched michael@0: */ michael@0: cnv->toUBytes[0]=(uint8_t)c; michael@0: cnv->toUBytes[1]=(uint8_t)(c>>8); michael@0: cnv->toULength=2; michael@0: michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(length>=2) { michael@0: if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { michael@0: /* output the surrogate pair, will overflow (see conditions comment above) */ michael@0: source+=2; michael@0: length-=2; michael@0: *target++=c; michael@0: if(offsets!=NULL) { michael@0: *offsets++=sourceIndex; michael@0: } michael@0: cnv->UCharErrorBuffer[0]=trail; michael@0: cnv->UCharErrorBufferLength=1; michael@0: cnv->toULength=0; michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } else { michael@0: /* see if the trail surrogate is in the next buffer */ michael@0: } michael@0: } else { michael@0: /* unmatched trail surrogate */ michael@0: *pErrorCode=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } michael@0: michael@0: if(U_SUCCESS(*pErrorCode)) { michael@0: /* check for a remaining source byte */ michael@0: if(length>0) { michael@0: if(targetCapacity==0) { michael@0: *pErrorCode=U_BUFFER_OVERFLOW_ERROR; michael@0: } else { michael@0: /* it must be length==1 because otherwise the above would have copied more */ michael@0: cnv->toUBytes[cnv->toULength++]=*source++; michael@0: } michael@0: } michael@0: } michael@0: michael@0: /* write back the updated pointers */ michael@0: pArgs->source=(const char *)source; michael@0: pArgs->target=target; michael@0: pArgs->offsets=offsets; michael@0: } michael@0: michael@0: static UChar32 michael@0: _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { michael@0: const uint8_t *s, *sourceLimit; michael@0: UChar32 c; michael@0: michael@0: if(pArgs->converter->mode<8) { michael@0: return UCNV_GET_NEXT_UCHAR_USE_TO_U; michael@0: } michael@0: michael@0: s=(const uint8_t *)pArgs->source; michael@0: sourceLimit=(const uint8_t *)pArgs->sourceLimit; michael@0: michael@0: if(s>=sourceLimit) { michael@0: /* no input */ michael@0: *err=U_INDEX_OUTOFBOUNDS_ERROR; michael@0: return 0xffff; michael@0: } michael@0: michael@0: if(s+2>sourceLimit) { michael@0: /* only one byte: truncated UChar */ michael@0: pArgs->converter->toUBytes[0]=*s++; michael@0: pArgs->converter->toULength=1; michael@0: pArgs->source=(const char *)s; michael@0: *err = U_TRUNCATED_CHAR_FOUND; michael@0: return 0xffff; michael@0: } michael@0: michael@0: /* get one UChar */ michael@0: c=((UChar32)s[1]<<8)|*s; michael@0: s+=2; michael@0: michael@0: /* check for a surrogate pair */ michael@0: if(U_IS_SURROGATE(c)) { michael@0: if(U16_IS_SURROGATE_LEAD(c)) { michael@0: if(s+2<=sourceLimit) { michael@0: UChar trail; michael@0: michael@0: /* get a second UChar and see if it is a trail surrogate */ michael@0: trail=((UChar)s[1]<<8)|*s; michael@0: if(U16_IS_TRAIL(trail)) { michael@0: c=U16_GET_SUPPLEMENTARY(c, trail); michael@0: s+=2; michael@0: } else { michael@0: /* unmatched lead surrogate */ michael@0: c=-2; michael@0: } michael@0: } else { michael@0: /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ michael@0: uint8_t *bytes=pArgs->converter->toUBytes; michael@0: s-=2; michael@0: pArgs->converter->toULength=(int8_t)(sourceLimit-s); michael@0: do { michael@0: *bytes++=*s++; michael@0: } while(sconverter->toUBytes; michael@0: pArgs->converter->toULength=2; michael@0: *bytes=*(s-2); michael@0: bytes[1]=*(s-1); michael@0: michael@0: c=0xffff; michael@0: *err=U_ILLEGAL_CHAR_FOUND; michael@0: } michael@0: } michael@0: michael@0: pArgs->source=(const char *)s; michael@0: return c; michael@0: } michael@0: michael@0: static void michael@0: _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: /* reset toUnicode state */ michael@0: if(UCNV_GET_VERSION(cnv)==0) { michael@0: cnv->mode=8; /* no BOM handling */ michael@0: } else { michael@0: cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ michael@0: } michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { michael@0: /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ michael@0: cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: _UTF16LEOpen(UConverter *cnv, michael@0: UConverterLoadArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: if(UCNV_GET_VERSION(cnv)<=1) { michael@0: _UTF16LEReset(cnv, UCNV_RESET_BOTH); michael@0: } else { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } michael@0: michael@0: static const char * michael@0: _UTF16LEGetName(const UConverter *cnv) { michael@0: if(UCNV_GET_VERSION(cnv)==0) { michael@0: return "UTF-16LE"; michael@0: } else { michael@0: return "UTF-16LE,version=1"; michael@0: } michael@0: } michael@0: michael@0: static const UConverterImpl _UTF16LEImpl={ michael@0: UCNV_UTF16_LittleEndian, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _UTF16LEOpen, michael@0: NULL, michael@0: _UTF16LEReset, michael@0: michael@0: _UTF16LEToUnicodeWithOffsets, michael@0: _UTF16LEToUnicodeWithOffsets, michael@0: _UTF16LEFromUnicodeWithOffsets, michael@0: _UTF16LEFromUnicodeWithOffsets, michael@0: _UTF16LEGetNextUChar, michael@0: michael@0: NULL, michael@0: _UTF16LEGetName, michael@0: NULL, michael@0: NULL, michael@0: ucnv_getNonSurrogateUnicodeSet michael@0: }; michael@0: michael@0: michael@0: static const UConverterStaticData _UTF16LEStaticData={ michael@0: sizeof(UConverterStaticData), michael@0: "UTF-16LE", michael@0: 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, michael@0: { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: michael@0: const UConverterSharedData _UTF16LEData={ michael@0: sizeof(UConverterSharedData), ~((uint32_t) 0), michael@0: NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, michael@0: 0 michael@0: }; michael@0: michael@0: /* UTF-16 (Detect BOM) ------------------------------------------------------ */ michael@0: michael@0: /* michael@0: * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE michael@0: * accordingly. michael@0: * This is a simpler version of the UTF-32 converter, with michael@0: * fewer states for shorter BOMs. michael@0: * michael@0: * State values: michael@0: * 0 initial state michael@0: * 1 saw first byte michael@0: * 2..5 - michael@0: * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 michael@0: * 8 UTF-16BE mode michael@0: * 9 UTF-16LE mode michael@0: * michael@0: * During detection: state==number of initial bytes seen so far. michael@0: * michael@0: * On output, emit U+FEFF as the first code point. michael@0: * michael@0: * Variants: michael@0: * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. michael@0: * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and michael@0: * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. michael@0: */ michael@0: michael@0: static void michael@0: _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { michael@0: if(choice<=UCNV_RESET_TO_UNICODE) { michael@0: /* reset toUnicode: state=0 */ michael@0: cnv->mode=0; michael@0: } michael@0: if(choice!=UCNV_RESET_TO_UNICODE) { michael@0: /* reset fromUnicode: prepare to output the UTF-16PE BOM */ michael@0: cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; michael@0: } michael@0: } michael@0: michael@0: static const UConverterSharedData _UTF16v2Data; michael@0: michael@0: static void michael@0: _UTF16Open(UConverter *cnv, michael@0: UConverterLoadArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: if(UCNV_GET_VERSION(cnv)<=2) { michael@0: if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { michael@0: /* michael@0: * Switch implementation, and switch the staticData that's different michael@0: * and was copied into the UConverter. michael@0: * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) michael@0: * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. michael@0: */ michael@0: cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; michael@0: uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); michael@0: } michael@0: _UTF16Reset(cnv, UCNV_RESET_BOTH); michael@0: } else { michael@0: *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; michael@0: } michael@0: } michael@0: michael@0: static const char * michael@0: _UTF16GetName(const UConverter *cnv) { michael@0: if(UCNV_GET_VERSION(cnv)==0) { michael@0: return "UTF-16"; michael@0: } else if(UCNV_GET_VERSION(cnv)==1) { michael@0: return "UTF-16,version=1"; michael@0: } else { michael@0: return "UTF-16,version=2"; michael@0: } michael@0: } michael@0: michael@0: const UConverterSharedData _UTF16Data; michael@0: michael@0: #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) michael@0: #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) michael@0: #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data) michael@0: michael@0: static void michael@0: _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: UConverter *cnv=pArgs->converter; michael@0: const char *source=pArgs->source; michael@0: const char *sourceLimit=pArgs->sourceLimit; michael@0: int32_t *offsets=pArgs->offsets; michael@0: michael@0: int32_t state, offsetDelta; michael@0: uint8_t b; michael@0: michael@0: state=cnv->mode; michael@0: michael@0: /* michael@0: * If we detect a BOM in this buffer, then we must add the BOM size to the michael@0: * offsets because the actual converter function will not see and count the BOM. michael@0: * offsetDelta will have the number of the BOM bytes that are in the current buffer. michael@0: */ michael@0: offsetDelta=0; michael@0: michael@0: while(sourcetoUBytes[0]=(uint8_t)*source++; michael@0: cnv->toULength=1; michael@0: state=1; michael@0: break; michael@0: case 1: michael@0: /* michael@0: * Only inside this switch case can the state variable michael@0: * temporarily take two additional values: michael@0: * 6: BOM error, continue with BE michael@0: * 7: BOM error, continue with LE michael@0: */ michael@0: b=*source; michael@0: if(cnv->toUBytes[0]==0xfe && b==0xff) { michael@0: if(IS_UTF16LE(cnv)) { michael@0: state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ michael@0: } else { michael@0: state=8; /* detect UTF-16BE */ michael@0: } michael@0: } else if(cnv->toUBytes[0]==0xff && b==0xfe) { michael@0: if(IS_UTF16BE(cnv)) { michael@0: state=6; /* illegal reverse BOM for Java "UnicodeBig" */ michael@0: } else { michael@0: state=9; /* detect UTF-16LE */ michael@0: } michael@0: } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { michael@0: state=6; /* illegal missing BOM for Java "Unicode" */ michael@0: } michael@0: if(state>=8) { michael@0: /* BOM detected, consume it */ michael@0: ++source; michael@0: cnv->toULength=0; michael@0: offsetDelta=(int32_t)(source-pArgs->source); michael@0: } else if(state<6) { michael@0: /* ok: no BOM, and not a reverse BOM */ michael@0: if(source!=pArgs->source) { michael@0: /* reset the source for a correct first offset */ michael@0: source=pArgs->source; michael@0: cnv->toULength=0; michael@0: } michael@0: if(IS_UTF16LE(cnv)) { michael@0: /* Make Java "UnicodeLittle" default to LE. */ michael@0: state=9; michael@0: } else { michael@0: /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ michael@0: state=8; michael@0: } michael@0: } else { michael@0: /* michael@0: * error: missing BOM, or reverse BOM michael@0: * UTF-16,version=1: Java-specific "Unicode" requires a BOM. michael@0: * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. michael@0: * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. michael@0: */ michael@0: /* report the non-BOM or reverse BOM as an illegal sequence */ michael@0: cnv->toUBytes[1]=b; michael@0: cnv->toULength=2; michael@0: pArgs->source=source+1; michael@0: /* continue with conversion if the callback resets the error */ michael@0: /* michael@0: * Make Java "Unicode" default to BE like standard UTF-16. michael@0: * Make Java "UnicodeBig" and "UnicodeLittle" default michael@0: * to their normal endiannesses. michael@0: */ michael@0: cnv->mode=state+2; michael@0: *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; michael@0: return; michael@0: } michael@0: /* convert the rest of the stream */ michael@0: cnv->mode=state; michael@0: continue; michael@0: case 8: michael@0: /* call UTF-16BE */ michael@0: pArgs->source=source; michael@0: _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: source=pArgs->source; michael@0: break; michael@0: case 9: michael@0: /* call UTF-16LE */ michael@0: pArgs->source=source; michael@0: _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: source=pArgs->source; michael@0: break; michael@0: default: michael@0: break; /* does not occur */ michael@0: } michael@0: } michael@0: michael@0: /* add BOM size to offsets - see comment at offsetDelta declaration */ michael@0: if(offsets!=NULL && offsetDelta!=0) { michael@0: int32_t *offsetsLimit=pArgs->offsets; michael@0: while(offsetssource=source; michael@0: michael@0: if(source==sourceLimit && pArgs->flush) { michael@0: /* handle truncated input */ michael@0: switch(state) { michael@0: case 0: michael@0: break; /* no input at all, nothing to do */ michael@0: case 8: michael@0: _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: break; michael@0: case 9: michael@0: _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); michael@0: break; michael@0: default: michael@0: /* 0mode=state; michael@0: } michael@0: michael@0: static UChar32 michael@0: _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, michael@0: UErrorCode *pErrorCode) { michael@0: switch(pArgs->converter->mode) { michael@0: case 8: michael@0: return _UTF16BEGetNextUChar(pArgs, pErrorCode); michael@0: case 9: michael@0: return _UTF16LEGetNextUChar(pArgs, pErrorCode); michael@0: default: michael@0: return UCNV_GET_NEXT_UCHAR_USE_TO_U; michael@0: } michael@0: } michael@0: michael@0: static const UConverterImpl _UTF16Impl = { michael@0: UCNV_UTF16, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _UTF16Open, michael@0: NULL, michael@0: _UTF16Reset, michael@0: michael@0: _UTF16ToUnicodeWithOffsets, michael@0: _UTF16ToUnicodeWithOffsets, michael@0: _UTF16PEFromUnicodeWithOffsets, michael@0: _UTF16PEFromUnicodeWithOffsets, michael@0: _UTF16GetNextUChar, michael@0: michael@0: NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ michael@0: _UTF16GetName, michael@0: NULL, michael@0: NULL, michael@0: ucnv_getNonSurrogateUnicodeSet michael@0: }; michael@0: michael@0: static const UConverterStaticData _UTF16StaticData = { michael@0: sizeof(UConverterStaticData), michael@0: "UTF-16", michael@0: 1204, /* CCSID for BOM sensitive UTF-16 */ michael@0: UCNV_IBM, UCNV_UTF16, 2, 2, michael@0: #if U_IS_BIG_ENDIAN michael@0: { 0xff, 0xfd, 0, 0 }, 2, michael@0: #else michael@0: { 0xfd, 0xff, 0, 0 }, 2, michael@0: #endif michael@0: FALSE, FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: const UConverterSharedData _UTF16Data = { michael@0: sizeof(UConverterSharedData), ~((uint32_t) 0), michael@0: NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, michael@0: 0 michael@0: }; michael@0: michael@0: static const UConverterImpl _UTF16v2Impl = { michael@0: UCNV_UTF16, michael@0: michael@0: NULL, michael@0: NULL, michael@0: michael@0: _UTF16Open, michael@0: NULL, michael@0: _UTF16Reset, michael@0: michael@0: _UTF16ToUnicodeWithOffsets, michael@0: _UTF16ToUnicodeWithOffsets, michael@0: _UTF16BEFromUnicodeWithOffsets, michael@0: _UTF16BEFromUnicodeWithOffsets, michael@0: _UTF16GetNextUChar, michael@0: michael@0: NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ michael@0: _UTF16GetName, michael@0: NULL, michael@0: NULL, michael@0: ucnv_getNonSurrogateUnicodeSet michael@0: }; michael@0: michael@0: static const UConverterStaticData _UTF16v2StaticData = { michael@0: sizeof(UConverterStaticData), michael@0: "UTF-16,version=2", michael@0: 1204, /* CCSID for BOM sensitive UTF-16 */ michael@0: UCNV_IBM, UCNV_UTF16, 2, 2, michael@0: { 0xff, 0xfd, 0, 0 }, 2, michael@0: FALSE, FALSE, michael@0: 0, michael@0: 0, michael@0: { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ michael@0: }; michael@0: michael@0: static const UConverterSharedData _UTF16v2Data = { michael@0: sizeof(UConverterSharedData), ~((uint32_t) 0), michael@0: NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl, michael@0: 0 michael@0: }; michael@0: michael@0: #endif