The Tor Browser: comparison intl/icu/source/common/ucnvlat1.c

--1:000000000000
+:f6b0e684aa48
+/*
+**********************************************************************
+*   Copyright (C) 2000-2012, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  ucnvlat1.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2000feb07
+*   created by: Markus W. Scherer
+*/
+#include "unicode/utypes.h"
+#if !UCONFIG_NO_CONVERSION
+#include "unicode/ucnv.h"
+#include "unicode/uset.h"
+#include "unicode/utf8.h"
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+/* control optimizations according to the platform */
+#define LATIN1_UNROLL_FROM_UNICODE 1
+/* ISO 8859-1 --------------------------------------------------------------- */
+/*
+* ISO 8859-1 to UTF-16: a table-less and callback-less counterpart of
+* ucnv_MBCSSingleToBMPWithOffsets().
+*
+* Every input byte widens to exactly one UChar, so the number of units to
+* convert is the smaller of the bytes available and the UChars of room.
+* If there are more bytes than room, *pErrorCode is set to
+* U_BUFFER_OVERFLOW_ERROR. When pArgs->offsets is non-NULL, one source index
+* (counting from 0) is recorded per output unit.
+* On return pArgs->source and pArgs->target are advanced; pArgs->offsets is
+* advanced only when it was non-NULL.
+*/
+static void
+_Latin1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+                            UErrorCode *pErrorCode) {
+    const uint8_t *in=(const uint8_t *)pArgs->source;
+    UChar *out=pArgs->target;
+    int32_t *offs=pArgs->offsets;
+    int32_t room=(int32_t)(pArgs->targetLimit-pArgs->target);
+    int32_t avail=(int32_t)((const uint8_t *)pArgs->sourceLimit-in);
+    int32_t total, blocks, tail, i;
+    int32_t srcIndex=0;
+
+    /* one counter suffices because the mapping is 1:1 */
+    if(avail>room) {
+        /* target will be full */
+        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+        total=room;
+    } else {
+        total=avail;
+    }
+
+    /* split the work into whole groups of 8 plus a remainder */
+    if(total>=8) {
+        blocks=total>>3;
+        tail=total&0x7;
+    } else {
+        blocks=0;
+        tail=total;
+    }
+
+    /* groups of 8, kept unrolled for speed and improved pipelining */
+    for(i=0; i<blocks; ++i) {
+        out[0]=in[0];
+        out[1]=in[1];
+        out[2]=in[2];
+        out[3]=in[3];
+        out[4]=in[4];
+        out[5]=in[5];
+        out[6]=in[6];
+        out[7]=in[7];
+        out+=8;
+        in+=8;
+    }
+    if(offs!=NULL) {
+        for(i=0; i<blocks; ++i) {
+            offs[0]=srcIndex++;
+            offs[1]=srcIndex++;
+            offs[2]=srcIndex++;
+            offs[3]=srcIndex++;
+            offs[4]=srcIndex++;
+            offs[5]=srcIndex++;
+            offs[6]=srcIndex++;
+            offs[7]=srcIndex++;
+            offs+=8;
+        }
+    }
+
+    /* remaining units, one at a time */
+    for(i=0; i<tail; ++i) {
+        *out++=*in++;
+    }
+
+    /* write back the updated pointers */
+    pArgs->source=(const char *)in;
+    pArgs->target=out;
+
+    /* offsets for the remaining units */
+    if(offs!=NULL) {
+        for(i=0; i<tail; ++i) {
+            *offs++=srcIndex++;
+        }
+        pArgs->offsets=offs;
+    }
+}
+/*
+* Table-less and callback-less counterpart of ucnv_MBCSSingleGetNextUChar().
+*
+* Consumes one byte from pArgs->source and returns it unchanged as the code
+* point. If no input is left, sets *pErrorCode to U_INDEX_OUTOFBOUNDS_ERROR
+* and returns 0xffff without touching pArgs->source.
+*/
+static UChar32
+_Latin1GetNextUChar(UConverterToUnicodeArgs *pArgs,
+                    UErrorCode *pErrorCode) {
+    const uint8_t *next=(const uint8_t *)pArgs->source;
+    const uint8_t *limit=(const uint8_t *)pArgs->sourceLimit;
+
+    if(next>=limit) {
+        /* no output because of empty input */
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
+
+    pArgs->source=(const char *)(next+1);
+    return *next;
+}
+/*
+* This is a table-less version of ucnv_MBCSSingleFromBMPWithOffsets().
+*
+* Converts UTF-16 code units from pArgs->source into single bytes at
+* pArgs->target. A code unit up to "max" is narrowed to one byte; max is 0xff
+* when the converter's sharedData is _Latin1Data and 0x7f otherwise (this
+* function is also installed in the US-ASCII implementation table).
+*
+* On return pArgs->source, pArgs->target and pArgs->offsets are written back.
+* *pErrorCode is set to
+*   - U_INVALID_CHAR_FOUND for a non-surrogate above max or for a complete
+*     surrogate pair (unmappable),
+*   - U_ILLEGAL_CHAR_FOUND for an unpaired surrogate,
+*   - U_BUFFER_OVERFLOW_ERROR if no other error occurred, input remains and
+*     the target is full.
+* The offending code point is stored in cnv->fromUChar32. A lead surrogate
+* at the very end of the input is stored there too, without an error, so
+* that the next call can resume at getTrail.
+*/
+static void
+_Latin1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+UConverter *cnv;
+const UChar *source, *sourceLimit;
+uint8_t *target, *oldTarget;
+int32_t targetCapacity, length;
+int32_t *offsets;
+UChar32 cp;
+UChar c, max;
+int32_t sourceIndex;
+/* set up the local pointers */
+cnv=pArgs->converter;
+source=pArgs->source;
+sourceLimit=pArgs->sourceLimit;
+target=oldTarget=(uint8_t *)pArgs->target; /* oldTarget marks where offsets still have to be written from */
+targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
+offsets=pArgs->offsets;
+if(cnv->sharedData==&_Latin1Data) {
+max=0xff; /* Latin-1 */
+} else {
+max=0x7f; /* US-ASCII */
+}
+/* get the converter state from UConverter */
+cp=cnv->fromUChar32;
+/* sourceIndex=-1 if the current character began in the previous buffer */
+sourceIndex= cp==0 ? 0 : -1;
+/*
+* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
+* for the minimum of the sourceLength and targetCapacity
+*/
+length=(int32_t)(sourceLimit-source);
+if(length<targetCapacity) {
+targetCapacity=length;
+}
+/*
+* conversion loop
+* A pending code point from the previous call is handled first by jumping
+* straight into the surrogate handling below.
+*/
+if(cp!=0 && targetCapacity>0) {
+goto getTrail;
+}
+#if LATIN1_UNROLL_FROM_UNICODE
+/* unroll the loop with the most common case */
+if(targetCapacity>=16) {
+int32_t count, loops;
+UChar u, oredChars;
+loops=count=targetCapacity>>4;
+do {
+/*
+* Copy 16 units unconditionally while OR-ing them together; the
+* accumulated value is validated once per group below.
+*/
+oredChars=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+oredChars|=u=*source++;
+*target++=(uint8_t)u;
+/*
+* were all 16 entries really valid?
+* max is 0x7f or 0xff, so the OR exceeds max exactly when at least one
+* unit does.
+*/
+if(oredChars>max) {
+/* no, return to the first of these 16; the loop below redoes them one by one */
+source-=16;
+target-=16;
+break;
+}
+} while(--count>0);
+count=loops-count; /* number of groups of 16 that were fully converted */
+targetCapacity-=16*count;
+if(offsets!=NULL) {
+oldTarget+=16*count; /* these bytes get their offsets right here, not in the final offsets loop */
+while(count>0) {
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+*offsets++=sourceIndex++;
+--count;
+}
+}
+}
+#endif
+/* conversion loop */
+c=0; /* keeps the c>max test below false when the loop body never runs */
+while(targetCapacity>0 && (c=*source++)<=max) { /* note: the unit that fails the test has already been consumed */
+/* convert the Unicode code point */
+*target++=(uint8_t)c;
+--targetCapacity;
+}
+if(c>max) {
+cp=c;
+if(!U_IS_SURROGATE(cp)) {
+/* callback(unassigned) */
+} else if(U_IS_SURROGATE_LEAD(cp)) {
+getTrail:
+if(source<sourceLimit) {
+/* test the following code unit */
+UChar trail=*source;
+if(U16_IS_TRAIL(trail)) {
+++source;
+cp=U16_GET_SUPPLEMENTARY(cp, trail);
+/* this codepage does not map supplementary code points */
+/* callback(unassigned) */
+} else {
+/* this is an unmatched lead code unit (1st surrogate) */
+/* callback(illegal) */
+}
+} else {
+/* no more input */
+cnv->fromUChar32=cp; /* keep the lead surrogate for the next call; no error is set */
+goto noMoreInput;
+}
+} else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+}
+*pErrorCode= U_IS_SURROGATE(cp) ? U_ILLEGAL_CHAR_FOUND : U_INVALID_CHAR_FOUND; /* cp is still a surrogate only if it was unpaired */
+cnv->fromUChar32=cp;
+}
+noMoreInput:
+/* set offsets since the start */
+if(offsets!=NULL) {
+size_t count=target-oldTarget; /* bytes written that do not have an offset yet */
+while(count>0) {
+*offsets++=sourceIndex++;
+--count;
+}
+}
+if(U_SUCCESS(*pErrorCode) && source<sourceLimit && target>=(uint8_t *)pArgs->targetLimit) {
+/* target is full */
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+}
+/* write back the updated pointers */
+pArgs->source=source;
+pArgs->target=(char *)target;
+pArgs->offsets=offsets;
+}
+/*
+* Convert UTF-8 to Latin-1. Adapted from ucnv_SBCSFromUTF8().
+*
+* Fast path that copies ASCII bytes and folds the 2-byte UTF-8 sequences
+* with lead byte 0xc2 or 0xc3 (U+0080..U+00FF) into single Latin-1 bytes,
+* reading from pToUArgs->source and writing to pFromUArgs->target.
+*
+* *pErrorCode is set to
+*   - U_USING_DEFAULT_WARNING when any other input is met; inside the main
+*     loop the source is written back pointing at the byte that could not be
+*     handled,
+*   - U_BUFFER_OVERFLOW_ERROR when input remains but the target is full.
+* A lead byte left alone at the end of the input is consumed and recorded
+* in the UTF-8 converter (toUnicodeStatus, toUBytes[0], toULength, mode) so
+* that the next call can complete it.
+*/
+static void
+ucnv_Latin1FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+UConverterToUnicodeArgs *pToUArgs,
+UErrorCode *pErrorCode) {
+UConverter *utf8;
+const uint8_t *source, *sourceLimit;
+uint8_t *target;
+int32_t targetCapacity;
+UChar32 c;
+uint8_t b, t1;
+/* set up the local pointers */
+utf8=pToUArgs->converter;
+source=(uint8_t *)pToUArgs->source;
+sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
+target=(uint8_t *)pFromUArgs->target;
+targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
+/* get the converter state from the UTF-8 UConverter */
+c=(UChar32)utf8->toUnicodeStatus; /* non-zero: a lead byte stored at the end of this function by an earlier call */
+if(c!=0 && source<sourceLimit) {
+if(targetCapacity==0) {
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+return; /* nothing was consumed or written, so no pointers need writing back */
+} else if(c>=0xc2 && c<=0xc3 && (t1=(uint8_t)(*source-0x80)) <= 0x3f) { /* t1<=0x3f means *source is a trail byte 0x80..0xbf */
+++source;
+*target++=(uint8_t)(((c&3)<<6)|t1); /* low 2 bits of the lead byte + 6 bits of the trail byte */
+--targetCapacity;
+utf8->toUnicodeStatus=0;
+utf8->toULength=0;
+} else {
+/* complicated, illegal or unmappable input: fall back to the pivoting implementation */
+*pErrorCode=U_USING_DEFAULT_WARNING;
+return;
+}
+}
+/*
+* Make sure that the last byte sequence before sourceLimit is complete
+* or runs into a lead byte.
+* In the conversion loop compare source with sourceLimit only once
+* per multi-byte character.
+* For Latin-1, adjust sourceLimit only for 1 trail byte because
+* the conversion loop handles at most 2-byte sequences.
+*/
+if(source<sourceLimit && U8_IS_LEAD(*(sourceLimit-1))) {
+--sourceLimit;
+}
+/* conversion loop */
+while(source<sourceLimit) {
+if(targetCapacity>0) {
+b=*source++;
+if((int8_t)b>=0) { /* true for 0x00..0x7f */
+/* convert ASCII */
+*target++=(uint8_t)b;
+--targetCapacity;
+} else if( /* handle U+0080..U+00FF inline */
+b>=0xc2 && b<=0xc3 &&
+(t1=(uint8_t)(*source-0x80)) <= 0x3f
+) {
+++source;
+*target++=(uint8_t)(((b&3)<<6)|t1);
+--targetCapacity;
+} else {
+/* complicated, illegal or unmappable input: fall back to the pivoting implementation */
+pToUArgs->source=(char *)(source-1); /* un-consume b so the fallback sees it */
+pFromUArgs->target=(char *)target;
+*pErrorCode=U_USING_DEFAULT_WARNING;
+return;
+}
+} else {
+/* target is full */
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+break;
+}
+}
+/*
+* The sourceLimit may have been adjusted before the conversion loop
+* to stop before a truncated sequence.
+* If so, then collect the truncated sequence now.
+* For Latin-1, there is at most exactly one lead byte because of the
+* smaller sourceLimit adjustment logic.
+*/
+if(U_SUCCESS(*pErrorCode) && source<(sourceLimit=(uint8_t *)pToUArgs->sourceLimit)) { /* re-reads the caller's unadjusted limit */
+utf8->toUnicodeStatus=utf8->toUBytes[0]=b=*source++;
+utf8->toULength=1;
+utf8->mode=U8_COUNT_TRAIL_BYTES(b)+1;
+}
+/* write back the updated pointers */
+pToUArgs->source=(char *)source;
+pFromUArgs->target=(char *)target;
+}
+/*
+* Reports the code points this converter handles by adding the range
+* U+0000..U+00FF through the caller-supplied USetAdder.
+* cnv, which and pErrorCode are not consulted.
+*/
+static void
+_Latin1GetUnicodeSet(const UConverter *cnv,
+                     const USetAdder *sa,
+                     UConverterUnicodeSet which,
+                     UErrorCode *pErrorCode) {
+    (void)cnv;
+    (void)which;
+    (void)pErrorCode;
+
+    sa->addRange(sa->set, 0x00, 0xff);
+}
+static const UConverterImpl _Latin1Impl={ /*
+* Implementation table for the ISO-8859-1 converter.
+* NOTE(review): the slot names in the trailing comments are assumed from the
+* UConverterImpl declaration (ucnv_cnv.h), which is not visible here - confirm.
+*/
+UCNV_LATIN_1, /* type */
+NULL, /* load */
+NULL, /* unload */
+NULL, /* open */
+NULL, /* close */
+NULL, /* reset */
+_Latin1ToUnicodeWithOffsets, /* toUnicode: the same function fills this and the next slot */
+_Latin1ToUnicodeWithOffsets, /* toUnicodeWithOffsets */
+_Latin1FromUnicodeWithOffsets, /* fromUnicode: the same function fills this and the next slot */
+_Latin1FromUnicodeWithOffsets, /* fromUnicodeWithOffsets */
+_Latin1GetNextUChar, /* getNextUChar */
+NULL, /* getStarters */
+NULL, /* getName */
+NULL, /* writeSub */
+NULL, /* safeClone */
+_Latin1GetUnicodeSet, /* getUnicodeSet */
+NULL, /* toUTF8 */
+ucnv_Latin1FromUTF8 /* fromUTF8 */
+};
+static const UConverterStaticData _Latin1StaticData={ /*
+* Static description of the ISO-8859-1 converter.
+* NOTE(review): the field names in the trailing comments are assumed from the
+* UConverterStaticData declaration (ucnv_bld.h), not visible here - confirm.
+*/
+sizeof(UConverterStaticData), /* structSize */
+"ISO-8859-1", /* name */
+819, UCNV_IBM, UCNV_LATIN_1, 1, 1, /* codepage, platform, conversionType, minBytesPerChar, maxBytesPerChar */
+{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* subChar, subCharLen, hasToUnicodeFallback, hasFromUnicodeFallback */
+0, /* unicodeMask */
+0, /* subChar1 */
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _Latin1Data={ /*
+* Shared data object for ISO-8859-1; ties together _Latin1StaticData and
+* _Latin1Impl. _Latin1FromUnicodeWithOffsets compares cnv->sharedData with
+* the address of this object to choose between the Latin-1 and US-ASCII limits.
+* NOTE(review): the field names in the trailing comments are assumed from the
+* UConverterSharedData declaration (ucnv_bld.h), not visible here - confirm.
+*/
+sizeof(UConverterSharedData), ~((uint32_t) 0), /* structSize, referenceCounter (all bits set) */
+NULL, NULL, &_Latin1StaticData, FALSE, &_Latin1Impl, /* dataMemory, table, staticData, sharedDataCached, impl */
+0 /* toUnicodeStatus */
+};
+/* US-ASCII ----------------------------------------------------------------- */
+/*
+* This is a table-less version of ucnv_MBCSSingleToBMPWithOffsets().
+*
+* Converts US-ASCII bytes from pArgs->source to UChars at pArgs->target.
+* Bytes 0x00..0x7f are copied 1:1. The first byte above 0x7f stops the
+* conversion: it is consumed, stored in cnv->toUBytes[0] with toULength=1,
+* and *pErrorCode is set to U_ILLEGAL_CHAR_FOUND. Otherwise, if input
+* remains but the target is full, *pErrorCode is set to
+* U_BUFFER_OVERFLOW_ERROR.
+* When pArgs->offsets is non-NULL, one source index is written per output
+* unit. pArgs->source, pArgs->target and pArgs->offsets are written back.
+*/
+static void
+_ASCIIToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+const uint8_t *source, *sourceLimit;
+UChar *target, *oldTarget;
+int32_t targetCapacity, length;
+int32_t *offsets;
+int32_t sourceIndex;
+uint8_t c;
+/* set up the local pointers */
+source=(const uint8_t *)pArgs->source;
+sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+target=oldTarget=pArgs->target; /* oldTarget marks where offsets still have to be written from */
+targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
+offsets=pArgs->offsets;
+/* offsets are counted from the start of this buffer */
+sourceIndex=0;
+/*
+* since the conversion here is 1:1 UChar:uint8_t, we need only one counter
+* for the minimum of the sourceLength and targetCapacity
+*/
+length=(int32_t)(sourceLimit-source);
+if(length<targetCapacity) {
+targetCapacity=length;
+}
+if(targetCapacity>=8) {
+/* This loop is unrolled for speed and improved pipelining. */
+int32_t count, loops;
+UChar oredChars;
+loops=count=targetCapacity>>3;
+do {
+oredChars=target[0]=source[0];
+oredChars|=target[1]=source[1];
+oredChars|=target[2]=source[2];
+oredChars|=target[3]=source[3];
+oredChars|=target[4]=source[4];
+oredChars|=target[5]=source[5];
+oredChars|=target[6]=source[6];
+oredChars|=target[7]=source[7];
+/* were all 8 entries really valid? */
+if(oredChars>0x7f) {
+/*
+* no: source and target have not been advanced yet, so they still point
+* at the first of these 8; the loop below redoes them one by one
+*/
+break;
+}
+source+=8;
+target+=8;
+} while(--count>0);
+count=loops-count; /* number of groups of 8 that were fully converted */
+targetCapacity-=count*8;
+if(offsets!=NULL) {
+oldTarget+=count*8; /* these units get their offsets right here, not in the final offsets loop */
+while(count>0) {
+offsets[0]=sourceIndex++;
+offsets[1]=sourceIndex++;
+offsets[2]=sourceIndex++;
+offsets[3]=sourceIndex++;
+offsets[4]=sourceIndex++;
+offsets[5]=sourceIndex++;
+offsets[6]=sourceIndex++;
+offsets[7]=sourceIndex++;
+offsets+=8;
+--count;
+}
+}
+}
+/* conversion loop */
+c=0; /* keeps the c>0x7f test below false when the loop body never runs */
+while(targetCapacity>0 && (c=*source++)<=0x7f) { /* note: the byte that fails the test has already been consumed */
+*target++=c;
+--targetCapacity;
+}
+if(c>0x7f) {
+/* callback(illegal); copy the current bytes to toUBytes[] */
+UConverter *cnv=pArgs->converter;
+cnv->toUBytes[0]=c;
+cnv->toULength=1;
+*pErrorCode=U_ILLEGAL_CHAR_FOUND;
+} else if(source<sourceLimit && target>=pArgs->targetLimit) {
+/* target is full */
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+}
+/* set offsets since the start */
+if(offsets!=NULL) {
+size_t count=target-oldTarget; /* units written that do not have an offset yet */
+while(count>0) {
+*offsets++=sourceIndex++;
+--count;
+}
+}
+/* write back the updated pointers */
+pArgs->source=(const char *)source;
+pArgs->target=target;
+pArgs->offsets=offsets;
+}
+/*
+* Table-less counterpart of ucnv_MBCSSingleGetNextUChar() for US-ASCII.
+*
+* Consumes one byte from pArgs->source. A byte up to 0x7f is returned as the
+* code point. A larger byte is recorded in the converter (toUBytes[0],
+* toULength=1), *pErrorCode is set to U_ILLEGAL_CHAR_FOUND and 0xffff is
+* returned. If no input is left, *pErrorCode is set to
+* U_INDEX_OUTOFBOUNDS_ERROR and 0xffff is returned without consuming anything.
+*/
+static UChar32
+_ASCIIGetNextUChar(UConverterToUnicodeArgs *pArgs,
+                   UErrorCode *pErrorCode) {
+    const uint8_t *cursor=(const uint8_t *)pArgs->source;
+    UConverter *cnv;
+    uint8_t byte;
+
+    if(cursor>=(const uint8_t *)pArgs->sourceLimit) {
+        /* no output because of empty input */
+        *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+        return 0xffff;
+    }
+
+    /* the byte is consumed whether or not it turns out to be valid */
+    byte=*cursor;
+    pArgs->source=(const char *)(cursor+1);
+    if(byte<=0x7f) {
+        return byte;
+    }
+
+    /* not ASCII: hand the offending byte to the converter for error reporting */
+    cnv=pArgs->converter;
+    cnv->toUBytes[0]=byte;
+    cnv->toULength=1;
+    *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+    return 0xffff;
+}
+/*
+* "Convert" UTF-8 to US-ASCII: Validate and copy.
+*
+* Copies bytes 0x00..0x7f from pToUArgs->source to pFromUArgs->target.
+* Sets *pErrorCode to U_USING_DEFAULT_WARNING, leaving the remaining work to
+* the standard (pivoting) converter, when
+*   - the UTF-8 converter carries a partial character (toUnicodeStatus!=0;
+*     nothing is consumed in that case), or
+*   - a byte above 0x7f is met (the source is left pointing at that byte).
+* Otherwise sets U_BUFFER_OVERFLOW_ERROR if input remains but the target is
+* full. pToUArgs->source and pFromUArgs->target are written back.
+*/
+static void
+ucnv_ASCIIFromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
+UConverterToUnicodeArgs *pToUArgs,
+UErrorCode *pErrorCode) {
+const uint8_t *source, *sourceLimit;
+uint8_t *target;
+int32_t targetCapacity, length;
+uint8_t c;
+if(pToUArgs->converter->toUnicodeStatus!=0) {
+/* no handling of partial UTF-8 characters here, fall back to pivoting */
+*pErrorCode=U_USING_DEFAULT_WARNING;
+return;
+}
+/* set up the local pointers */
+source=(const uint8_t *)pToUArgs->source;
+sourceLimit=(const uint8_t *)pToUArgs->sourceLimit;
+target=(uint8_t *)pFromUArgs->target;
+targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
+/*
+* since the conversion here is 1:1 uint8_t:uint8_t, we need only one counter
+* for the minimum of the sourceLength and targetCapacity
+*/
+length=(int32_t)(sourceLimit-source);
+if(length<targetCapacity) {
+targetCapacity=length;
+}
+/* unroll the loop with the most common case */
+if(targetCapacity>=16) {
+int32_t count, loops;
+uint8_t oredChars;
+loops=count=targetCapacity>>4;
+do {
+/*
+* Copy 16 bytes unconditionally while OR-ing them together; the
+* accumulated value is validated once per group below.
+*/
+oredChars=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+oredChars|=*target++=*source++;
+/* were all 16 entries really valid? */
+if(oredChars>0x7f) {
+/* no, return to the first of these 16; the loop below redoes them one by one */
+source-=16;
+target-=16;
+break;
+}
+} while(--count>0);
+count=loops-count; /* number of groups of 16 that were fully copied */
+targetCapacity-=16*count;
+}
+/* conversion loop */
+c=0; /* keeps the c>0x7f test below false when the loop body never runs */
+while(targetCapacity>0 && (c=*source)<=0x7f) { /* peek only: a non-ASCII byte is not consumed */
+++source;
+*target++=c;
+--targetCapacity;
+}
+if(c>0x7f) {
+/* non-ASCII character, handle in standard converter */
+*pErrorCode=U_USING_DEFAULT_WARNING;
+} else if(source<sourceLimit && target>=(const uint8_t *)pFromUArgs->targetLimit) {
+/* target is full */
+*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
+}
+/* write back the updated pointers */
+pToUArgs->source=(const char *)source;
+pFromUArgs->target=(char *)target;
+}
+/*
+* Reports the code points this converter handles by adding the range
+* U+0000..U+007F through the caller-supplied USetAdder.
+* cnv, which and pErrorCode are not consulted.
+*/
+static void
+_ASCIIGetUnicodeSet(const UConverter *cnv,
+                    const USetAdder *sa,
+                    UConverterUnicodeSet which,
+                    UErrorCode *pErrorCode) {
+    (void)cnv;
+    (void)which;
+    (void)pErrorCode;
+
+    sa->addRange(sa->set, 0x00, 0x7f);
+}
+static const UConverterImpl _ASCIIImpl={ /*
+* Implementation table for the US-ASCII converter. The from-Unicode slots
+* reuse _Latin1FromUnicodeWithOffsets, which limits itself to 0x7f whenever
+* the converter's sharedData is not _Latin1Data.
+* NOTE(review): the slot names in the trailing comments are assumed from the
+* UConverterImpl declaration (ucnv_cnv.h), which is not visible here - confirm.
+*/
+UCNV_US_ASCII, /* type */
+NULL, /* load */
+NULL, /* unload */
+NULL, /* open */
+NULL, /* close */
+NULL, /* reset */
+_ASCIIToUnicodeWithOffsets, /* toUnicode: the same function fills this and the next slot */
+_ASCIIToUnicodeWithOffsets, /* toUnicodeWithOffsets */
+_Latin1FromUnicodeWithOffsets, /* fromUnicode: shared with the Latin-1 table */
+_Latin1FromUnicodeWithOffsets, /* fromUnicodeWithOffsets */
+_ASCIIGetNextUChar, /* getNextUChar */
+NULL, /* getStarters */
+NULL, /* getName */
+NULL, /* writeSub */
+NULL, /* safeClone */
+_ASCIIGetUnicodeSet, /* getUnicodeSet */
+NULL, /* toUTF8 */
+ucnv_ASCIIFromUTF8 /* fromUTF8 */
+};
+static const UConverterStaticData _ASCIIStaticData={ /*
+* Static description of the US-ASCII converter.
+* NOTE(review): the field names in the trailing comments are assumed from the
+* UConverterStaticData declaration (ucnv_bld.h), not visible here - confirm.
+*/
+sizeof(UConverterStaticData), /* structSize */
+"US-ASCII", /* name */
+367, UCNV_IBM, UCNV_US_ASCII, 1, 1, /* codepage, platform, conversionType, minBytesPerChar, maxBytesPerChar */
+{ 0x1a, 0, 0, 0 }, 1, FALSE, FALSE, /* subChar, subCharLen, hasToUnicodeFallback, hasFromUnicodeFallback */
+0, /* unicodeMask */
+0, /* subChar1 */
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _ASCIIData={ /*
+* Shared data object for US-ASCII; ties together _ASCIIStaticData and
+* _ASCIIImpl.
+* NOTE(review): the field names in the trailing comments are assumed from the
+* UConverterSharedData declaration (ucnv_bld.h), not visible here - confirm.
+*/
+sizeof(UConverterSharedData), ~((uint32_t) 0), /* structSize, referenceCounter (all bits set) */
+NULL, NULL, &_ASCIIStaticData, FALSE, &_ASCIIImpl, /* dataMemory, table, staticData, sharedDataCached, impl */
+0 /* toUnicodeStatus */
+};
+#endif

The Tor Browser / file comparison

comparison: intl/icu/source/common/ucnvlat1.c

intl/icu/source/common/ucnvlat1.c