The Tor Browser: comparison intl/icu/source/common/ucnv

--1:000000000000
+:55b01d951aa7
+/*
+**********************************************************************
+*   Copyright (C) 2002-2011, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+**********************************************************************
+*   file name:  ucnv_u32.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 2002jul01
+*   created by: Markus W. Scherer
+*
+*   UTF-32 converter implementation. Used to be in ucnv_utf.c.
+*/
+#include "unicode/utypes.h"
+#if !UCONFIG_NO_CONVERSION
+#include "unicode/ucnv.h"
+#include "unicode/utf.h"
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+#include "cmemory.h"
+#define MAXIMUM_UCS2            0x0000FFFF
+#define MAXIMUM_UTF             0x0010FFFF
+#define HALF_SHIFT              10
+#define HALF_BASE               0x0010000
+#define HALF_MASK               0x3FF
+#define SURROGATE_HIGH_START    0xD800
+#define SURROGATE_LOW_START     0xDC00
+/* -SURROGATE_LOW_START + HALF_BASE */
+#define SURROGATE_LOW_BASE      9216
+enum {
+UCNV_NEED_TO_WRITE_BOM=1
+};
+/* UTF-32BE ----------------------------------------------------------------- */
+static void
+T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
+UErrorCode * err)
+{
+const unsigned char *mySource = (unsigned char *) args->source;
+UChar *myTarget = args->target;
+const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
+const UChar *targetLimit = args->targetLimit;
+unsigned char *toUBytes = args->converter->toUBytes;
+uint32_t ch, i;
+/* Restore state of current sequence */
+if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+i = args->converter->toULength;       /* restore # of bytes consumed */
+args->converter->toULength = 0;
+ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
+args->converter->toUnicodeStatus = 0;
+goto morebytes;
+}
+while (mySource < sourceLimit && myTarget < targetLimit) {
+i = 0;
+ch = 0;
+morebytes:
+while (i < sizeof(uint32_t)) {
+if (mySource < sourceLimit) {
+ch = (ch << 8) | (uint8_t)(*mySource);
+toUBytes[i++] = (char) *(mySource++);
+}
+else {
+/* stores a partially calculated target*/
+/* + 1 to make 0 a valid character */
+args->converter->toUnicodeStatus = ch + 1;
+args->converter->toULength = (int8_t) i;
+goto donefornow;
+}
+}
+if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
+/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+if (ch <= MAXIMUM_UCS2)
+{
+/* fits in 16 bits */
+*(myTarget++) = (UChar) ch;
+}
+else {
+/* write out the surrogates */
+*(myTarget++) = U16_LEAD(ch);
+ch = U16_TRAIL(ch);
+if (myTarget < targetLimit) {
+*(myTarget++) = (UChar)ch;
+}
+else {
+/* Put in overflow buffer (not handled here) */
+args->converter->UCharErrorBuffer[0] = (UChar) ch;
+args->converter->UCharErrorBufferLength = 1;
+*err = U_BUFFER_OVERFLOW_ERROR;
+break;
+}
+}
+}
+else {
+args->converter->toULength = (int8_t)i;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+donefornow:
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
+/* End of target buffer */
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = myTarget;
+args->source = (const char *) mySource;
+}
+static void
+T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
+UErrorCode * err)
+{
+const unsigned char *mySource = (unsigned char *) args->source;
+UChar *myTarget = args->target;
+int32_t *myOffsets = args->offsets;
+const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
+const UChar *targetLimit = args->targetLimit;
+unsigned char *toUBytes = args->converter->toUBytes;
+uint32_t ch, i;
+int32_t offsetNum = 0;
+/* Restore state of current sequence */
+if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
+i = args->converter->toULength;       /* restore # of bytes consumed */
+args->converter->toULength = 0;
+ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
+args->converter->toUnicodeStatus = 0;
+goto morebytes;
+}
+while (mySource < sourceLimit && myTarget < targetLimit) {
+i = 0;
+ch = 0;
+morebytes:
+while (i < sizeof(uint32_t)) {
+if (mySource < sourceLimit) {
+ch = (ch << 8) | (uint8_t)(*mySource);
+toUBytes[i++] = (char) *(mySource++);
+}
+else {
+/* stores a partially calculated target*/
+/* + 1 to make 0 a valid character */
+args->converter->toUnicodeStatus = ch + 1;
+args->converter->toULength = (int8_t) i;
+goto donefornow;
+}
+}
+if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
+/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+if (ch <= MAXIMUM_UCS2) {
+/* fits in 16 bits */
+*(myTarget++) = (UChar) ch;
+*(myOffsets++) = offsetNum;
+}
+else {
+/* write out the surrogates */
+*(myTarget++) = U16_LEAD(ch);
+*myOffsets++ = offsetNum;
+ch = U16_TRAIL(ch);
+if (myTarget < targetLimit)
+{
+*(myTarget++) = (UChar)ch;
+*(myOffsets++) = offsetNum;
+}
+else {
+/* Put in overflow buffer (not handled here) */
+args->converter->UCharErrorBuffer[0] = (UChar) ch;
+args->converter->UCharErrorBufferLength = 1;
+*err = U_BUFFER_OVERFLOW_ERROR;
+break;
+}
+}
+}
+else {
+args->converter->toULength = (int8_t)i;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+offsetNum += i;
+}
+donefornow:
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
+{
+/* End of target buffer */
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = myTarget;
+args->source = (const char *) mySource;
+args->offsets = myOffsets;
+}
+static void
+T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
+UErrorCode * err)
+{
+const UChar *mySource = args->source;
+unsigned char *myTarget;
+const UChar *sourceLimit = args->sourceLimit;
+const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
+UChar32 ch, ch2;
+unsigned int indexToWrite;
+unsigned char temp[sizeof(uint32_t)];
+if(mySource >= sourceLimit) {
+/* no input, nothing to do */
+return;
+}
+/* write the BOM if necessary */
+if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+ucnv_fromUWriteBytes(args->converter,
+bom, 4,
+&args->target, args->targetLimit,
+&args->offsets, -1,
+err);
+args->converter->fromUnicodeStatus=0;
+}
+myTarget = (unsigned char *) args->target;
+temp[0] = 0;
+if (args->converter->fromUChar32) {
+ch = args->converter->fromUChar32;
+args->converter->fromUChar32 = 0;
+goto lowsurogate;
+}
+while (mySource < sourceLimit && myTarget < targetLimit) {
+ch = *(mySource++);
+if (U_IS_SURROGATE(ch)) {
+if (U_IS_LEAD(ch)) {
+lowsurogate:
+if (mySource < sourceLimit) {
+ch2 = *mySource;
+if (U_IS_TRAIL(ch2)) {
+ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+mySource++;
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+else {
+/* ran out of source */
+args->converter->fromUChar32 = ch;
+if (args->flush) {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+*err = U_ILLEGAL_CHAR_FOUND;
+}
+break;
+}
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
+temp[1] = (uint8_t) (ch >> 16 & 0x1F);
+temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
+temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
+for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+if (myTarget < targetLimit) {
+*(myTarget++) = temp[indexToWrite];
+}
+else {
+args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+}
+}
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = (char *) myTarget;
+args->source = mySource;
+}
+static void
+T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
+UErrorCode * err)
+{
+const UChar *mySource = args->source;
+unsigned char *myTarget;
+int32_t *myOffsets;
+const UChar *sourceLimit = args->sourceLimit;
+const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
+UChar32 ch, ch2;
+int32_t offsetNum = 0;
+unsigned int indexToWrite;
+unsigned char temp[sizeof(uint32_t)];
+if(mySource >= sourceLimit) {
+/* no input, nothing to do */
+return;
+}
+/* write the BOM if necessary */
+if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
+ucnv_fromUWriteBytes(args->converter,
+bom, 4,
+&args->target, args->targetLimit,
+&args->offsets, -1,
+err);
+args->converter->fromUnicodeStatus=0;
+}
+myTarget = (unsigned char *) args->target;
+myOffsets = args->offsets;
+temp[0] = 0;
+if (args->converter->fromUChar32) {
+ch = args->converter->fromUChar32;
+args->converter->fromUChar32 = 0;
+goto lowsurogate;
+}
+while (mySource < sourceLimit && myTarget < targetLimit) {
+ch = *(mySource++);
+if (U_IS_SURROGATE(ch)) {
+if (U_IS_LEAD(ch)) {
+lowsurogate:
+if (mySource < sourceLimit) {
+ch2 = *mySource;
+if (U_IS_TRAIL(ch2)) {
+ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+mySource++;
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+else {
+/* ran out of source */
+args->converter->fromUChar32 = ch;
+if (args->flush) {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+*err = U_ILLEGAL_CHAR_FOUND;
+}
+break;
+}
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
+temp[1] = (uint8_t) (ch >> 16 & 0x1F);
+temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
+temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
+for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
+if (myTarget < targetLimit) {
+*(myTarget++) = temp[indexToWrite];
+*(myOffsets++) = offsetNum;
+}
+else {
+args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+}
+offsetNum = offsetNum + 1 + (temp[1] != 0);
+}
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = (char *) myTarget;
+args->source = mySource;
+args->offsets = myOffsets;
+}
+static UChar32
+T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
+UErrorCode* err)
+{
+const uint8_t *mySource;
+UChar32 myUChar;
+int32_t length;
+mySource = (const uint8_t *)args->source;
+if (mySource >= (const uint8_t *)args->sourceLimit)
+{
+/* no input */
+*err = U_INDEX_OUTOFBOUNDS_ERROR;
+return 0xffff;
+}
+length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+if (length < 4)
+{
+/* got a partial character */
+uprv_memcpy(args->converter->toUBytes, mySource, length);
+args->converter->toULength = (int8_t)length;
+args->source = (const char *)(mySource + length);
+*err = U_TRUNCATED_CHAR_FOUND;
+return 0xffff;
+}
+/* Don't even try to do a direct cast because the value may be on an odd address. */
+myUChar = ((UChar32)mySource[0] << 24)
+| ((UChar32)mySource[1] << 16)
+| ((UChar32)mySource[2] << 8)
+| ((UChar32)mySource[3]);
+args->source = (const char *)(mySource + 4);
+if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+return myUChar;
+}
+uprv_memcpy(args->converter->toUBytes, mySource, 4);
+args->converter->toULength = 4;
+*err = U_ILLEGAL_CHAR_FOUND;
+return 0xffff;
+}
+static const UConverterImpl _UTF32BEImpl = {
+UCNV_UTF32_BigEndian,
+NULL,
+NULL,
+NULL,
+NULL,
+NULL,
+T_UConverter_toUnicode_UTF32_BE,
+T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
+T_UConverter_fromUnicode_UTF32_BE,
+T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
+T_UConverter_getNextUChar_UTF32_BE,
+NULL,
+NULL,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
+static const UConverterStaticData _UTF32BEStaticData = {
+sizeof(UConverterStaticData),
+"UTF-32BE",
+1232,
+UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
+{ 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF32BEData = {
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
+0
+};
+/* UTF-32LE ---------------------------------------------------------- */
+static void
+T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
+UErrorCode * err)
+{
+const unsigned char *mySource = (unsigned char *) args->source;
+UChar *myTarget = args->target;
+const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
+const UChar *targetLimit = args->targetLimit;
+unsigned char *toUBytes = args->converter->toUBytes;
+uint32_t ch, i;
+/* Restore state of current sequence */
+if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+{
+i = args->converter->toULength;       /* restore # of bytes consumed */
+args->converter->toULength = 0;
+/* Stores the previously calculated ch from a previous call*/
+ch = args->converter->toUnicodeStatus - 1;
+args->converter->toUnicodeStatus = 0;
+goto morebytes;
+}
+while (mySource < sourceLimit && myTarget < targetLimit)
+{
+i = 0;
+ch = 0;
+morebytes:
+while (i < sizeof(uint32_t))
+{
+if (mySource < sourceLimit)
+{
+ch |= ((uint8_t)(*mySource)) << (i * 8);
+toUBytes[i++] = (char) *(mySource++);
+}
+else
+{
+/* stores a partially calculated target*/
+/* + 1 to make 0 a valid character */
+args->converter->toUnicodeStatus = ch + 1;
+args->converter->toULength = (int8_t) i;
+goto donefornow;
+}
+}
+if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
+/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+if (ch <= MAXIMUM_UCS2) {
+/* fits in 16 bits */
+*(myTarget++) = (UChar) ch;
+}
+else {
+/* write out the surrogates */
+*(myTarget++) = U16_LEAD(ch);
+ch = U16_TRAIL(ch);
+if (myTarget < targetLimit) {
+*(myTarget++) = (UChar)ch;
+}
+else {
+/* Put in overflow buffer (not handled here) */
+args->converter->UCharErrorBuffer[0] = (UChar) ch;
+args->converter->UCharErrorBufferLength = 1;
+*err = U_BUFFER_OVERFLOW_ERROR;
+break;
+}
+}
+}
+else {
+args->converter->toULength = (int8_t)i;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+donefornow:
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
+{
+/* End of target buffer */
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = myTarget;
+args->source = (const char *) mySource;
+}
+static void
+T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
+UErrorCode * err)
+{
+const unsigned char *mySource = (unsigned char *) args->source;
+UChar *myTarget = args->target;
+int32_t *myOffsets = args->offsets;
+const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
+const UChar *targetLimit = args->targetLimit;
+unsigned char *toUBytes = args->converter->toUBytes;
+uint32_t ch, i;
+int32_t offsetNum = 0;
+/* Restore state of current sequence */
+if (args->converter->toUnicodeStatus && myTarget < targetLimit)
+{
+i = args->converter->toULength;       /* restore # of bytes consumed */
+args->converter->toULength = 0;
+/* Stores the previously calculated ch from a previous call*/
+ch = args->converter->toUnicodeStatus - 1;
+args->converter->toUnicodeStatus = 0;
+goto morebytes;
+}
+while (mySource < sourceLimit && myTarget < targetLimit)
+{
+i = 0;
+ch = 0;
+morebytes:
+while (i < sizeof(uint32_t))
+{
+if (mySource < sourceLimit)
+{
+ch |= ((uint8_t)(*mySource)) << (i * 8);
+toUBytes[i++] = (char) *(mySource++);
+}
+else
+{
+/* stores a partially calculated target*/
+/* + 1 to make 0 a valid character */
+args->converter->toUnicodeStatus = ch + 1;
+args->converter->toULength = (int8_t) i;
+goto donefornow;
+}
+}
+if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
+{
+/* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
+if (ch <= MAXIMUM_UCS2)
+{
+/* fits in 16 bits */
+*(myTarget++) = (UChar) ch;
+*(myOffsets++) = offsetNum;
+}
+else {
+/* write out the surrogates */
+*(myTarget++) = U16_LEAD(ch);
+*(myOffsets++) = offsetNum;
+ch = U16_TRAIL(ch);
+if (myTarget < targetLimit)
+{
+*(myTarget++) = (UChar)ch;
+*(myOffsets++) = offsetNum;
+}
+else
+{
+/* Put in overflow buffer (not handled here) */
+args->converter->UCharErrorBuffer[0] = (UChar) ch;
+args->converter->UCharErrorBufferLength = 1;
+*err = U_BUFFER_OVERFLOW_ERROR;
+break;
+}
+}
+}
+else
+{
+args->converter->toULength = (int8_t)i;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+offsetNum += i;
+}
+donefornow:
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
+{
+/* End of target buffer */
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = myTarget;
+args->source = (const char *) mySource;
+args->offsets = myOffsets;
+}
+static void
+T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
+UErrorCode * err)
+{
+const UChar *mySource = args->source;
+unsigned char *myTarget;
+const UChar *sourceLimit = args->sourceLimit;
+const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
+UChar32 ch, ch2;
+unsigned int indexToWrite;
+unsigned char temp[sizeof(uint32_t)];
+if(mySource >= sourceLimit) {
+/* no input, nothing to do */
+return;
+}
+/* write the BOM if necessary */
+if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+ucnv_fromUWriteBytes(args->converter,
+bom, 4,
+&args->target, args->targetLimit,
+&args->offsets, -1,
+err);
+args->converter->fromUnicodeStatus=0;
+}
+myTarget = (unsigned char *) args->target;
+temp[3] = 0;
+if (args->converter->fromUChar32)
+{
+ch = args->converter->fromUChar32;
+args->converter->fromUChar32 = 0;
+goto lowsurogate;
+}
+while (mySource < sourceLimit && myTarget < targetLimit)
+{
+ch = *(mySource++);
+if (U16_IS_SURROGATE(ch)) {
+if (U16_IS_LEAD(ch))
+{
+lowsurogate:
+if (mySource < sourceLimit)
+{
+ch2 = *mySource;
+if (U16_IS_TRAIL(ch2)) {
+ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+mySource++;
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+else {
+/* ran out of source */
+args->converter->fromUChar32 = ch;
+if (args->flush) {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+*err = U_ILLEGAL_CHAR_FOUND;
+}
+break;
+}
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
+temp[2] = (uint8_t) (ch >> 16 & 0x1F);
+temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
+temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
+for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
+{
+if (myTarget < targetLimit)
+{
+*(myTarget++) = temp[indexToWrite];
+}
+else
+{
+args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+}
+}
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
+{
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = (char *) myTarget;
+args->source = mySource;
+}
+static void
+T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
+UErrorCode * err)
+{
+const UChar *mySource = args->source;
+unsigned char *myTarget;
+int32_t *myOffsets;
+const UChar *sourceLimit = args->sourceLimit;
+const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
+UChar32 ch, ch2;
+unsigned int indexToWrite;
+unsigned char temp[sizeof(uint32_t)];
+int32_t offsetNum = 0;
+if(mySource >= sourceLimit) {
+/* no input, nothing to do */
+return;
+}
+/* write the BOM if necessary */
+if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
+static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
+ucnv_fromUWriteBytes(args->converter,
+bom, 4,
+&args->target, args->targetLimit,
+&args->offsets, -1,
+err);
+args->converter->fromUnicodeStatus=0;
+}
+myTarget = (unsigned char *) args->target;
+myOffsets = args->offsets;
+temp[3] = 0;
+if (args->converter->fromUChar32)
+{
+ch = args->converter->fromUChar32;
+args->converter->fromUChar32 = 0;
+goto lowsurogate;
+}
+while (mySource < sourceLimit && myTarget < targetLimit)
+{
+ch = *(mySource++);
+if (U16_IS_SURROGATE(ch)) {
+if (U16_IS_LEAD(ch))
+{
+lowsurogate:
+if (mySource < sourceLimit)
+{
+ch2 = *mySource;
+if (U16_IS_TRAIL(ch2))
+{
+ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
+mySource++;
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+else {
+/* ran out of source */
+args->converter->fromUChar32 = ch;
+if (args->flush) {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+*err = U_ILLEGAL_CHAR_FOUND;
+}
+break;
+}
+}
+else {
+/* this is an unmatched trail code unit (2nd surrogate) */
+/* callback(illegal) */
+args->converter->fromUChar32 = ch;
+*err = U_ILLEGAL_CHAR_FOUND;
+break;
+}
+}
+/* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
+temp[2] = (uint8_t) (ch >> 16 & 0x1F);
+temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
+temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
+for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
+{
+if (myTarget < targetLimit)
+{
+*(myTarget++) = temp[indexToWrite];
+*(myOffsets++) = offsetNum;
+}
+else
+{
+args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+}
+offsetNum = offsetNum + 1 + (temp[2] != 0);
+}
+if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
+{
+*err = U_BUFFER_OVERFLOW_ERROR;
+}
+args->target = (char *) myTarget;
+args->source = mySource;
+args->offsets = myOffsets;
+}
+static UChar32
+T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
+UErrorCode* err)
+{
+const uint8_t *mySource;
+UChar32 myUChar;
+int32_t length;
+mySource = (const uint8_t *)args->source;
+if (mySource >= (const uint8_t *)args->sourceLimit)
+{
+/* no input */
+*err = U_INDEX_OUTOFBOUNDS_ERROR;
+return 0xffff;
+}
+length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
+if (length < 4)
+{
+/* got a partial character */
+uprv_memcpy(args->converter->toUBytes, mySource, length);
+args->converter->toULength = (int8_t)length;
+args->source = (const char *)(mySource + length);
+*err = U_TRUNCATED_CHAR_FOUND;
+return 0xffff;
+}
+/* Don't even try to do a direct cast because the value may be on an odd address. */
+myUChar = ((UChar32)mySource[3] << 24)
+| ((UChar32)mySource[2] << 16)
+| ((UChar32)mySource[1] << 8)
+| ((UChar32)mySource[0]);
+args->source = (const char *)(mySource + 4);
+if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
+return myUChar;
+}
+uprv_memcpy(args->converter->toUBytes, mySource, 4);
+args->converter->toULength = 4;
+*err = U_ILLEGAL_CHAR_FOUND;
+return 0xffff;
+}
+static const UConverterImpl _UTF32LEImpl = {
+UCNV_UTF32_LittleEndian,
+NULL,
+NULL,
+NULL,
+NULL,
+NULL,
+T_UConverter_toUnicode_UTF32_LE,
+T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
+T_UConverter_fromUnicode_UTF32_LE,
+T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
+T_UConverter_getNextUChar_UTF32_LE,
+NULL,
+NULL,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+/* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
+static const UConverterStaticData _UTF32LEStaticData = {
+sizeof(UConverterStaticData),
+"UTF-32LE",
+1234,
+UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
+{ 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF32LEData = {
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
+0
+};
+/* UTF-32 (Detect BOM) ------------------------------------------------------ */
+/*
+* Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
+* accordingly.
+*
+* State values:
+* 0    initial state
+* 1    saw 00
+* 2    saw 00 00
+* 3    saw 00 00 FE
+* 4    -
+* 5    saw FF
+* 6    saw FF FE
+* 7    saw FF FE 00
+* 8    UTF-32BE mode
+* 9    UTF-32LE mode
+*
+* During detection: state&3==number of matching bytes so far.
+*
+* On output, emit U+FEFF as the first code point.
+*/
+static void
+_UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
+if(choice<=UCNV_RESET_TO_UNICODE) {
+/* reset toUnicode: state=0 */
+cnv->mode=0;
+}
+if(choice!=UCNV_RESET_TO_UNICODE) {
+/* reset fromUnicode: prepare to output the UTF-32PE BOM */
+cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
+}
+}
+static void
+_UTF32Open(UConverter *cnv,
+UConverterLoadArgs *pArgs,
+UErrorCode *pErrorCode) {
+_UTF32Reset(cnv, UCNV_RESET_BOTH);
+}
+static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
+static void
+_UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+UConverter *cnv=pArgs->converter;
+const char *source=pArgs->source;
+const char *sourceLimit=pArgs->sourceLimit;
+int32_t *offsets=pArgs->offsets;
+int32_t state, offsetDelta;
+char b;
+state=cnv->mode;
+/*
+* If we detect a BOM in this buffer, then we must add the BOM size to the
+* offsets because the actual converter function will not see and count the BOM.
+* offsetDelta will have the number of the BOM bytes that are in the current buffer.
+*/
+offsetDelta=0;
+while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
+switch(state) {
+case 0:
+b=*source;
+if(b==0) {
+state=1; /* could be 00 00 FE FF */
+} else if(b==(char)0xff) {
+state=5; /* could be FF FE 00 00 */
+} else {
+state=8; /* default to UTF-32BE */
+continue;
+}
+++source;
+break;
+case 1:
+case 2:
+case 3:
+case 5:
+case 6:
+case 7:
+if(*source==utf32BOM[state]) {
+++state;
+++source;
+if(state==4) {
+state=8; /* detect UTF-32BE */
+offsetDelta=(int32_t)(source-pArgs->source);
+} else if(state==8) {
+state=9; /* detect UTF-32LE */
+offsetDelta=(int32_t)(source-pArgs->source);
+}
+} else {
+/* switch to UTF-32BE and pass the previous bytes */
+int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
+/* reset the source */
+source=pArgs->source;
+if(count==(state&3)) {
+/* simple: all in the same buffer, just reset source */
+} else {
+UBool oldFlush=pArgs->flush;
+/* some of the bytes are from a previous buffer, replay those first */
+pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
+pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
+pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
+/* no offsets: bytes from previous buffer, and not enough for output */
+T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
+/* restore real pointers; pArgs->source will be set in case 8/9 */
+pArgs->sourceLimit=sourceLimit;
+pArgs->flush=oldFlush;
+}
+state=8;
+continue;
+}
+break;
+case 8:
+/* call UTF-32BE */
+pArgs->source=source;
+if(offsets==NULL) {
+T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
+} else {
+T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
+}
+source=pArgs->source;
+break;
+case 9:
+/* call UTF-32LE */
+pArgs->source=source;
+if(offsets==NULL) {
+T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
+} else {
+T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
+}
+source=pArgs->source;
+break;
+default:
+break; /* does not occur */
+}
+}
+/* add BOM size to offsets - see comment at offsetDelta declaration */
+if(offsets!=NULL && offsetDelta!=0) {
+int32_t *offsetsLimit=pArgs->offsets;
+while(offsets<offsetsLimit) {
+*offsets++ += offsetDelta;
+}
+}
+pArgs->source=source;
+if(source==sourceLimit && pArgs->flush) {
+/* handle truncated input */
+switch(state) {
+case 0:
+break; /* no input at all, nothing to do */
+case 8:
+T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
+break;
+case 9:
+T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
+break;
+default:
+/* handle 0<state<8: call UTF-32BE with too-short input */
+pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
+pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
+/* no offsets: not enough for output */
+T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
+pArgs->source=source;
+pArgs->sourceLimit=sourceLimit;
+state=8;
+break;
+}
+}
+cnv->mode=state;
+}
+static UChar32
+_UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
+UErrorCode *pErrorCode) {
+switch(pArgs->converter->mode) {
+case 8:
+return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
+case 9:
+return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
+default:
+return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+}
+}
+static const UConverterImpl _UTF32Impl = {
+UCNV_UTF32,
+NULL,
+NULL,
+_UTF32Open,
+NULL,
+_UTF32Reset,
+_UTF32ToUnicodeWithOffsets,
+_UTF32ToUnicodeWithOffsets,
+#if U_IS_BIG_ENDIAN
+T_UConverter_fromUnicode_UTF32_BE,
+T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
+#else
+T_UConverter_fromUnicode_UTF32_LE,
+T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
+#endif
+_UTF32GetNextUChar,
+NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
+NULL,
+NULL,
+NULL,
+ucnv_getNonSurrogateUnicodeSet
+};
+/* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
+static const UConverterStaticData _UTF32StaticData = {
+sizeof(UConverterStaticData),
+"UTF-32",
+1236,
+UCNV_IBM, UCNV_UTF32, 4, 4,
+#if U_IS_BIG_ENDIAN
+{ 0, 0, 0xff, 0xfd }, 4,
+#else
+{ 0xfd, 0xff, 0, 0 }, 4,
+#endif
+FALSE, FALSE,
+0,
+0,
+{ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
+};
+const UConverterSharedData _UTF32Data = {
+sizeof(UConverterSharedData), ~((uint32_t) 0),
+NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
+0
+};
+#endif

The Tor Browser / file comparison

comparison: intl/icu/source/common/ucnv_u32.c

intl/icu/source/common/ucnv_u32.c