The Tor Browser: comparison intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

--1:000000000000
+:483da0657f4f
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+* License, v. 2.0. If a copy of the MPL was not distributed with this
+* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#include "nsUTF16ToUnicode.h"
+#include "nsCharTraits.h"
+#include "mozilla/Endian.h"
+// Decoder states stored in |mState| between successive Convert() calls.
+enum {
+// No partial code unit is buffered; decoding continues at |aSrc|.
+STATE_NORMAL = 0,
+// The first byte of a 16-bit code unit is held in |mOddByte|; the second
+// byte is taken from the start of the next input buffer. GetMaxLength()
+// also tests this value as a bit mask against |mState|.
+STATE_HALF_CODE_POINT = 1,
+// State installed by Reset(). If it is still set when the shared decoder
+// runs, the first two input bytes are skipped.
+STATE_FIRST_CALL = 2,
+// Set by the BE/LE converters when the first call supplied only the first
+// byte of a possible BOM. If it is still set when the shared decoder runs,
+// one input byte is skipped.
+STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
+// A complete surrogate pair is parked in |mOddHighSurrogate| and
+// |mOddLowSurrogate| because the output buffer had no room for both units.
+STATE_ODD_SURROGATE_PAIR = 4
+};
+/**
+ * Shared UTF-16 decoding loop used by the BE, LE and auto-detecting
+ * converters.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: number of bytes available at |aSrc|;
+ *                    out: number of bytes consumed (left untouched when the
+ *                    whole input was consumed by the state prologue).
+ * @param aDest       output buffer of char16_t code units.
+ * @param aDestLength in: capacity of |aDest| in code units;
+ *                    out: number of code units written.
+ * @param aSwapBytes  true when each 16-bit unit read from the input must be
+ *                    byte-swapped.
+ *
+ * @return NS_OK when all input was decoded and nothing is pending;
+ *         NS_OK_UDEC_MOREINPUT when a byte or a high surrogate is held back
+ *         for the next call;
+ *         NS_OK_UDEC_MOREOUTPUT when |aDest| filled up before the input was
+ *         exhausted;
+ *         NS_ERROR_ILLEGAL_INPUT for an unpaired surrogate when
+ *         |mErrBehavior| is kOnError_Signal. In every other mode an unpaired
+ *         surrogate is replaced by UCS2_REPLACEMENT_CHAR.
+ *
+ * Precondition (only asserted): in STATE_FIRST_CALL the input holds at least
+ * two bytes, in STATE_SECOND_BYTE at least one. The Convert() wrappers in
+ * this file guarantee that before calling.
+ */
+nsresult
+nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
+                                            int32_t * aSrcLength,
+                                            char16_t * aDest,
+                                            int32_t * aDestLength,
+                                            bool aSwapBytes)
+{
+  const char* src = aSrc;
+  const char* srcEnd = aSrc + *aSrcLength;
+  char16_t* dest = aDest;
+  char16_t* destEnd = aDest + *aDestLength;
+  // Where the code unit currently being decoded started in this buffer, so
+  // that it can be handed back to the caller if it cannot be emitted.
+  const char* unitStart = aSrc;
+  // Initialised here (not only after the switch) because the |error| exit
+  // writes it back to |mOddHighSurrogate| and can be reached from the switch.
+  char16_t oddHighSurrogate = mOddHighSurrogate;
+
+  switch(mState) {
+    case STATE_FIRST_CALL:
+      // Skip the two bytes at the start of the stream.
+      NS_ASSERTION(*aSrcLength > 1, "buffer too short");
+      src+=2;
+      mState = STATE_NORMAL;
+      break;
+    case STATE_SECOND_BYTE:
+      // Skip the single byte left over from the previous call.
+      NS_ASSERTION(*aSrcLength > 0, "buffer too short");
+      src++;
+      mState = STATE_NORMAL;
+      break;
+    case STATE_ODD_SURROGATE_PAIR:
+      // Flush the surrogate pair parked by the previous call; it needs room
+      // for both units.
+      if (*aDestLength < 2)
+        goto error;
+      else {
+        *dest++ = mOddHighSurrogate;
+        *dest++ = mOddLowSurrogate;
+        mOddHighSurrogate = mOddLowSurrogate = 0;
+        mState = STATE_NORMAL;
+      }
+      break;
+    case STATE_NORMAL:
+    case STATE_HALF_CODE_POINT:
+    default:
+      break;
+  }
+
+  // Re-read: the STATE_ODD_SURROGATE_PAIR case above may have cleared it.
+  oddHighSurrogate = mOddHighSurrogate;
+
+  if (src == srcEnd) {
+    *aDestLength = dest - aDest;
+    return (mState != STATE_NORMAL || oddHighSurrogate) ?
+           NS_OK_UDEC_MOREINPUT : NS_OK;
+  }
+
+  const char* srcEvenEnd;
+  char16_t u;
+  if (mState == STATE_HALF_CODE_POINT) {
+    if (dest == destEnd)
+      goto error;
+    // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
+    // previous run while the 2nd byte has to come from |*src|.
+    mState = STATE_NORMAL;
+    unitStart = src;
+    // Both branches go through uint8_t so that a negative |char| is never
+    // left-shifted.
+#if MOZ_BIG_ENDIAN
+    u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
+#else
+    u = (uint8_t(*src++) << 8) | mOddByte; // safe, we know we have at least one byte.
+#endif
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+    goto have_codepoint;
+  } else {
+    srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
+  }
+
+  while (src != srcEvenEnd) {
+    if (dest == destEnd)
+      goto error;
+    unitStart = src;
+    // |src| is a byte pointer with no alignment guarantee for char16_t, so
+    // copy the two bytes instead of dereferencing a cast pointer.
+    memcpy(&u, src, 2);
+    src += 2;
+
+have_codepoint:
+    if (aSwapBytes)
+      u = u << 8 | u >> 8;
+
+    if (!IS_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        // A high surrogate was not followed by a low surrogate.
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+        // The pending high surrogate has now been accounted for.
+        oddHighSurrogate = 0;
+        if (dest == destEnd) {
+          // No room left for |u|: hand it back to the caller instead of
+          // dropping it. A unit that was completed from |mOddByte| used only
+          // one byte of this buffer, so the half-unit state is restored
+          // (|mOddByte| still holds its first byte).
+          if (src - unitStart == 1) {
+            mState = STATE_HALF_CODE_POINT;
+          }
+          src = unitStart;
+          goto error;
+        }
+      }
+      *dest++ = u;
+    } else if (NS_IS_HIGH_SURROGATE(u)) {
+      if (oddHighSurrogate) {
+        // Two high surrogates in a row: the first one is unpaired.
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+        // No bail-out needed here even if the output is now full: keeping
+        // |u| as the pending high surrogate writes nothing, and the check at
+        // the top of the loop catches a full buffer before the next write.
+      }
+      oddHighSurrogate = u;
+    }
+    else /* if (NS_IS_LOW_SURROGATE(u)) */ {
+      // A pair is only emitted when the caller's buffer can ever hold two
+      // units; with a one-unit buffer it falls through to the replacement
+      // branch below.
+      if (oddHighSurrogate && *aDestLength > 1) {
+        if (dest + 1 >= destEnd) {
+          // Not enough room right now: park the pair for the next call.
+          mOddLowSurrogate = u;
+          mOddHighSurrogate = oddHighSurrogate;
+          mState = STATE_ODD_SURROGATE_PAIR;
+          goto error;
+        }
+        *dest++ = oddHighSurrogate;
+        *dest++ = u;
+      } else {
+        if (mErrBehavior == kOnError_Signal) {
+          goto error2;
+        }
+        *dest++ = UCS2_REPLACEMENT_CHAR;
+      }
+      oddHighSurrogate = 0;
+    }
+  }
+
+  if (src != srcEnd) {
+    // store the lead byte of a 16-bit unit for the next run.
+    mOddByte = *src++;
+    mState = STATE_HALF_CODE_POINT;
+  }
+
+  mOddHighSurrogate = oddHighSurrogate;
+
+  *aDestLength = dest - aDest;
+  *aSrcLength =  src  - aSrc;
+  return (mState != STATE_NORMAL || oddHighSurrogate) ?
+         NS_OK_UDEC_MOREINPUT : NS_OK;
+
+error:
+  // Persist the pending-high-surrogate state. Without this, a surrogate that
+  // was already replaced would still look pending on the next call and could
+  // be paired with an unrelated low surrogate.
+  mOddHighSurrogate = oddHighSurrogate;
+  *aDestLength = dest - aDest;
+  *aSrcLength =  src  - aSrc;
+  return  NS_OK_UDEC_MOREOUTPUT;
+
+error2:
+  // |src| is moved back by one byte before the consumed length is reported.
+  *aDestLength = dest - aDest;
+  *aSrcLength = --src - aSrc;
+  return  NS_ERROR_ILLEGAL_INPUT;
+}
+/**
+ * Returns the decoder to its initial state: no buffered byte, no pending
+ * surrogates, and the next conversion treated as the first call.
+ *
+ * @return always NS_OK.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::Reset()
+{
+  // Forget everything carried over from earlier input.
+  mOddHighSurrogate = mOddLowSurrogate = 0;
+  mOddByte = 0;
+
+  // The next Convert() call starts from scratch.
+  mState = STATE_FIRST_CALL;
+  return NS_OK;
+}
+/**
+ * Computes how many char16_t units the next conversion of |aSrcLength|
+ * input bytes may produce, including data held over from the previous run.
+ *
+ * @param aSrc        unused.
+ * @param aSrcLength  number of input bytes about to be converted.
+ * @param aDestLength out: the computed number of output code units.
+ * @return always NS_OK.
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
+                                   int32_t * aDestLength)
+{
+  // A byte buffered from the previous run completes one more code unit.
+  int32_t carriedBytes = 0;
+  if (mState & STATE_HALF_CODE_POINT) {
+    carriedBytes = 1;
+  }
+
+  int32_t maxUnits = (aSrcLength + carriedBytes) / 2;
+
+  // Each surrogate still held in the decoder may add one output unit.
+  if (mOddHighSurrogate) {
+    ++maxUnits;
+  }
+  if (mOddLowSurrogate) {
+    ++maxUnits;
+  }
+
+  *aDestLength = maxUnits;
+  return NS_OK;
+}
+/**
+ * Decodes big-endian UTF-16. On the first call a leading BOM (bytes
+ * 0xFE 0xFF) is detected and skipped, even when its two bytes arrive in
+ * separate calls; everything else goes to UTF16ConvertToUnicode().
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed (as reported
+ *                    by UTF16ConvertToUnicode(); untouched on the early
+ *                    returns below).
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in code units; out: code units written.
+ * @return NS_OK for an empty first buffer, NS_OK_UDEC_MOREINPUT while the
+ *         BOM check is waiting for its second byte, otherwise the result of
+ *         UTF16ConvertToUnicode().
+ */
+NS_IMETHODIMP
+nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 2) {
+        if (*aSrcLength < 1) {
+          *aDestLength = 0;
+          return NS_OK;
+        }
+        if (uint8_t(*aSrc) != 0xFE) {
+          // Cannot be a BOM: decode the lone byte as ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        // First BOM byte seen; wait for the second one.
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare the BOM byte by byte: |aSrc| carries no char16_t alignment
+      // guarantee, and a byte-wise test needs no host-endianness #if.
+      // If the BOM is present |mState| stays STATE_FIRST_CALL, which makes
+      // the shared decoder skip these two bytes.
+      if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
+        mState = STATE_NORMAL;
+      }
+      break;
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFF) {
+        // Not a BOM after all: the 0xFE consumed earlier is the first byte
+        // of a real code unit.
+        mOddByte = 0xFE;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_LITTLE_ENDIAN));
+}
+/**
+ * Decodes little-endian UTF-16. On the first call a leading BOM (bytes
+ * 0xFF 0xFE) is detected and skipped, even when its two bytes arrive in
+ * separate calls; everything else goes to UTF16ConvertToUnicode().
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed (as reported
+ *                    by UTF16ConvertToUnicode(); untouched on the early
+ *                    returns below).
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in code units; out: code units written.
+ * @return NS_OK for an empty first buffer, NS_OK_UDEC_MOREINPUT while the
+ *         BOM check is waiting for its second byte, otherwise the result of
+ *         UTF16ConvertToUnicode().
+ */
+NS_IMETHODIMP
+nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                            char16_t * aDest, int32_t * aDestLength)
+{
+  switch (mState) {
+    case STATE_FIRST_CALL:
+      if (*aSrcLength < 2) {
+        if (*aSrcLength < 1) {
+          *aDestLength = 0;
+          return NS_OK;
+        }
+        if (uint8_t(*aSrc) != 0xFF) {
+          // Cannot be a BOM: decode the lone byte as ordinary data.
+          mState = STATE_NORMAL;
+          break;
+        }
+        // First BOM byte seen; wait for the second one.
+        *aDestLength = 0;
+        mState = STATE_SECOND_BYTE;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      // Compare the BOM byte by byte: |aSrc| carries no char16_t alignment
+      // guarantee, and a byte-wise test needs no host-endianness #if.
+      // If the BOM is present |mState| stays STATE_FIRST_CALL, which makes
+      // the shared decoder skip these two bytes.
+      if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
+        mState = STATE_NORMAL;
+      }
+      break;
+    case STATE_SECOND_BYTE:
+      if (*aSrcLength < 1) {
+        *aDestLength = 0;
+        return NS_OK_UDEC_MOREINPUT;
+      }
+      if (uint8_t(*aSrc) != 0xFE) {
+        // Not a BOM after all: the 0xFF consumed earlier is the first byte
+        // of a real code unit.
+        mOddByte = 0xFF;
+        mState = STATE_HALF_CODE_POINT;
+      }
+      break;
+  }
+
+  return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                               bool(MOZ_BIG_ENDIAN));
+}
+/**
+ * Forgets the detected byte order and BOM flag, then resets the shared
+ * decoder state.
+ *
+ * @return the result of nsUTF16ToUnicodeBase::Reset().
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicode::Reset()
+{
+  // Byte-order detection starts over on the next Convert() call.
+  mFoundBOM = false;
+  mEndian = kUnknown;
+
+  return nsUTF16ToUnicodeBase::Reset();
+}
+/**
+ * Decodes UTF-16 whose byte order is not known in advance. The first call
+ * picks the byte order from a BOM or, failing that, from the first two
+ * bytes; later calls reuse that choice.
+ *
+ * @param aSrc        input bytes.
+ * @param aSrcLength  in: bytes available; out: bytes consumed.
+ * @param aDest       output buffer.
+ * @param aDestLength in: capacity in code units; out: code units written.
+ * @return NS_OK for an empty first buffer, NS_ERROR_ILLEGAL_INPUT for a
+ *         one-byte first buffer, NS_OK_UDEC_NOBOMFOUND when decoding
+ *         returned NS_OK but no BOM has been seen, otherwise the result of
+ *         UTF16ConvertToUnicode().
+ */
+NS_IMETHODIMP
+nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
+                          char16_t * aDest, int32_t * aDestLength)
+{
+  if (STATE_FIRST_CALL == mState) {
+    // Two bytes are needed to decide on a byte order.
+    if (*aSrcLength < 2) {
+      nsresult res = NS_ERROR_ILLEGAL_INPUT;
+      if (*aSrcLength == 0) {
+        res = NS_OK;
+      }
+      *aSrcLength = 0;
+      *aDestLength = 0;
+      return res;
+    }
+
+    const uint8_t first = uint8_t(aSrc[0]);
+    const uint8_t second = uint8_t(aSrc[1]);
+
+    if (first == 0xFF && second == 0xFE) {
+      // Little-endian BOM. |mState| stays STATE_FIRST_CALL so that the
+      // shared decoder skips these two bytes.
+      mEndian = kLittleEndian;
+      mFoundBOM = true;
+    } else if (first == 0xFE && second == 0xFF) {
+      // Big-endian BOM, skipped by the shared decoder in the same way.
+      mEndian = kBigEndian;
+      mFoundBOM = true;
+    } else {
+      // No BOM: nothing to skip, so decoding starts at the first byte.
+      // Heuristic: assume the first character is in [U+0001, U+00FF], which
+      // is very likely for html/xml/css. "hh 00" (hh != 00) then means
+      // little endian. "00 hh" means big endian, and any other pattern also
+      // falls back to big endian, following the Unicode standard
+      // (bug 246194).
+      mState = STATE_NORMAL;
+      if (first != 0 && second == 0) {
+        mEndian = kLittleEndian;
+      } else {
+        mEndian = kBigEndian;
+      }
+    }
+  }
+
+  // Swap bytes whenever the stream's byte order differs from the host's.
+#if MOZ_BIG_ENDIAN
+  const bool swapBytes = (mEndian == kLittleEndian);
+#else
+  const bool swapBytes = (mEndian == kBigEndian);
+#endif
+
+  nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
+                                      swapBytes);
+
+  // A plain success is reported as "no BOM found" when none was seen; every
+  // other result is passed through unchanged.
+  if (rv == NS_OK && !mFoundBOM) {
+    return NS_OK_UDEC_NOBOMFOUND;
+  }
+  return rv;
+}

The Tor Browser / file comparison

comparison: intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp