michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "nsUTF16ToUnicode.h" michael@0: #include "nsCharTraits.h" michael@0: #include "mozilla/Endian.h" michael@0: michael@0: enum { michael@0: STATE_NORMAL = 0, michael@0: STATE_HALF_CODE_POINT = 1, michael@0: STATE_FIRST_CALL = 2, michael@0: STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT, michael@0: STATE_ODD_SURROGATE_PAIR = 4 michael@0: }; michael@0: michael@0: nsresult michael@0: nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc, michael@0: int32_t * aSrcLength, michael@0: char16_t * aDest, michael@0: int32_t * aDestLength, michael@0: bool aSwapBytes) michael@0: { michael@0: const char* src = aSrc; michael@0: const char* srcEnd = aSrc + *aSrcLength; michael@0: char16_t* dest = aDest; michael@0: char16_t* destEnd = aDest + *aDestLength; michael@0: char16_t oddHighSurrogate; michael@0: michael@0: switch(mState) { michael@0: case STATE_FIRST_CALL: michael@0: NS_ASSERTION(*aSrcLength > 1, "buffer too short"); michael@0: src+=2; michael@0: mState = STATE_NORMAL; michael@0: break; michael@0: michael@0: case STATE_SECOND_BYTE: michael@0: NS_ASSERTION(*aSrcLength > 0, "buffer too short"); michael@0: src++; michael@0: mState = STATE_NORMAL; michael@0: break; michael@0: michael@0: case STATE_ODD_SURROGATE_PAIR: michael@0: if (*aDestLength < 2) michael@0: goto error; michael@0: else { michael@0: *dest++ = mOddHighSurrogate; michael@0: *dest++ = mOddLowSurrogate; michael@0: mOddHighSurrogate = mOddLowSurrogate = 0; michael@0: mState = STATE_NORMAL; michael@0: } michael@0: break; michael@0: michael@0: case STATE_NORMAL: michael@0: case STATE_HALF_CODE_POINT: michael@0: default: michael@0: break; michael@0: } michael@0: michael@0: oddHighSurrogate = mOddHighSurrogate; michael@0: michael@0: if (src == srcEnd) { michael@0: *aDestLength = dest - aDest; michael@0: return (mState != STATE_NORMAL || oddHighSurrogate) ? michael@0: NS_OK_UDEC_MOREINPUT : NS_OK; michael@0: } michael@0: michael@0: const char* srcEvenEnd; michael@0: michael@0: char16_t u; michael@0: if (mState == STATE_HALF_CODE_POINT) { michael@0: if (dest == destEnd) michael@0: goto error; michael@0: michael@0: // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the michael@0: // previous run while the 2nd byte has to come from |*src|. michael@0: mState = STATE_NORMAL; michael@0: #if MOZ_BIG_ENDIAN michael@0: u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte. michael@0: #else michael@0: u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte. michael@0: #endif michael@0: srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop michael@0: goto have_codepoint; michael@0: } else { michael@0: srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop michael@0: } michael@0: michael@0: while (src != srcEvenEnd) { michael@0: if (dest == destEnd) michael@0: goto error; michael@0: michael@0: #if !defined(__sparc__) && !defined(__arm__) michael@0: u = *(const char16_t*)src; michael@0: #else michael@0: memcpy(&u, src, 2); michael@0: #endif michael@0: src += 2; michael@0: michael@0: have_codepoint: michael@0: if (aSwapBytes) michael@0: u = u << 8 | u >> 8; michael@0: michael@0: if (!IS_SURROGATE(u)) { michael@0: if (oddHighSurrogate) { michael@0: if (mErrBehavior == kOnError_Signal) { michael@0: goto error2; michael@0: } michael@0: *dest++ = UCS2_REPLACEMENT_CHAR; michael@0: if (dest == destEnd) michael@0: goto error; michael@0: oddHighSurrogate = 0; michael@0: } michael@0: *dest++ = u; michael@0: } else if (NS_IS_HIGH_SURROGATE(u)) { michael@0: if (oddHighSurrogate) { michael@0: if (mErrBehavior == kOnError_Signal) { michael@0: goto error2; michael@0: } michael@0: *dest++ = UCS2_REPLACEMENT_CHAR; michael@0: if (dest == destEnd) michael@0: goto error; michael@0: } michael@0: oddHighSurrogate = u; michael@0: } michael@0: else /* if (NS_IS_LOW_SURROGATE(u)) */ { michael@0: if (oddHighSurrogate && *aDestLength > 1) { michael@0: if (dest + 1 >= destEnd) { michael@0: mOddLowSurrogate = u; michael@0: mOddHighSurrogate = oddHighSurrogate; michael@0: mState = STATE_ODD_SURROGATE_PAIR; michael@0: goto error; michael@0: } michael@0: *dest++ = oddHighSurrogate; michael@0: *dest++ = u; michael@0: } else { michael@0: if (mErrBehavior == kOnError_Signal) { michael@0: goto error2; michael@0: } michael@0: *dest++ = UCS2_REPLACEMENT_CHAR; michael@0: } michael@0: oddHighSurrogate = 0; michael@0: } michael@0: } michael@0: if (src != srcEnd) { michael@0: // store the lead byte of a 16-bit unit for the next run. michael@0: mOddByte = *src++; michael@0: mState = STATE_HALF_CODE_POINT; michael@0: } michael@0: michael@0: mOddHighSurrogate = oddHighSurrogate; michael@0: michael@0: *aDestLength = dest - aDest; michael@0: *aSrcLength = src - aSrc; michael@0: return (mState != STATE_NORMAL || oddHighSurrogate) ? michael@0: NS_OK_UDEC_MOREINPUT : NS_OK; michael@0: michael@0: error: michael@0: *aDestLength = dest - aDest; michael@0: *aSrcLength = src - aSrc; michael@0: return NS_OK_UDEC_MOREOUTPUT; michael@0: michael@0: error2: michael@0: *aDestLength = dest - aDest; michael@0: *aSrcLength = --src - aSrc; michael@0: return NS_ERROR_ILLEGAL_INPUT; michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16ToUnicodeBase::Reset() michael@0: { michael@0: mState = STATE_FIRST_CALL; michael@0: mOddByte = 0; michael@0: mOddHighSurrogate = 0; michael@0: mOddLowSurrogate = 0; michael@0: return NS_OK; michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, michael@0: int32_t * aDestLength) michael@0: { michael@0: // the left-over data of the previous run have to be taken into account. michael@0: *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2; michael@0: if (mOddHighSurrogate) michael@0: (*aDestLength)++; michael@0: if (mOddLowSurrogate) michael@0: (*aDestLength)++; michael@0: return NS_OK; michael@0: } michael@0: michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, michael@0: char16_t * aDest, int32_t * aDestLength) michael@0: { michael@0: switch (mState) { michael@0: case STATE_FIRST_CALL: michael@0: if (*aSrcLength < 2) { michael@0: if (*aSrcLength < 1) { michael@0: *aDestLength = 0; michael@0: return NS_OK; michael@0: } michael@0: if (uint8_t(*aSrc) != 0xFE) { michael@0: mState = STATE_NORMAL; michael@0: break; michael@0: } michael@0: *aDestLength = 0; michael@0: mState = STATE_SECOND_BYTE; michael@0: return NS_OK_UDEC_MOREINPUT; michael@0: } michael@0: #if MOZ_LITTLE_ENDIAN michael@0: // on LE machines, BE BOM is 0xFFFE michael@0: if (0xFFFE != *((char16_t*)aSrc)) { michael@0: mState = STATE_NORMAL; michael@0: } michael@0: #else michael@0: if (0xFEFF != *((char16_t*)aSrc)) { michael@0: mState = STATE_NORMAL; michael@0: } michael@0: #endif michael@0: break; michael@0: michael@0: case STATE_SECOND_BYTE: michael@0: if (*aSrcLength < 1) { michael@0: *aDestLength = 0; michael@0: return NS_OK_UDEC_MOREINPUT; michael@0: } michael@0: if (uint8_t(*aSrc) != 0xFF) { michael@0: mOddByte = 0xFE; michael@0: mState = STATE_HALF_CODE_POINT; michael@0: } michael@0: break; michael@0: } michael@0: michael@0: return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, michael@0: bool(MOZ_LITTLE_ENDIAN)); michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, michael@0: char16_t * aDest, int32_t * aDestLength) michael@0: { michael@0: switch (mState) { michael@0: case STATE_FIRST_CALL: michael@0: if (*aSrcLength < 2) { michael@0: if (*aSrcLength < 1) { michael@0: *aDestLength = 0; michael@0: return NS_OK; michael@0: } michael@0: if (uint8_t(*aSrc) != 0xFF) { michael@0: mState = STATE_NORMAL; michael@0: break; michael@0: } michael@0: *aDestLength = 0; michael@0: mState = STATE_SECOND_BYTE; michael@0: return NS_OK_UDEC_MOREINPUT; michael@0: } michael@0: #if MOZ_BIG_ENDIAN michael@0: // on BE machines, LE BOM is 0xFFFE michael@0: if (0xFFFE != *((char16_t*)aSrc)) { michael@0: mState = STATE_NORMAL; michael@0: } michael@0: #else michael@0: if (0xFEFF != *((char16_t*)aSrc)) { michael@0: mState = STATE_NORMAL; michael@0: } michael@0: #endif michael@0: break; michael@0: michael@0: case STATE_SECOND_BYTE: michael@0: if (*aSrcLength < 1) { michael@0: *aDestLength = 0; michael@0: return NS_OK_UDEC_MOREINPUT; michael@0: } michael@0: if (uint8_t(*aSrc) != 0xFE) { michael@0: mOddByte = 0xFF; michael@0: mState = STATE_HALF_CODE_POINT; michael@0: } michael@0: break; michael@0: } michael@0: michael@0: return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, michael@0: bool(MOZ_BIG_ENDIAN)); michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16ToUnicode::Reset() michael@0: { michael@0: mEndian = kUnknown; michael@0: mFoundBOM = false; michael@0: return nsUTF16ToUnicodeBase::Reset(); michael@0: } michael@0: michael@0: NS_IMETHODIMP michael@0: nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, michael@0: char16_t * aDest, int32_t * aDestLength) michael@0: { michael@0: if(STATE_FIRST_CALL == mState && *aSrcLength < 2) michael@0: { michael@0: nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT; michael@0: *aSrcLength=0; michael@0: *aDestLength=0; michael@0: return res; michael@0: } michael@0: if(STATE_FIRST_CALL == mState) // first time called michael@0: { michael@0: // check if BOM (0xFEFF) is at the beginning, remove it if found, and michael@0: // set mEndian accordingly. michael@0: if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) { michael@0: mEndian = kLittleEndian; michael@0: mFoundBOM = true; michael@0: } michael@0: else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) { michael@0: mEndian = kBigEndian; michael@0: mFoundBOM = true; michael@0: } michael@0: // BOM is not found, but we can use a simple heuristic to determine michael@0: // the endianness. Assume the first character is [U+0001, U+00FF]. michael@0: // Not always valid, but it's very likely to hold for html/xml/css. michael@0: else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00) michael@0: mState = STATE_NORMAL; michael@0: mEndian = kBigEndian; michael@0: } michael@0: else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00) michael@0: mState = STATE_NORMAL; michael@0: mEndian = kLittleEndian; michael@0: } michael@0: else { // Neither BOM nor 'plausible' byte patterns at the beginning. michael@0: // Just assume it's BE (following Unicode standard) michael@0: // and let the garbage show up in the browser. (security concern?) michael@0: // (bug 246194) michael@0: mState = STATE_NORMAL; michael@0: mEndian = kBigEndian; michael@0: } michael@0: } michael@0: michael@0: nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, michael@0: #if MOZ_BIG_ENDIAN michael@0: (mEndian == kLittleEndian) michael@0: #else michael@0: (mEndian == kBigEndian) michael@0: #endif michael@0: ); michael@0: michael@0: // If BOM is not found and we're to return NS_OK, signal that BOM michael@0: // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode| michael@0: return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv; michael@0: }