michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: 
michael@0: #include "nsUTF16ToUnicode.h"
michael@0: #include "nsCharTraits.h"
michael@0: #include "mozilla/Endian.h"
michael@0: 
michael@0: // Values kept in |mState| between calls.  They are bit flags so that
michael@0: // STATE_SECOND_BYTE can be expressed as "first call" plus "half code point".
michael@0: enum {
michael@0:   // No partial input is pending.
michael@0:   STATE_NORMAL             = 0,
michael@0:   // The first byte of a 16-bit unit is parked in |mOddByte|.
michael@0:   STATE_HALF_CODE_POINT    = 1 << 0,
michael@0:   // State installed by Reset(); nothing has been decoded yet.
michael@0:   STATE_FIRST_CALL         = 1 << 1,
michael@0:   // The first BOM byte was seen on its own; the second one is still due.
michael@0:   STATE_SECOND_BYTE        = STATE_HALF_CODE_POINT | STATE_FIRST_CALL,
michael@0:   // A complete surrogate pair is parked in |mOddHighSurrogate| and
michael@0:   // |mOddLowSurrogate| until the output buffer has room for both units.
michael@0:   STATE_ODD_SURROGATE_PAIR = 1 << 2
michael@0: };
michael@0: 
michael@0: /**
michael@0:  * Decodes a chunk of UTF-16 bytes into char16_t units, carrying partial
michael@0:  * input (an odd byte, an unpaired lead surrogate, or a surrogate pair that
michael@0:  * did not fit) over to the next call through the member state.
michael@0:  *
michael@0:  * @param aSrc        input bytes
michael@0:  * @param aSrcLength  in: number of bytes in |aSrc|; out: bytes consumed
michael@0:  * @param aDest       output buffer
michael@0:  * @param aDestLength in: capacity of |aDest| in units; out: units written
michael@0:  * @param aSwapBytes  true when every 16-bit unit must be byte-swapped after
michael@0:  *                    being loaded in host byte order
michael@0:  *
michael@0:  * @return NS_OK when all input was decoded and nothing is pending;
michael@0:  *         NS_OK_UDEC_MOREINPUT when a partial unit or an unpaired lead
michael@0:  *         surrogate is waiting for more input;
michael@0:  *         NS_OK_UDEC_MOREOUTPUT when |aDest| filled up first;
michael@0:  *         NS_ERROR_ILLEGAL_INPUT on an unpaired surrogate when
michael@0:  *         |mErrBehavior| is kOnError_Signal (otherwise U+FFFD is emitted).
michael@0:  */
michael@0: nsresult
michael@0: nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
michael@0:                                             int32_t * aSrcLength,
michael@0:                                             char16_t * aDest,
michael@0:                                             int32_t * aDestLength,
michael@0:                                             bool aSwapBytes)
michael@0: {
michael@0:   const char* src = aSrc;
michael@0:   const char* srcEnd = aSrc + *aSrcLength;
michael@0:   char16_t* dest = aDest;
michael@0:   char16_t* destEnd = aDest + *aDestLength;
michael@0:   // Working copy of the pending lead surrogate.  It is written back to
michael@0:   // |mOddHighSurrogate| on the normal exit and on the "more output" exit so
michael@0:   // the member never goes stale.
michael@0:   char16_t oddHighSurrogate = mOddHighSurrogate;
michael@0:   // Number of bytes of |aSrc| that were consumed for the unit held in |u|:
michael@0:   // 2 in the main loop, 1 when the lead byte came from |mOddByte|.
michael@0:   int32_t unitBytes = 2;
michael@0: 
michael@0:   switch(mState) {
michael@0:     case STATE_FIRST_CALL:
michael@0:       // The caller left this state in place, so skip the two BOM bytes.
michael@0:       NS_ASSERTION(*aSrcLength > 1, "buffer too short");
michael@0:       src+=2;
michael@0:       mState = STATE_NORMAL;
michael@0:       break;
michael@0: 
michael@0:     case STATE_SECOND_BYTE:
michael@0:       // Skip the second BOM byte; the first one arrived in an earlier call.
michael@0:       NS_ASSERTION(*aSrcLength > 0, "buffer too short");
michael@0:       src++;
michael@0:       mState = STATE_NORMAL;
michael@0:       break;
michael@0: 
michael@0:     case STATE_ODD_SURROGATE_PAIR:
michael@0:       // Flush the pair that did not fit into the previous output buffer.
michael@0:       if (*aDestLength < 2)
michael@0:         goto error;
michael@0:       else {
michael@0:         *dest++ = mOddHighSurrogate;
michael@0:         *dest++ = mOddLowSurrogate;
michael@0:         mOddHighSurrogate = mOddLowSurrogate = 0;
michael@0:         oddHighSurrogate = 0;
michael@0:         mState = STATE_NORMAL;
michael@0:       }
michael@0:       break;
michael@0: 
michael@0:     case STATE_NORMAL:
michael@0:     case STATE_HALF_CODE_POINT:
michael@0:     default:
michael@0:       break;
michael@0:   }
michael@0: 
michael@0:   if (src == srcEnd) {
michael@0:     // Nothing left to decode; |*aSrcLength| already equals the consumed count.
michael@0:     *aDestLength = dest - aDest;
michael@0:     return (mState != STATE_NORMAL || oddHighSurrogate) ?
michael@0:            NS_OK_UDEC_MOREINPUT : NS_OK;
michael@0:   }
michael@0: 
michael@0:   const char* srcEvenEnd;
michael@0: 
michael@0:   char16_t u;
michael@0:   if (mState == STATE_HALF_CODE_POINT) {
michael@0:     if (dest == destEnd)
michael@0:       goto error;
michael@0: 
michael@0:     // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
michael@0:     // previous run while the 2nd byte has to come from |*src|.
michael@0:     // Both bytes are widened through uint8_t so that neither sign extension
michael@0:     // nor a shift of a negative value can occur where |char| is signed.
michael@0:     mState = STATE_NORMAL;
michael@0: #if MOZ_BIG_ENDIAN
michael@0:     u = (uint8_t(mOddByte) << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
michael@0: #else
michael@0:     u = (uint8_t(*src++) << 8) | uint8_t(mOddByte); // safe, we know we have at least one byte.
michael@0: #endif
michael@0:     unitBytes = 1;
michael@0:     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
michael@0:     goto have_codepoint;
michael@0:   } else {
michael@0:     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
michael@0:   }
michael@0: 
michael@0:   while (src != srcEvenEnd) {
michael@0:     if (dest == destEnd)
michael@0:       goto error;
michael@0: 
michael@0:     // |src| carries no alignment guarantee, so load the unit with memcpy on
michael@0:     // every platform instead of dereferencing a cast pointer.
michael@0:     memcpy(&u, src, 2);
michael@0:     src += 2;
michael@0:     unitBytes = 2;
michael@0: 
michael@0: have_codepoint:
michael@0:     if (aSwapBytes)
michael@0:       u = u << 8 | u >> 8;
michael@0: 
michael@0:     if (!IS_SURROGATE(u)) {
michael@0:       if (oddHighSurrogate) {
michael@0:         // The pending lead surrogate never got its trail surrogate.
michael@0:         if (mErrBehavior == kOnError_Signal) {
michael@0:           goto error2;
michael@0:         }
michael@0:         *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0:         oddHighSurrogate = 0;
michael@0:         if (dest == destEnd) {
michael@0:           // The replacement took the last free slot.  Hand |u| back to the
michael@0:           // caller instead of dropping it: rewind the bytes read for it and,
michael@0:           // if its lead byte came from |mOddByte| (still intact), restore
michael@0:           // the half-code-point state.
michael@0:           src -= unitBytes;
michael@0:           if (unitBytes == 1)
michael@0:             mState = STATE_HALF_CODE_POINT;
michael@0:           goto error;
michael@0:         }
michael@0:       }
michael@0:       *dest++ = u;
michael@0:     } else if (NS_IS_HIGH_SURROGATE(u)) {
michael@0:       if (oddHighSurrogate) {
michael@0:         // Two lead surrogates in a row: the earlier one is unpaired.
michael@0:         if (mErrBehavior == kOnError_Signal) {
michael@0:           goto error2;
michael@0:         }
michael@0:         *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0:       }
michael@0:       // Remember the new lead surrogate.  It needs no output slot yet, so a
michael@0:       // full buffer is left for the loop head (or the tail) to report.
michael@0:       oddHighSurrogate = u;
michael@0:     }
michael@0:     else /* if (NS_IS_LOW_SURROGATE(u)) */ {
michael@0:       // A pair is only emitted as such when the caller's buffer can hold two
michael@0:       // units at all; otherwise it is treated like an unpaired surrogate.
michael@0:       if (oddHighSurrogate && *aDestLength > 1) {
michael@0:         if (dest + 1 >= destEnd) {
michael@0:           // Only one slot left: park the whole pair for the next call.
michael@0:           mOddLowSurrogate = u;
michael@0:           mOddHighSurrogate = oddHighSurrogate;
michael@0:           mState = STATE_ODD_SURROGATE_PAIR;
michael@0:           goto error;
michael@0:         }
michael@0:         *dest++ = oddHighSurrogate;
michael@0:         *dest++ = u;
michael@0:       } else {
michael@0:         if (mErrBehavior == kOnError_Signal) {
michael@0:           goto error2;
michael@0:         }
michael@0:         *dest++ = UCS2_REPLACEMENT_CHAR;
michael@0:       }
michael@0:       oddHighSurrogate = 0;
michael@0:     }
michael@0:   }
michael@0:   if (src != srcEnd) {
michael@0:     // store the lead byte of a 16-bit unit for the next run.
michael@0:     mOddByte = *src++;
michael@0:     mState = STATE_HALF_CODE_POINT;
michael@0:   }
michael@0: 
michael@0:   mOddHighSurrogate = oddHighSurrogate;
michael@0: 
michael@0:   *aDestLength = dest - aDest;
michael@0:   *aSrcLength =  src  - aSrc;
michael@0:   return (mState != STATE_NORMAL || oddHighSurrogate) ?
michael@0:          NS_OK_UDEC_MOREINPUT : NS_OK;
michael@0: 
michael@0: error:
michael@0:   // Output buffer exhausted.  Persist the lead-surrogate state so the next
michael@0:   // call neither forgets a pending one nor re-reports one already handled.
michael@0:   mOddHighSurrogate = oddHighSurrogate;
michael@0:   *aDestLength = dest - aDest;
michael@0:   *aSrcLength =  src  - aSrc;
michael@0:   return  NS_OK_UDEC_MOREOUTPUT;
michael@0: 
michael@0: error2:
michael@0:   // Illegal input in signalling mode.
michael@0:   // NOTE(review): the consumed count is reported one byte short of |src| and
michael@0:   // the member state is left untouched -- presumably callers resynchronise
michael@0:   // and Reset() themselves; confirm against the callers.
michael@0:   *aDestLength = dest - aDest;
michael@0:   *aSrcLength = --src - aSrc;
michael@0:   return  NS_ERROR_ILLEGAL_INPUT;
michael@0: }
michael@0: 
michael@0: // Puts the decoder back into its start-of-stream condition: every piece of
michael@0: // input carried over between calls is discarded and the next conversion is
michael@0: // treated as the first one.
michael@0: NS_IMETHODIMP
michael@0: nsUTF16ToUnicodeBase::Reset()
michael@0: {
michael@0:   // Forget any surrogate or odd byte left over from an earlier run.
michael@0:   mOddHighSurrogate = mOddLowSurrogate = 0;
michael@0:   mOddByte = 0;
michael@0: 
michael@0:   mState = STATE_FIRST_CALL;
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: // Reports in |*aDestLength| how many char16_t units converting |aSrcLength|
michael@0: // bytes may produce, counting input still pending from the previous run.
michael@0: // |aSrc| itself is not inspected.
michael@0: NS_IMETHODIMP
michael@0: nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, 
michael@0:                                    int32_t * aDestLength)
michael@0: {
michael@0:   // A byte parked in the previous run completes a unit with the first new byte.
michael@0:   const int32_t carriedBytes = (mState & STATE_HALF_CODE_POINT) ? 1 : 0;
michael@0:   int32_t units = (aSrcLength + carriedBytes) / 2;
michael@0: 
michael@0:   // Each parked surrogate will also reach the output.
michael@0:   if (mOddHighSurrogate) {
michael@0:     ++units;
michael@0:   }
michael@0:   if (mOddLowSurrogate) {
michael@0:     ++units;
michael@0:   }
michael@0: 
michael@0:   *aDestLength = units;
michael@0:   return NS_OK;
michael@0: }
michael@0: 
michael@0: 
michael@0: // Decodes big-endian UTF-16.  Until the first units have been seen this
michael@0: // wrapper looks for a leading big-endian BOM (bytes 0xFE 0xFF), which may be
michael@0: // split across two calls, and arranges for the shared decoder to skip it;
michael@0: // everything else is delegated to UTF16ConvertToUnicode().
michael@0: //
michael@0: // aSrcLength / aDestLength are in-out: bytes available / units of capacity
michael@0: // on entry, bytes consumed / units written on return.
michael@0: NS_IMETHODIMP
michael@0: nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0:                             char16_t * aDest, int32_t * aDestLength)
michael@0: {
michael@0:   switch (mState) {
michael@0:     case STATE_FIRST_CALL:
michael@0:       if (*aSrcLength < 2) {
michael@0:         if (*aSrcLength < 1) {
michael@0:           // No input at all: stay in the first-call state.
michael@0:           *aDestLength = 0;
michael@0:           return NS_OK;
michael@0:         }
michael@0:         if (uint8_t(*aSrc) != 0xFE) {
michael@0:           // A single byte that cannot start the BOM is ordinary data.
michael@0:           mState = STATE_NORMAL;
michael@0:           break;
michael@0:         }
michael@0:         // Possibly the first half of the BOM; decide once the next byte
michael@0:         // arrives.
michael@0:         *aDestLength = 0;
michael@0:         mState = STATE_SECOND_BYTE;
michael@0:         return NS_OK_UDEC_MOREINPUT;
michael@0:       }
michael@0:       // Compare the BOM byte by byte: |aSrc| has no alignment guarantee, so
michael@0:       // it must not be read through a char16_t pointer, and the byte-wise
michael@0:       // test is the same on hosts of either endianness.  When the BOM is
michael@0:       // present the state stays STATE_FIRST_CALL so the decoder skips it.
michael@0:       if (uint8_t(aSrc[0]) != 0xFE || uint8_t(aSrc[1]) != 0xFF) {
michael@0:         mState = STATE_NORMAL;
michael@0:       }
michael@0:       break;
michael@0: 
michael@0:     case STATE_SECOND_BYTE:
michael@0:       if (*aSrcLength < 1) {
michael@0:         *aDestLength = 0;
michael@0:         return NS_OK_UDEC_MOREINPUT;
michael@0:       }
michael@0:       if (uint8_t(*aSrc) != 0xFF) {
michael@0:         // Not a BOM after all: the 0xFE seen earlier is the lead byte of a
michael@0:         // regular unit, so feed it back in as the pending odd byte.
michael@0:         mOddByte = 0xFE;
michael@0:         mState = STATE_HALF_CODE_POINT;
michael@0:       }
michael@0:       break;
michael@0:   }
michael@0: 
michael@0:   // Big-endian data needs swapping exactly when the host is little-endian.
michael@0:   return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0:                                bool(MOZ_LITTLE_ENDIAN));
michael@0: }
michael@0: 
michael@0: // Decodes little-endian UTF-16.  Until the first units have been seen this
michael@0: // wrapper looks for a leading little-endian BOM (bytes 0xFF 0xFE), which may
michael@0: // be split across two calls, and arranges for the shared decoder to skip it;
michael@0: // everything else is delegated to UTF16ConvertToUnicode().
michael@0: //
michael@0: // aSrcLength / aDestLength are in-out: bytes available / units of capacity
michael@0: // on entry, bytes consumed / units written on return.
michael@0: NS_IMETHODIMP
michael@0: nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0:                             char16_t * aDest, int32_t * aDestLength)
michael@0: {
michael@0:   switch (mState) {
michael@0:     case STATE_FIRST_CALL:
michael@0:       if (*aSrcLength < 2) {
michael@0:         if (*aSrcLength < 1) {
michael@0:           // No input at all: stay in the first-call state.
michael@0:           *aDestLength = 0;
michael@0:           return NS_OK;
michael@0:         }
michael@0:         if (uint8_t(*aSrc) != 0xFF) {
michael@0:           // A single byte that cannot start the BOM is ordinary data.
michael@0:           mState = STATE_NORMAL;
michael@0:           break;
michael@0:         }
michael@0:         // Possibly the first half of the BOM; decide once the next byte
michael@0:         // arrives.
michael@0:         *aDestLength = 0;
michael@0:         mState = STATE_SECOND_BYTE;
michael@0:         return NS_OK_UDEC_MOREINPUT;
michael@0:       }
michael@0:       // Compare the BOM byte by byte: |aSrc| has no alignment guarantee, so
michael@0:       // it must not be read through a char16_t pointer, and the byte-wise
michael@0:       // test is the same on hosts of either endianness.  When the BOM is
michael@0:       // present the state stays STATE_FIRST_CALL so the decoder skips it.
michael@0:       if (uint8_t(aSrc[0]) != 0xFF || uint8_t(aSrc[1]) != 0xFE) {
michael@0:         mState = STATE_NORMAL;
michael@0:       }
michael@0:       break;
michael@0: 
michael@0:     case STATE_SECOND_BYTE:
michael@0:       if (*aSrcLength < 1) {
michael@0:         *aDestLength = 0;
michael@0:         return NS_OK_UDEC_MOREINPUT;
michael@0:       }
michael@0:       if (uint8_t(*aSrc) != 0xFE) {
michael@0:         // Not a BOM after all: the 0xFF seen earlier is the lead byte of a
michael@0:         // regular unit, so feed it back in as the pending odd byte.
michael@0:         mOddByte = 0xFF;
michael@0:         mState = STATE_HALF_CODE_POINT;
michael@0:       }
michael@0:       break;
michael@0:   }
michael@0: 
michael@0:   // Little-endian data needs swapping exactly when the host is big-endian.
michael@0:   return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0:                                bool(MOZ_BIG_ENDIAN));
michael@0: }
michael@0: 
michael@0: // Clears the endianness decision and the BOM flag, then resets the shared
michael@0: // decoder state through the base class and returns its result.
michael@0: NS_IMETHODIMP
michael@0: nsUTF16ToUnicode::Reset()
michael@0: {
michael@0:   mFoundBOM = false;
michael@0:   mEndian = kUnknown;
michael@0: 
michael@0:   return nsUTF16ToUnicodeBase::Reset();
michael@0: }
michael@0: 
michael@0: NS_IMETHODIMP
michael@0: nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
michael@0:                           char16_t * aDest, int32_t * aDestLength)
michael@0: {
michael@0:     if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
michael@0:     {
michael@0:       nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
michael@0:       *aSrcLength=0;
michael@0:       *aDestLength=0;
michael@0:       return res;
michael@0:     }
michael@0:     if(STATE_FIRST_CALL == mState) // first time called
michael@0:     {
michael@0:       // check if BOM (0xFEFF) is at the beginning, remove it if found, and
michael@0:       // set mEndian accordingly.
michael@0:       if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
michael@0:         mEndian = kLittleEndian;
michael@0:         mFoundBOM = true;
michael@0:       }
michael@0:       else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
michael@0:         mEndian = kBigEndian;
michael@0:         mFoundBOM = true;
michael@0:       }
michael@0:       // BOM is not found, but we can use a simple heuristic to determine
michael@0:       // the endianness. Assume the first character is [U+0001, U+00FF].
michael@0:       // Not always valid, but it's very likely to hold for html/xml/css. 
michael@0:       else if(!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
michael@0:         mState = STATE_NORMAL;
michael@0:         mEndian = kBigEndian;
michael@0:       }
michael@0:       else if(aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
michael@0:         mState = STATE_NORMAL;
michael@0:         mEndian = kLittleEndian;
michael@0:       }
michael@0:       else { // Neither BOM nor 'plausible' byte patterns at the beginning.
michael@0:              // Just assume it's BE (following Unicode standard)
michael@0:              // and let the garbage show up in the browser. (security concern?)
michael@0:              // (bug 246194)
michael@0:         mState = STATE_NORMAL;
michael@0:         mEndian = kBigEndian;
michael@0:       }
michael@0:     }
michael@0:     
michael@0:     nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
michael@0: #if MOZ_BIG_ENDIAN
michael@0:                                         (mEndian == kLittleEndian)
michael@0: #else
michael@0:                                         (mEndian == kBigEndian)
michael@0: #endif
michael@0:                                         );
michael@0: 
michael@0:     // If BOM is not found and we're to return NS_OK, signal that BOM
michael@0:     // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
michael@0:     return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
michael@0: }