michael@0: /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: /** michael@0: * A character set converter from GBK to Unicode. michael@0: * michael@0: * michael@0: * @created 07/Sept/1999 michael@0: * @author Yueheng Xu, Yueheng.Xu@intel.com michael@0: */ michael@0: michael@0: #include "nsGBKToUnicode.h" michael@0: #include "gbku.h" michael@0: michael@0: michael@0: //------------------------------------------------------------ michael@0: // nsGBKUnique2BytesToUnicode michael@0: //------------------------------------------------------------ michael@0: class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport michael@0: { michael@0: public: michael@0: nsGBKUnique2BytesToUnicode(); michael@0: virtual ~nsGBKUnique2BytesToUnicode() michael@0: { } michael@0: protected: michael@0: }; michael@0: michael@0: static const uint16_t g_utGBKUnique2Bytes[] = { michael@0: #include "gbkuniq2b.ut" michael@0: }; michael@0: nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() michael@0: : nsTableDecoderSupport(u2BytesCharset, nullptr, michael@0: (uMappingTable*) &g_utGBKUnique2Bytes, 1) michael@0: { michael@0: } michael@0: michael@0: //------------------------------------------------------------ michael@0: // nsGB18030Unique2BytesToUnicode michael@0: //------------------------------------------------------------ michael@0: class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport michael@0: { michael@0: public: michael@0: nsGB18030Unique2BytesToUnicode(); michael@0: virtual ~nsGB18030Unique2BytesToUnicode() michael@0: { } michael@0: protected: michael@0: }; michael@0: michael@0: static const uint16_t g_utGB18030Unique2Bytes[] = { michael@0: #include "gb18030uniq2b.ut" michael@0: }; michael@0: nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() michael@0: : nsTableDecoderSupport(u2BytesCharset, nullptr, michael@0: (uMappingTable*) &g_utGB18030Unique2Bytes, 1) michael@0: { michael@0: } michael@0: michael@0: //------------------------------------------------------------ michael@0: // nsGB18030Unique4BytesToUnicode michael@0: //------------------------------------------------------------ michael@0: class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport michael@0: { michael@0: public: michael@0: nsGB18030Unique4BytesToUnicode(); michael@0: virtual ~nsGB18030Unique4BytesToUnicode() michael@0: { } michael@0: protected: michael@0: }; michael@0: michael@0: static const uint16_t g_utGB18030Unique4Bytes[] = { michael@0: #include "gb180304bytes.ut" michael@0: }; michael@0: nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() michael@0: : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr, michael@0: (uMappingTable*) &g_utGB18030Unique4Bytes, 1) michael@0: { michael@0: } michael@0: michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // Class nsGBKToUnicode [implementation] michael@0: michael@0: //---------------------------------------------------------------------- michael@0: // Subclassing of nsTablesDecoderSupport class [implementation] michael@0: michael@0: #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) \ michael@0: (UINT8_IN_RANGE(0x81, (c), 0xFE)) michael@0: #define FIRST_BYTE_IS_SURROGATE(c) \ michael@0: (UINT8_IN_RANGE(0x90, (c), 0xFE)) michael@0: #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \ michael@0: (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE)) michael@0: #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \ michael@0: (UINT8_IN_RANGE(0x30, (c), 0x39)) michael@0: #define LEGAL_GBK_4BYTE_THIRD_BYTE(c) \ michael@0: (UINT8_IN_RANGE(0x81, (c), 0xFE)) michael@0: #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \ michael@0: (UINT8_IN_RANGE(0x30, (c), 0x39)) michael@0: michael@0: NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc, michael@0: int32_t * aSrcLength, michael@0: char16_t *aDest, michael@0: int32_t * aDestLength) michael@0: { michael@0: int32_t i=0; michael@0: int32_t iSrcLength = (*aSrcLength); michael@0: int32_t iDestlen = 0; michael@0: nsresult rv=NS_OK; michael@0: *aSrcLength = 0; michael@0: michael@0: for (i=0;i= (*aDestLength) ) michael@0: { michael@0: rv = NS_OK_UDEC_MOREOUTPUT; michael@0: break; michael@0: } michael@0: // The valid range for the 1st byte is [0x81,0xFE] michael@0: if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc)) michael@0: { michael@0: if(i+1 >= iSrcLength) michael@0: { michael@0: rv = NS_OK_UDEC_MOREINPUT; michael@0: break; michael@0: } michael@0: // To make sure, the second byte has to be checked as well. michael@0: // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE] michael@0: if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1])) michael@0: { michael@0: // Valid GBK code michael@0: *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]); michael@0: if(UCS2_NO_MAPPING == *aDest) michael@0: { michael@0: // We cannot map in the common mapping, let's call the michael@0: // delegate 2 byte decoder to decode the gbk or gb18030 unique michael@0: // 2 byte mapping michael@0: if(! TryExtensionDecoder(aSrc, aDest)) michael@0: { michael@0: *aDest = UCS2_NO_MAPPING; michael@0: } michael@0: } michael@0: aSrc += 2; michael@0: i++; michael@0: } michael@0: else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) michael@0: { michael@0: // from the first 2 bytes, it looks like a 4 byte GB18030 michael@0: if(i+3 >= iSrcLength) // make sure we got 4 bytes michael@0: { michael@0: rv = NS_OK_UDEC_MOREINPUT; michael@0: break; michael@0: } michael@0: // 4 bytes patten michael@0: // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39] michael@0: // preset the michael@0: michael@0: if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) && michael@0: LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) michael@0: { michael@0: if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) michael@0: { michael@0: // let's call the delegated 4 byte gb18030 converter to convert it michael@0: if(! Try4BytesDecoder(aSrc, aDest)) michael@0: *aDest = UCS2_NO_MAPPING; michael@0: } else { michael@0: // let's try supplement mapping michael@0: if ( (iDestlen+1) < (*aDestLength) ) michael@0: { michael@0: if(DecodeToSurrogate(aSrc, aDest)) michael@0: { michael@0: // surrogte two char16_t michael@0: iDestlen++; michael@0: aDest++; michael@0: } else { michael@0: *aDest = UCS2_NO_MAPPING; michael@0: } michael@0: } else { michael@0: if (*aDestLength < 2) { michael@0: NS_ERROR("insufficient space in output buffer"); michael@0: *aDest = UCS2_NO_MAPPING; michael@0: } else { michael@0: rv = NS_OK_UDEC_MOREOUTPUT; michael@0: break; michael@0: } michael@0: } michael@0: } michael@0: aSrc += 4; michael@0: i += 3; michael@0: } else { michael@0: *aDest = UCS2_NO_MAPPING; michael@0: // If the third and fourth bytes are not in the legal ranges for michael@0: // a four-byte sequnce, resynchronize on the second byte michael@0: // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE, michael@0: // 0x30-0x39) michael@0: aSrc++; michael@0: } michael@0: } michael@0: else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 ) michael@0: { michael@0: // stand-alone (not followed by a valid second byte) 0xA0 ! michael@0: // treat it as valid a la Netscape 4.x michael@0: *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); michael@0: aSrc++; michael@0: } else { michael@0: // Invalid GBK code point (second byte should be 0x40 or higher) michael@0: *aDest = UCS2_NO_MAPPING; michael@0: aSrc++; michael@0: } michael@0: } else { michael@0: if(IS_ASCII(*aSrc)) michael@0: { michael@0: // The source is an ASCII michael@0: *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); michael@0: aSrc++; michael@0: } else { michael@0: if(IS_GBK_EURO(*aSrc)) { michael@0: *aDest = UCS2_EURO; michael@0: } else { michael@0: *aDest = UCS2_NO_MAPPING; michael@0: } michael@0: aSrc++; michael@0: } michael@0: } michael@0: iDestlen++; michael@0: aDest++; michael@0: *aSrcLength = i+1; michael@0: } michael@0: *aDestLength = iDestlen; michael@0: return rv; michael@0: } michael@0: michael@0: michael@0: void nsGBKToUnicode::CreateExtensionDecoder() michael@0: { michael@0: mExtensionDecoder = new nsGBKUnique2BytesToUnicode(); michael@0: } michael@0: void nsGBKToUnicode::Create4BytesDecoder() michael@0: { michael@0: m4BytesDecoder = nullptr; michael@0: } michael@0: void nsGB18030ToUnicode::CreateExtensionDecoder() michael@0: { michael@0: mExtensionDecoder = new nsGB18030Unique2BytesToUnicode(); michael@0: } michael@0: void nsGB18030ToUnicode::Create4BytesDecoder() michael@0: { michael@0: m4BytesDecoder = new nsGB18030Unique4BytesToUnicode(); michael@0: } michael@0: bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) michael@0: { michael@0: NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte"); michael@0: NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte"); michael@0: NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte"); michael@0: NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte"); michael@0: if(! FIRST_BYTE_IS_SURROGATE(aSrc[0])) michael@0: return false; michael@0: if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) michael@0: return false; michael@0: if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2])) michael@0: return false; michael@0: if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) michael@0: return false; michael@0: michael@0: uint8_t a1 = (uint8_t) aSrc[0]; michael@0: uint8_t a2 = (uint8_t) aSrc[1]; michael@0: uint8_t a3 = (uint8_t) aSrc[2]; michael@0: uint8_t a4 = (uint8_t) aSrc[3]; michael@0: a1 -= (uint8_t)0x90; michael@0: a2 -= (uint8_t)0x30; michael@0: a3 -= (uint8_t)0x81; michael@0: a4 -= (uint8_t)0x30; michael@0: uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4; michael@0: // idx == ucs4Codepoint - 0x10000 michael@0: if (idx > 0x000FFFFF) michael@0: return false; michael@0: michael@0: *aOut++ = 0xD800 | (idx >> 10); michael@0: *aOut = 0xDC00 | (0x000003FF & idx); michael@0: michael@0: return true; michael@0: } michael@0: bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut) michael@0: { michael@0: if(!mExtensionDecoder) michael@0: CreateExtensionDecoder(); michael@0: NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter"); michael@0: if(mExtensionDecoder) michael@0: { michael@0: nsresult res = mExtensionDecoder->Reset(); michael@0: NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed"); michael@0: int32_t len = 2; michael@0: int32_t dstlen = 1; michael@0: res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); michael@0: NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), michael@0: "some strange conversion result"); michael@0: // if we failed, we then just use the 0xfffd michael@0: // therefore, we ignore the res here. michael@0: if(NS_SUCCEEDED(res)) michael@0: return true; michael@0: } michael@0: return false; michael@0: } michael@0: bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) michael@0: { michael@0: return false; michael@0: } michael@0: bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut) michael@0: { michael@0: if(!m4BytesDecoder) michael@0: Create4BytesDecoder(); michael@0: if(m4BytesDecoder) michael@0: { michael@0: nsresult res = m4BytesDecoder->Reset(); michael@0: NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed"); michael@0: int32_t len = 4; michael@0: int32_t dstlen = 1; michael@0: res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); michael@0: NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), michael@0: "some strange conversion result"); michael@0: // if we failed, we then just use the 0xfffd michael@0: // therefore, we ignore the res here. michael@0: if(NS_SUCCEEDED(res)) michael@0: return true; michael@0: } michael@0: return false; michael@0: }