1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,326 @@ 1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +/** 1.9 + * A character set converter from GBK to Unicode. 1.10 + * 1.11 + * 1.12 + * @created 07/Sept/1999 1.13 + * @author Yueheng Xu, Yueheng.Xu@intel.com 1.14 + */ 1.15 + 1.16 +#include "nsGBKToUnicode.h" 1.17 +#include "gbku.h" 1.18 + 1.19 + 1.20 +//------------------------------------------------------------ 1.21 +// nsGBKUnique2BytesToUnicode 1.22 +//------------------------------------------------------------ 1.23 +class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport 1.24 +{ 1.25 +public: 1.26 + nsGBKUnique2BytesToUnicode(); 1.27 + virtual ~nsGBKUnique2BytesToUnicode() 1.28 + { } 1.29 +protected: 1.30 +}; 1.31 + 1.32 +static const uint16_t g_utGBKUnique2Bytes[] = { 1.33 +#include "gbkuniq2b.ut" 1.34 +}; 1.35 +nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() 1.36 + : nsTableDecoderSupport(u2BytesCharset, nullptr, 1.37 + (uMappingTable*) &g_utGBKUnique2Bytes, 1) 1.38 +{ 1.39 +} 1.40 + 1.41 +//------------------------------------------------------------ 1.42 +// nsGB18030Unique2BytesToUnicode 1.43 +//------------------------------------------------------------ 1.44 +class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport 1.45 +{ 1.46 +public: 1.47 + nsGB18030Unique2BytesToUnicode(); 1.48 + virtual ~nsGB18030Unique2BytesToUnicode() 1.49 + { } 1.50 +protected: 1.51 +}; 1.52 + 1.53 +static const uint16_t g_utGB18030Unique2Bytes[] = { 1.54 +#include "gb18030uniq2b.ut" 1.55 +}; 1.56 +nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() 1.57 + : nsTableDecoderSupport(u2BytesCharset, nullptr, 1.58 + (uMappingTable*) &g_utGB18030Unique2Bytes, 1) 1.59 +{ 1.60 +} 1.61 + 1.62 +//------------------------------------------------------------ 1.63 +// nsGB18030Unique4BytesToUnicode 1.64 +//------------------------------------------------------------ 1.65 +class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport 1.66 +{ 1.67 +public: 1.68 + nsGB18030Unique4BytesToUnicode(); 1.69 + virtual ~nsGB18030Unique4BytesToUnicode() 1.70 + { } 1.71 +protected: 1.72 +}; 1.73 + 1.74 +static const uint16_t g_utGB18030Unique4Bytes[] = { 1.75 +#include "gb180304bytes.ut" 1.76 +}; 1.77 +nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() 1.78 + : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr, 1.79 + (uMappingTable*) &g_utGB18030Unique4Bytes, 1) 1.80 +{ 1.81 +} 1.82 + 1.83 + 1.84 +//---------------------------------------------------------------------- 1.85 +// Class nsGBKToUnicode [implementation] 1.86 + 1.87 +//---------------------------------------------------------------------- 1.88 +// Subclassing of nsTablesDecoderSupport class [implementation] 1.89 + 1.90 +#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c) \ 1.91 + (UINT8_IN_RANGE(0x81, (c), 0xFE)) 1.92 +#define FIRST_BYTE_IS_SURROGATE(c) \ 1.93 + (UINT8_IN_RANGE(0x90, (c), 0xFE)) 1.94 +#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \ 1.95 + (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE)) 1.96 +#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \ 1.97 + (UINT8_IN_RANGE(0x30, (c), 0x39)) 1.98 +#define LEGAL_GBK_4BYTE_THIRD_BYTE(c) \ 1.99 + (UINT8_IN_RANGE(0x81, (c), 0xFE)) 1.100 +#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \ 1.101 + (UINT8_IN_RANGE(0x30, (c), 0x39)) 1.102 + 1.103 +NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc, 1.104 + int32_t * aSrcLength, 1.105 + char16_t *aDest, 1.106 + int32_t * aDestLength) 1.107 +{ 1.108 + int32_t i=0; 1.109 + int32_t iSrcLength = (*aSrcLength); 1.110 + int32_t iDestlen = 0; 1.111 + nsresult rv=NS_OK; 1.112 + *aSrcLength = 0; 1.113 + 1.114 + for (i=0;i<iSrcLength;i++) 1.115 + { 1.116 + if ( iDestlen >= (*aDestLength) ) 1.117 + { 1.118 + rv = NS_OK_UDEC_MOREOUTPUT; 1.119 + break; 1.120 + } 1.121 + // The valid range for the 1st byte is [0x81,0xFE] 1.122 + if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc)) 1.123 + { 1.124 + if(i+1 >= iSrcLength) 1.125 + { 1.126 + rv = NS_OK_UDEC_MOREINPUT; 1.127 + break; 1.128 + } 1.129 + // To make sure, the second byte has to be checked as well. 1.130 + // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE] 1.131 + if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1])) 1.132 + { 1.133 + // Valid GBK code 1.134 + *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]); 1.135 + if(UCS2_NO_MAPPING == *aDest) 1.136 + { 1.137 + // We cannot map in the common mapping, let's call the 1.138 + // delegate 2 byte decoder to decode the gbk or gb18030 unique 1.139 + // 2 byte mapping 1.140 + if(! TryExtensionDecoder(aSrc, aDest)) 1.141 + { 1.142 + *aDest = UCS2_NO_MAPPING; 1.143 + } 1.144 + } 1.145 + aSrc += 2; 1.146 + i++; 1.147 + } 1.148 + else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) 1.149 + { 1.150 + // from the first 2 bytes, it looks like a 4 byte GB18030 1.151 + if(i+3 >= iSrcLength) // make sure we got 4 bytes 1.152 + { 1.153 + rv = NS_OK_UDEC_MOREINPUT; 1.154 + break; 1.155 + } 1.156 + // 4 bytes patten 1.157 + // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39] 1.158 + // preset the 1.159 + 1.160 + if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) && 1.161 + LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) 1.162 + { 1.163 + if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 1.164 + { 1.165 + // let's call the delegated 4 byte gb18030 converter to convert it 1.166 + if(! Try4BytesDecoder(aSrc, aDest)) 1.167 + *aDest = UCS2_NO_MAPPING; 1.168 + } else { 1.169 + // let's try supplement mapping 1.170 + if ( (iDestlen+1) < (*aDestLength) ) 1.171 + { 1.172 + if(DecodeToSurrogate(aSrc, aDest)) 1.173 + { 1.174 + // surrogte two char16_t 1.175 + iDestlen++; 1.176 + aDest++; 1.177 + } else { 1.178 + *aDest = UCS2_NO_MAPPING; 1.179 + } 1.180 + } else { 1.181 + if (*aDestLength < 2) { 1.182 + NS_ERROR("insufficient space in output buffer"); 1.183 + *aDest = UCS2_NO_MAPPING; 1.184 + } else { 1.185 + rv = NS_OK_UDEC_MOREOUTPUT; 1.186 + break; 1.187 + } 1.188 + } 1.189 + } 1.190 + aSrc += 4; 1.191 + i += 3; 1.192 + } else { 1.193 + *aDest = UCS2_NO_MAPPING; 1.194 + // If the third and fourth bytes are not in the legal ranges for 1.195 + // a four-byte sequnce, resynchronize on the second byte 1.196 + // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE, 1.197 + // 0x30-0x39) 1.198 + aSrc++; 1.199 + } 1.200 + } 1.201 + else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 ) 1.202 + { 1.203 + // stand-alone (not followed by a valid second byte) 0xA0 ! 1.204 + // treat it as valid a la Netscape 4.x 1.205 + *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); 1.206 + aSrc++; 1.207 + } else { 1.208 + // Invalid GBK code point (second byte should be 0x40 or higher) 1.209 + *aDest = UCS2_NO_MAPPING; 1.210 + aSrc++; 1.211 + } 1.212 + } else { 1.213 + if(IS_ASCII(*aSrc)) 1.214 + { 1.215 + // The source is an ASCII 1.216 + *aDest = CAST_CHAR_TO_UNICHAR(*aSrc); 1.217 + aSrc++; 1.218 + } else { 1.219 + if(IS_GBK_EURO(*aSrc)) { 1.220 + *aDest = UCS2_EURO; 1.221 + } else { 1.222 + *aDest = UCS2_NO_MAPPING; 1.223 + } 1.224 + aSrc++; 1.225 + } 1.226 + } 1.227 + iDestlen++; 1.228 + aDest++; 1.229 + *aSrcLength = i+1; 1.230 + } 1.231 + *aDestLength = iDestlen; 1.232 + return rv; 1.233 +} 1.234 + 1.235 + 1.236 +void nsGBKToUnicode::CreateExtensionDecoder() 1.237 +{ 1.238 + mExtensionDecoder = new nsGBKUnique2BytesToUnicode(); 1.239 +} 1.240 +void nsGBKToUnicode::Create4BytesDecoder() 1.241 +{ 1.242 + m4BytesDecoder = nullptr; 1.243 +} 1.244 +void nsGB18030ToUnicode::CreateExtensionDecoder() 1.245 +{ 1.246 + mExtensionDecoder = new nsGB18030Unique2BytesToUnicode(); 1.247 +} 1.248 +void nsGB18030ToUnicode::Create4BytesDecoder() 1.249 +{ 1.250 + m4BytesDecoder = new nsGB18030Unique4BytesToUnicode(); 1.251 +} 1.252 +bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) 1.253 +{ 1.254 + NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte"); 1.255 + NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte"); 1.256 + NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte"); 1.257 + NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte"); 1.258 + if(! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 1.259 + return false; 1.260 + if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1])) 1.261 + return false; 1.262 + if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2])) 1.263 + return false; 1.264 + if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3])) 1.265 + return false; 1.266 + 1.267 + uint8_t a1 = (uint8_t) aSrc[0]; 1.268 + uint8_t a2 = (uint8_t) aSrc[1]; 1.269 + uint8_t a3 = (uint8_t) aSrc[2]; 1.270 + uint8_t a4 = (uint8_t) aSrc[3]; 1.271 + a1 -= (uint8_t)0x90; 1.272 + a2 -= (uint8_t)0x30; 1.273 + a3 -= (uint8_t)0x81; 1.274 + a4 -= (uint8_t)0x30; 1.275 + uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4; 1.276 + // idx == ucs4Codepoint - 0x10000 1.277 + if (idx > 0x000FFFFF) 1.278 + return false; 1.279 + 1.280 + *aOut++ = 0xD800 | (idx >> 10); 1.281 + *aOut = 0xDC00 | (0x000003FF & idx); 1.282 + 1.283 + return true; 1.284 +} 1.285 +bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut) 1.286 +{ 1.287 + if(!mExtensionDecoder) 1.288 + CreateExtensionDecoder(); 1.289 + NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter"); 1.290 + if(mExtensionDecoder) 1.291 + { 1.292 + nsresult res = mExtensionDecoder->Reset(); 1.293 + NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed"); 1.294 + int32_t len = 2; 1.295 + int32_t dstlen = 1; 1.296 + res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); 1.297 + NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), 1.298 + "some strange conversion result"); 1.299 + // if we failed, we then just use the 0xfffd 1.300 + // therefore, we ignore the res here. 1.301 + if(NS_SUCCEEDED(res)) 1.302 + return true; 1.303 + } 1.304 + return false; 1.305 +} 1.306 +bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut) 1.307 +{ 1.308 + return false; 1.309 +} 1.310 +bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut) 1.311 +{ 1.312 + if(!m4BytesDecoder) 1.313 + Create4BytesDecoder(); 1.314 + if(m4BytesDecoder) 1.315 + { 1.316 + nsresult res = m4BytesDecoder->Reset(); 1.317 + NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed"); 1.318 + int32_t len = 4; 1.319 + int32_t dstlen = 1; 1.320 + res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); 1.321 + NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), 1.322 + "some strange conversion result"); 1.323 + // if we failed, we then just use the 0xfffd 1.324 + // therefore, we ignore the res here. 1.325 + if(NS_SUCCEEDED(res)) 1.326 + return true; 1.327 + } 1.328 + return false; 1.329 +}