intl/uconv/ucvcn/nsGBKToUnicode.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 /**
michael@0 6 * A character set converter from GBK to Unicode.
michael@0 7 *
michael@0 8 *
michael@0 9 * @created 07/Sept/1999
michael@0 10 * @author Yueheng Xu, Yueheng.Xu@intel.com
michael@0 11 */
michael@0 12
michael@0 13 #include "nsGBKToUnicode.h"
michael@0 14 #include "gbku.h"
michael@0 15
michael@0 16
michael@0 17 //------------------------------------------------------------
michael@0 18 // nsGBKUnique2BytesToUnicode
michael@0 19 //------------------------------------------------------------
michael@0 20 class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport
michael@0 21 {
michael@0 22 public:
michael@0 23 nsGBKUnique2BytesToUnicode();
michael@0 24 virtual ~nsGBKUnique2BytesToUnicode()
michael@0 25 { }
michael@0 26 protected:
michael@0 27 };
michael@0 28
michael@0 29 static const uint16_t g_utGBKUnique2Bytes[] = {
michael@0 30 #include "gbkuniq2b.ut"
michael@0 31 };
michael@0 32 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()
michael@0 33 : nsTableDecoderSupport(u2BytesCharset, nullptr,
michael@0 34 (uMappingTable*) &g_utGBKUnique2Bytes, 1)
michael@0 35 {
michael@0 36 }
michael@0 37
michael@0 38 //------------------------------------------------------------
michael@0 39 // nsGB18030Unique2BytesToUnicode
michael@0 40 //------------------------------------------------------------
michael@0 41 class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport
michael@0 42 {
michael@0 43 public:
michael@0 44 nsGB18030Unique2BytesToUnicode();
michael@0 45 virtual ~nsGB18030Unique2BytesToUnicode()
michael@0 46 { }
michael@0 47 protected:
michael@0 48 };
michael@0 49
michael@0 50 static const uint16_t g_utGB18030Unique2Bytes[] = {
michael@0 51 #include "gb18030uniq2b.ut"
michael@0 52 };
michael@0 53 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()
michael@0 54 : nsTableDecoderSupport(u2BytesCharset, nullptr,
michael@0 55 (uMappingTable*) &g_utGB18030Unique2Bytes, 1)
michael@0 56 {
michael@0 57 }
michael@0 58
michael@0 59 //------------------------------------------------------------
michael@0 60 // nsGB18030Unique4BytesToUnicode
michael@0 61 //------------------------------------------------------------
michael@0 62 class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport
michael@0 63 {
michael@0 64 public:
michael@0 65 nsGB18030Unique4BytesToUnicode();
michael@0 66 virtual ~nsGB18030Unique4BytesToUnicode()
michael@0 67 { }
michael@0 68 protected:
michael@0 69 };
michael@0 70
michael@0 71 static const uint16_t g_utGB18030Unique4Bytes[] = {
michael@0 72 #include "gb180304bytes.ut"
michael@0 73 };
michael@0 74 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()
michael@0 75 : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr,
michael@0 76 (uMappingTable*) &g_utGB18030Unique4Bytes, 1)
michael@0 77 {
michael@0 78 }
michael@0 79
michael@0 80
michael@0 81 //----------------------------------------------------------------------
michael@0 82 // Class nsGBKToUnicode [implementation]
michael@0 83
michael@0 84 //----------------------------------------------------------------------
michael@0 85 // Subclassing of nsTablesDecoderSupport class [implementation]
michael@0 86
// Byte-range predicates used while scanning GBK / GB18030 input.
// Each one evaluates its argument through UINT8_IN_RANGE.

// Lead byte of any multi-byte sequence: 0x81-0xFE.
#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)  \
  (UINT8_IN_RANGE(0x81, (c), 0xFE))

// Lead byte 0x90-0xFE: a four-byte sequence starting with such a byte
// is routed to DecodeToSurrogate() rather than the four-byte table
// decoder.
#define FIRST_BYTE_IS_SURROGATE(c)  \
  (UINT8_IN_RANGE(0x90, (c), 0xFE))

// Trail byte of a two-byte sequence: 0x40-0x7E or 0x80-0xFE.
#define LEGAL_GBK_2BYTE_SECOND_BYTE(c)  \
  (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))

// Second byte of a four-byte sequence: 0x30-0x39.
#define LEGAL_GBK_4BYTE_SECOND_BYTE(c)  \
  (UINT8_IN_RANGE(0x30, (c), 0x39))

// Third byte of a four-byte sequence: 0x81-0xFE, the same range as a
// multi-byte lead byte.
#define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  \
  LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)

// Fourth byte of a four-byte sequence: 0x30-0x39, the same range as
// the second byte.
#define LEGAL_GBK_4BYTE_FORTH_BYTE(c)  \
  LEGAL_GBK_4BYTE_SECOND_BYTE(c)
michael@0 99
michael@0 100 NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
michael@0 101 int32_t * aSrcLength,
michael@0 102 char16_t *aDest,
michael@0 103 int32_t * aDestLength)
michael@0 104 {
michael@0 105 int32_t i=0;
michael@0 106 int32_t iSrcLength = (*aSrcLength);
michael@0 107 int32_t iDestlen = 0;
michael@0 108 nsresult rv=NS_OK;
michael@0 109 *aSrcLength = 0;
michael@0 110
michael@0 111 for (i=0;i<iSrcLength;i++)
michael@0 112 {
michael@0 113 if ( iDestlen >= (*aDestLength) )
michael@0 114 {
michael@0 115 rv = NS_OK_UDEC_MOREOUTPUT;
michael@0 116 break;
michael@0 117 }
michael@0 118 // The valid range for the 1st byte is [0x81,0xFE]
michael@0 119 if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
michael@0 120 {
michael@0 121 if(i+1 >= iSrcLength)
michael@0 122 {
michael@0 123 rv = NS_OK_UDEC_MOREINPUT;
michael@0 124 break;
michael@0 125 }
michael@0 126 // To make sure, the second byte has to be checked as well.
michael@0 127 // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
michael@0 128 if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
michael@0 129 {
michael@0 130 // Valid GBK code
michael@0 131 *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
michael@0 132 if(UCS2_NO_MAPPING == *aDest)
michael@0 133 {
michael@0 134 // We cannot map in the common mapping, let's call the
michael@0 135 // delegate 2 byte decoder to decode the gbk or gb18030 unique
michael@0 136 // 2 byte mapping
michael@0 137 if(! TryExtensionDecoder(aSrc, aDest))
michael@0 138 {
michael@0 139 *aDest = UCS2_NO_MAPPING;
michael@0 140 }
michael@0 141 }
michael@0 142 aSrc += 2;
michael@0 143 i++;
michael@0 144 }
michael@0 145 else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
michael@0 146 {
michael@0 147 // from the first 2 bytes, it looks like a 4 byte GB18030
michael@0 148 if(i+3 >= iSrcLength) // make sure we got 4 bytes
michael@0 149 {
michael@0 150 rv = NS_OK_UDEC_MOREINPUT;
michael@0 151 break;
michael@0 152 }
michael@0 153 // 4 bytes patten
michael@0 154 // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
michael@0 155 // preset the
michael@0 156
michael@0 157 if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
michael@0 158 LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
michael@0 159 {
michael@0 160 if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
michael@0 161 {
michael@0 162 // let's call the delegated 4 byte gb18030 converter to convert it
michael@0 163 if(! Try4BytesDecoder(aSrc, aDest))
michael@0 164 *aDest = UCS2_NO_MAPPING;
michael@0 165 } else {
michael@0 166 // let's try supplement mapping
michael@0 167 if ( (iDestlen+1) < (*aDestLength) )
michael@0 168 {
michael@0 169 if(DecodeToSurrogate(aSrc, aDest))
michael@0 170 {
michael@0 171 // surrogte two char16_t
michael@0 172 iDestlen++;
michael@0 173 aDest++;
michael@0 174 } else {
michael@0 175 *aDest = UCS2_NO_MAPPING;
michael@0 176 }
michael@0 177 } else {
michael@0 178 if (*aDestLength < 2) {
michael@0 179 NS_ERROR("insufficient space in output buffer");
michael@0 180 *aDest = UCS2_NO_MAPPING;
michael@0 181 } else {
michael@0 182 rv = NS_OK_UDEC_MOREOUTPUT;
michael@0 183 break;
michael@0 184 }
michael@0 185 }
michael@0 186 }
michael@0 187 aSrc += 4;
michael@0 188 i += 3;
michael@0 189 } else {
michael@0 190 *aDest = UCS2_NO_MAPPING;
michael@0 191 // If the third and fourth bytes are not in the legal ranges for
michael@0 192 // a four-byte sequnce, resynchronize on the second byte
michael@0 193 // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
michael@0 194 // 0x30-0x39)
michael@0 195 aSrc++;
michael@0 196 }
michael@0 197 }
michael@0 198 else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
michael@0 199 {
michael@0 200 // stand-alone (not followed by a valid second byte) 0xA0 !
michael@0 201 // treat it as valid a la Netscape 4.x
michael@0 202 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
michael@0 203 aSrc++;
michael@0 204 } else {
michael@0 205 // Invalid GBK code point (second byte should be 0x40 or higher)
michael@0 206 *aDest = UCS2_NO_MAPPING;
michael@0 207 aSrc++;
michael@0 208 }
michael@0 209 } else {
michael@0 210 if(IS_ASCII(*aSrc))
michael@0 211 {
michael@0 212 // The source is an ASCII
michael@0 213 *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
michael@0 214 aSrc++;
michael@0 215 } else {
michael@0 216 if(IS_GBK_EURO(*aSrc)) {
michael@0 217 *aDest = UCS2_EURO;
michael@0 218 } else {
michael@0 219 *aDest = UCS2_NO_MAPPING;
michael@0 220 }
michael@0 221 aSrc++;
michael@0 222 }
michael@0 223 }
michael@0 224 iDestlen++;
michael@0 225 aDest++;
michael@0 226 *aSrcLength = i+1;
michael@0 227 }
michael@0 228 *aDestLength = iDestlen;
michael@0 229 return rv;
michael@0 230 }
michael@0 231
michael@0 232
michael@0 233 void nsGBKToUnicode::CreateExtensionDecoder()
michael@0 234 {
michael@0 235 mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
michael@0 236 }
michael@0 237 void nsGBKToUnicode::Create4BytesDecoder()
michael@0 238 {
michael@0 239 m4BytesDecoder = nullptr;
michael@0 240 }
michael@0 241 void nsGB18030ToUnicode::CreateExtensionDecoder()
michael@0 242 {
michael@0 243 mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
michael@0 244 }
michael@0 245 void nsGB18030ToUnicode::Create4BytesDecoder()
michael@0 246 {
michael@0 247 m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
michael@0 248 }
michael@0 249 bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
michael@0 250 {
michael@0 251 NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]), "illegal first byte");
michael@0 252 NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]), "illegal second byte");
michael@0 253 NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]), "illegal third byte");
michael@0 254 NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]), "illegal forth byte");
michael@0 255 if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
michael@0 256 return false;
michael@0 257 if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
michael@0 258 return false;
michael@0 259 if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
michael@0 260 return false;
michael@0 261 if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
michael@0 262 return false;
michael@0 263
michael@0 264 uint8_t a1 = (uint8_t) aSrc[0];
michael@0 265 uint8_t a2 = (uint8_t) aSrc[1];
michael@0 266 uint8_t a3 = (uint8_t) aSrc[2];
michael@0 267 uint8_t a4 = (uint8_t) aSrc[3];
michael@0 268 a1 -= (uint8_t)0x90;
michael@0 269 a2 -= (uint8_t)0x30;
michael@0 270 a3 -= (uint8_t)0x81;
michael@0 271 a4 -= (uint8_t)0x30;
michael@0 272 uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
michael@0 273 // idx == ucs4Codepoint - 0x10000
michael@0 274 if (idx > 0x000FFFFF)
michael@0 275 return false;
michael@0 276
michael@0 277 *aOut++ = 0xD800 | (idx >> 10);
michael@0 278 *aOut = 0xDC00 | (0x000003FF & idx);
michael@0 279
michael@0 280 return true;
michael@0 281 }
michael@0 282 bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
michael@0 283 {
michael@0 284 if(!mExtensionDecoder)
michael@0 285 CreateExtensionDecoder();
michael@0 286 NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
michael@0 287 if(mExtensionDecoder)
michael@0 288 {
michael@0 289 nsresult res = mExtensionDecoder->Reset();
michael@0 290 NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
michael@0 291 int32_t len = 2;
michael@0 292 int32_t dstlen = 1;
michael@0 293 res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen);
michael@0 294 NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)),
michael@0 295 "some strange conversion result");
michael@0 296 // if we failed, we then just use the 0xfffd
michael@0 297 // therefore, we ignore the res here.
michael@0 298 if(NS_SUCCEEDED(res))
michael@0 299 return true;
michael@0 300 }
michael@0 301 return false;
michael@0 302 }
michael@0 303 bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
michael@0 304 {
michael@0 305 return false;
michael@0 306 }
michael@0 307 bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
michael@0 308 {
michael@0 309 if(!m4BytesDecoder)
michael@0 310 Create4BytesDecoder();
michael@0 311 if(m4BytesDecoder)
michael@0 312 {
michael@0 313 nsresult res = m4BytesDecoder->Reset();
michael@0 314 NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
michael@0 315 int32_t len = 4;
michael@0 316 int32_t dstlen = 1;
michael@0 317 res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen);
michael@0 318 NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)),
michael@0 319 "some strange conversion result");
michael@0 320 // if we failed, we then just use the 0xfffd
michael@0 321 // therefore, we ignore the res here.
michael@0 322 if(NS_SUCCEEDED(res))
michael@0 323 return true;
michael@0 324 }
michael@0 325 return false;
michael@0 326 }

mercurial