intl/uconv/ucvcn/nsGBKToUnicode.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/ucvcn/nsGBKToUnicode.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,326 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +/**
     1.9 + * A character set converter from GBK to Unicode.
    1.10 + * 
    1.11 + *
    1.12 + * @created         07/Sept/1999
    1.13 + * @author  Yueheng Xu, Yueheng.Xu@intel.com
    1.14 + */
    1.15 +
    1.16 +#include "nsGBKToUnicode.h"
    1.17 +#include "gbku.h"
    1.18 +
    1.19 +
    1.20 +//------------------------------------------------------------
    1.21 +// nsGBKUnique2BytesToUnicode -- decoder for the GBK-specific 2-byte mappings (table data: gbkuniq2b.ut)
    1.22 +//------------------------------------------------------------
    1.23 +class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport  // instantiated by nsGBKToUnicode::CreateExtensionDecoder()
    1.24 +{
    1.25 +public:
    1.26 +  nsGBKUnique2BytesToUnicode();  // defined below; passes the mapping table to the base class
    1.27 +  virtual ~nsGBKUnique2BytesToUnicode()  // empty body: this class declares no members of its own
    1.28 +    { }
    1.29 +protected:  // no protected members are declared
    1.30 +};
    1.31 +
    1.32 +static const uint16_t g_utGBKUnique2Bytes[] = {  // mapping table; contents are textually included from gbkuniq2b.ut
    1.33 +#include "gbkuniq2b.ut"
    1.34 +};
    1.35 +nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode()  // constructor: only forwards fixed arguments to nsTableDecoderSupport
    1.36 +  : nsTableDecoderSupport(u2BytesCharset, nullptr,
    1.37 +        (uMappingTable*) &g_utGBKUnique2Bytes, 1)  // NOTE(review): meaning of nullptr and the trailing 1 is set by nsTableDecoderSupport, not visible here
    1.38 +{
    1.39 +}
    1.40 +
    1.41 +//------------------------------------------------------------
    1.42 +// nsGB18030Unique2BytesToUnicode -- decoder for the GB18030-specific 2-byte mappings (table data: gb18030uniq2b.ut)
    1.43 +//------------------------------------------------------------
    1.44 +class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport  // instantiated by nsGB18030ToUnicode::CreateExtensionDecoder()
    1.45 +{
    1.46 +public:
    1.47 +  nsGB18030Unique2BytesToUnicode();  // defined below; passes the mapping table to the base class
    1.48 +  virtual ~nsGB18030Unique2BytesToUnicode()  // empty body: this class declares no members of its own
    1.49 +    { }
    1.50 +protected:  // no protected members are declared
    1.51 +};
    1.52 +
    1.53 +static const uint16_t g_utGB18030Unique2Bytes[] = {  // mapping table; contents are textually included from gb18030uniq2b.ut
    1.54 +#include "gb18030uniq2b.ut"
    1.55 +};
    1.56 +nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode()  // constructor: only forwards fixed arguments to nsTableDecoderSupport
    1.57 +  : nsTableDecoderSupport(u2BytesCharset, nullptr,
    1.58 +        (uMappingTable*) &g_utGB18030Unique2Bytes, 1)  // NOTE(review): meaning of nullptr and the trailing 1 is set by nsTableDecoderSupport, not visible here
    1.59 +{
    1.60 +}
    1.61 +
    1.62 +//------------------------------------------------------------
    1.63 +// nsGB18030Unique4BytesToUnicode -- decoder for GB18030 4-byte sequences (table data: gb180304bytes.ut)
    1.64 +//------------------------------------------------------------
    1.65 +class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport  // instantiated by nsGB18030ToUnicode::Create4BytesDecoder()
    1.66 +{
    1.67 +public:
    1.68 +  nsGB18030Unique4BytesToUnicode();  // defined below; passes the mapping table to the base class
    1.69 +  virtual ~nsGB18030Unique4BytesToUnicode()  // empty body: this class declares no members of its own
    1.70 +    { }
    1.71 +protected:  // no protected members are declared
    1.72 +};
    1.73 +
    1.74 +static const uint16_t g_utGB18030Unique4Bytes[] = {  // mapping table; contents are textually included from gb180304bytes.ut
    1.75 +#include "gb180304bytes.ut"
    1.76 +};
    1.77 +nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode()  // constructor: only forwards fixed arguments to nsTableDecoderSupport
    1.78 +  : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr,
    1.79 +        (uMappingTable*) &g_utGB18030Unique4Bytes, 1)  // NOTE(review): meaning of nullptr and the trailing 1 is set by nsTableDecoderSupport, not visible here
    1.80 +{
    1.81 +}
    1.82 +
    1.83 +
    1.84 +//----------------------------------------------------------------------
    1.85 +// Class nsGBKToUnicode [implementation]
    1.86 +
    1.87 +//----------------------------------------------------------------------
    1.88 +// Subclassing of nsTablesDecoderSupport class [implementation]
    1.89 +
    1.90 +#define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)  \
    1.91 +      (UINT8_IN_RANGE(0x81, (c), 0xFE))  // lead byte of any 2-byte or 4-byte sequence
    1.92 +#define FIRST_BYTE_IS_SURROGATE(c)  \
    1.93 +      (UINT8_IN_RANGE(0x90, (c), 0xFE))  // lead byte of a 4-byte sequence that ConvertNoBuff hands to DecodeToSurrogate
    1.94 +#define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
    1.95 +      (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))  // trail byte of a 2-byte sequence; 0x7F is excluded
    1.96 +#define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
    1.97 +      (UINT8_IN_RANGE(0x30, (c), 0x39))  // second byte of a 4-byte sequence (the ASCII digit range)
    1.98 +#define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  \
    1.99 +      (UINT8_IN_RANGE(0x81, (c), 0xFE))  // third byte of a 4-byte sequence
   1.100 +#define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
   1.101 +      (UINT8_IN_RANGE(0x30, (c), 0x39))  // fourth byte of a 4-byte sequence ("FORTH" spelling kept: the name is used below)
   1.102 +
   1.103 +NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,  // in: bytes to decode
   1.104 +                                            int32_t * aSrcLength,  // in: bytes available at aSrc; out: bytes consumed
   1.105 +                                            char16_t *aDest,  // out: decoded char16_t units
   1.106 +                                            int32_t * aDestLength)  // in: capacity of aDest in char16_t; out: units written
   1.107 +{  // Decodes ASCII, 2-byte and 4-byte sequences; returns NS_OK, or NS_OK_UDEC_MOREOUTPUT / NS_OK_UDEC_MOREINPUT when it stops early.
   1.108 +  int32_t i=0;  // index of the byte currently being examined
   1.109 +  int32_t iSrcLength = (*aSrcLength);  // captured before *aSrcLength is reused as an output below
   1.110 +  int32_t iDestlen = 0;  // number of char16_t units written so far
   1.111 +  nsresult rv=NS_OK;
   1.112 +  *aSrcLength = 0;  // stays 0 until at least one complete character has been consumed
   1.113 +  
   1.114 +  for (i=0;i<iSrcLength;i++)  // the loop accounts for one byte; multibyte branches add the remaining bytes to i
   1.115 +  {
   1.116 +    if ( iDestlen >= (*aDestLength) )  // output buffer is full
   1.117 +    {
   1.118 +      rv = NS_OK_UDEC_MOREOUTPUT;
   1.119 +      break;
   1.120 +    }
   1.121 +    // The valid range for the 1st byte is [0x81,0xFE] 
   1.122 +    if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
   1.123 +    {
   1.124 +      if(i+1 >= iSrcLength)  // lead byte is the last byte available: the trail byte has not arrived yet
   1.125 +      {
   1.126 +        rv = NS_OK_UDEC_MOREINPUT;
   1.127 +        break;
   1.128 +      }
   1.129 +      // To make sure, the second byte has to be checked as well.
   1.130 +      // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
   1.131 +      if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
   1.132 +      {
   1.133 +        // Well-formed 2-byte sequence: try the common mapping first
   1.134 +        *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
   1.135 +        if(UCS2_NO_MAPPING == *aDest)
   1.136 +        { 
   1.137 +          // We cannot map in the common mapping, let's call the
   1.138 +          // delegate 2 byte decoder to decode the gbk or gb18030 unique 
   1.139 +          // 2 byte mapping
   1.140 +          if(! TryExtensionDecoder(aSrc, aDest))
   1.141 +          {
   1.142 +            *aDest = UCS2_NO_MAPPING;  // re-assert the placeholder in case the failed decoder touched *aDest
   1.143 +          }
   1.144 +        }
   1.145 +        aSrc += 2;
   1.146 +        i++;  // together with the loop's own i++ this skips both bytes
   1.147 +      }
   1.148 +      else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
   1.149 +      {
   1.150 +        // from the first 2 bytes, it looks like a 4 byte GB18030
   1.151 +        if(i+3 >= iSrcLength)  // make sure we got 4 bytes
   1.152 +        {
   1.153 +          rv = NS_OK_UDEC_MOREINPUT;
   1.154 +          break;
   1.155 +        }
   1.156 +        // 4 bytes pattern
   1.157 +        // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
   1.158 +        // the third and fourth bytes are validated before any decoder is invoked
   1.159 + 
   1.160 +        if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
   1.161 +            LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
   1.162 +        {
   1.163 +           if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0]))  // lead byte below 0x90: result fits in a single char16_t
   1.164 +           {
   1.165 +             // let's call the delegated 4 byte gb18030 converter to convert it
   1.166 +             if(! Try4BytesDecoder(aSrc, aDest))
   1.167 +               *aDest = UCS2_NO_MAPPING;
   1.168 +           } else {
   1.169 +              // let's try supplement mapping
   1.170 +             if ( (iDestlen+1) < (*aDestLength) )  // a surrogate pair needs room for two char16_t
   1.171 +             {
   1.172 +               if(DecodeToSurrogate(aSrc, aDest))
   1.173 +               {
   1.174 +                 // surrogate pair: two char16_t were written, so account for the extra one here
   1.175 +                 iDestlen++;
   1.176 +                 aDest++;
   1.177 +               }  else {
   1.178 +                 *aDest = UCS2_NO_MAPPING;
   1.179 +              }
   1.180 +             } else {
   1.181 +               if (*aDestLength < 2) {  // the buffer could never hold a pair, so asking for more output would not help
   1.182 +                 NS_ERROR("insufficient space in output buffer");
   1.183 +                 *aDest = UCS2_NO_MAPPING;
   1.184 +               } else {
   1.185 +                 rv = NS_OK_UDEC_MOREOUTPUT;  // only one slot left: stop before this character without consuming it
   1.186 +                 break;
   1.187 +               }
   1.188 +             }
   1.189 +           }
   1.190 +           aSrc += 4;
   1.191 +           i += 3;  // together with the loop's own i++ this skips all four bytes
   1.192 +        } else {
   1.193 +          *aDest = UCS2_NO_MAPPING; 
   1.194 +          // If the third and fourth bytes are not in the legal ranges for
   1.195 +          // a four-byte sequence, resynchronize on the second byte
   1.196 +          // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
   1.197 +          //  0x30-0x39)
   1.198 +          aSrc++;  // only the lead byte is consumed; the loop's i++ matches it
   1.199 +        }
   1.200 +      }
   1.201 +      else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
   1.202 +      {
   1.203 +        // stand-alone (not followed by a valid second byte) 0xA0 !
   1.204 +        // treat it as valid a la Netscape 4.x
   1.205 +        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
   1.206 +        aSrc++;
   1.207 +      } else {
   1.208 +        // Second byte is legal for neither the 2-byte nor the 4-byte form:
   1.209 +        *aDest = UCS2_NO_MAPPING;  // emit the placeholder and resynchronize on the next byte
   1.210 +        aSrc++;
   1.211 +      }
   1.212 +    } else {
   1.213 +      if(IS_ASCII(*aSrc))
   1.214 +      {
   1.215 +        // The source is an ASCII
   1.216 +        *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
   1.217 +        aSrc++;
   1.218 +      } else {
   1.219 +        if(IS_GBK_EURO(*aSrc)) {  // single-byte euro sign
   1.220 +          *aDest = UCS2_EURO;
   1.221 +        } else {
   1.222 +          *aDest = UCS2_NO_MAPPING;  // neither ASCII, the euro byte, nor a legal lead byte
   1.223 +        }
   1.224 +        aSrc++;
   1.225 +      }
   1.226 +    }
   1.227 +    iDestlen++;  // common tail: every path that reaches here wrote one unit at *aDest
   1.228 +    aDest++;
   1.229 +    *aSrcLength = i+1;  // bytes consumed so far, including the character just finished
   1.230 +  }
   1.231 +  *aDestLength = iDestlen;  // report the number of char16_t units written
   1.232 +  return rv;
   1.233 +}
   1.234 +
   1.235 +
   1.236 +void nsGBKToUnicode::CreateExtensionDecoder()  // called from TryExtensionDecoder when mExtensionDecoder is still null
   1.237 +{
   1.238 +  mExtensionDecoder = new nsGBKUnique2BytesToUnicode();  // GBK-specific 2-byte table decoder
   1.239 +}
   1.240 +void nsGBKToUnicode::Create4BytesDecoder()  // called from Try4BytesDecoder when m4BytesDecoder is still null
   1.241 +{
   1.242 +  m4BytesDecoder =  nullptr;  // this version installs no 4-byte decoder, so Try4BytesDecoder then returns false
   1.243 +}
   1.244 +void nsGB18030ToUnicode::CreateExtensionDecoder()  // GB18030 counterpart of nsGBKToUnicode::CreateExtensionDecoder
   1.245 +{
   1.246 +  mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();  // GB18030-specific 2-byte table decoder
   1.247 +}
   1.248 +void nsGB18030ToUnicode::Create4BytesDecoder()  // GB18030 counterpart of nsGBKToUnicode::Create4BytesDecoder
   1.249 +{
   1.250 +  m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();  // table decoder for 4-byte sequences
   1.251 +}
   1.252 +bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)  // aSrc: reads 4 bytes; aOut: receives 2 char16_t on success
   1.253 +{  // Arithmetically maps a 4-byte sequence with lead byte 0x90-0xFE to a surrogate pair; returns false, writing nothing, if it cannot.
   1.254 +  NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
   1.255 +  NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
   1.256 +  NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
   1.257 +  NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
   1.258 +  if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))  // the same four range checks again, this time rejecting the input with false
   1.259 +    return false;
   1.260 +  if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
   1.261 +    return false;
   1.262 +  if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
   1.263 +    return false;
   1.264 +  if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
   1.265 +    return false;
   1.266 +
   1.267 +  uint8_t a1 = (uint8_t) aSrc[0];
   1.268 +  uint8_t a2 = (uint8_t) aSrc[1];
   1.269 +  uint8_t a3 = (uint8_t) aSrc[2];
   1.270 +  uint8_t a4 = (uint8_t) aSrc[3];
   1.271 +  a1 -= (uint8_t)0x90;  // rebase each byte so that the lowest legal value becomes 0
   1.272 +  a2 -= (uint8_t)0x30;
   1.273 +  a3 -= (uint8_t)0x81;
   1.274 +  a4 -= (uint8_t)0x30;
   1.275 +  uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;  // mixed-radix index: bytes 2, 3 and 4 have 10, 126 and 10 legal values
   1.276 +  // idx == ucs4Codepoint - 0x10000
   1.277 +  if (idx > 0x000FFFFF)  // more than 20 bits cannot be expressed by the two 10-bit halves below
   1.278 +    return false;
   1.279 +
   1.280 +  *aOut++ = 0xD800 | (idx >> 10);  // first unit: upper 10 bits of idx
   1.281 +  *aOut = 0xDC00 | (0x000003FF & idx);  // second unit: lower 10 bits of idx
   1.282 +
   1.283 +  return true;
   1.284 +}
   1.285 +bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)  // aSrc: 2 bytes to decode; aOut: room for 1 char16_t
   1.286 +{  // Runs the 2-byte extension decoder on one character; returns true only if its Convert() call succeeded.
   1.287 +  if(!mExtensionDecoder)
   1.288 +    CreateExtensionDecoder();  // created on first use
   1.289 +  NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
   1.290 +  if(mExtensionDecoder)
   1.291 +  {
   1.292 +    nsresult res = mExtensionDecoder->Reset();  // the result is only asserted on, not acted upon
   1.293 +    NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
   1.294 +    int32_t len = 2;  // offer exactly two input bytes
   1.295 +    int32_t dstlen = 1;  // and exactly one output slot
   1.296 +    res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); 
   1.297 +    NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), 
   1.298 +       "some strange conversion result");
   1.299 +     // On failure the caller substitutes UCS2_NO_MAPPING (the 0xfffd placeholder),
   1.300 +     // so the specific error code is not propagated from here.
   1.301 +    if(NS_SUCCEEDED(res)) 
   1.302 +      return true;
   1.303 +  }
   1.304 +  return  false;
   1.305 +}
   1.306 +bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)  // both parameters are unused in this version
   1.307 +{
   1.308 +  return false;  // never produces a surrogate pair; ConvertNoBuff then emits UCS2_NO_MAPPING for the sequence
   1.309 +}
   1.310 +bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)  // aSrc: 4 bytes to decode; aOut: room for 1 char16_t
   1.311 +{  // Runs the 4-byte decoder on one character; returns false when no decoder exists or its Convert() call failed.
   1.312 +  if(!m4BytesDecoder)
   1.313 +    Create4BytesDecoder();  // created on first use; may leave the member null
   1.314 +  if(m4BytesDecoder)
   1.315 +  {
   1.316 +    nsresult res = m4BytesDecoder->Reset();  // the result is only asserted on, not acted upon
   1.317 +    NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
   1.318 +    int32_t len = 4;  // offer exactly four input bytes
   1.319 +    int32_t dstlen = 1;  // and exactly one output slot
   1.320 +    res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); 
   1.321 +    NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), 
   1.322 +       "some strange conversion result");
   1.323 +     // On failure the caller substitutes UCS2_NO_MAPPING (the 0xfffd placeholder),
   1.324 +     // so the specific error code is not propagated from here.
   1.325 +    if(NS_SUCCEEDED(res)) 
   1.326 +      return true;
   1.327 +  }
   1.328 +  return  false;
   1.329 +}

mercurial