intl/uconv/ucvcn/nsUnicodeToGBK.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/ucvcn/nsUnicodeToGBK.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,310 @@
     1.4 +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 + /**
     1.9 + * A character set converter from Unicode to GBK.
    1.10 + * 
    1.11 + *
    1.12 + * @created         08/Sept/1999
    1.13 + * @author  Yueheng Xu, Yueheng.Xu@intel.com
    1.14 + * Revision History
    1.15 + * 04/Oct/1999. Yueheng Xu: used table gUnicodeToGBKTable[0x5200] to make 
    1.16 + *              Unicode to GB mapping fast 
    1.17 + */
    1.18 +
    1.19 +#include "nsUnicodeToGBK.h"
    1.20 +#include "gbku.h"
    1.21 +#include "uconvutil.h"
    1.22 +#include "nsCharTraits.h"
    1.23 +
    1.24 +//-------------------------------------------------------------
    1.25 +// Global table initialization function defined in gbku.h
    1.26 +//-------------------------------------------------------------
    1.27 +
    1.28 +//-----------------------------------------------------------------------
    1.29 +//  Private class used by nsUnicodeToGB18030 and nsUnicodeToGB18030Font0
    1.30 +//    nsUnicodeToGB18030Uniq2Bytes
    1.31 +//-----------------------------------------------------------------------
    1.32 +static const uint16_t g_uf_gb18030_2bytes[] = {
    1.33 +#include "gb18030uniq2b.uf"
    1.34 +};
    1.35 +class nsUnicodeToGB18030Uniq2Bytes : public nsTableEncoderSupport
    1.36 +{
    1.37 +public: 
    1.38 +  nsUnicodeToGB18030Uniq2Bytes() 
    1.39 +    : nsTableEncoderSupport(u2BytesCharset,
    1.40 +                            (uMappingTable*) &g_uf_gb18030_2bytes, 2) {}
    1.41 +protected: 
    1.42 +};
    1.43 +//-----------------------------------------------------------------------
    1.44 +//  Private class used by nsUnicodeToGB18030
    1.45 +//    nsUnicodeTo4BytesGB18030
    1.46 +//-----------------------------------------------------------------------
    1.47 +static const uint16_t g_uf_gb18030_4bytes[] = {
    1.48 +#include "gb180304bytes.uf"
    1.49 +};
    1.50 +class nsUnicodeTo4BytesGB18030 : public nsTableEncoderSupport
    1.51 +{
    1.52 +public: 
    1.53 +  nsUnicodeTo4BytesGB18030()
    1.54 +    : nsTableEncoderSupport(u4BytesGB18030Charset, 
    1.55 +                             (uMappingTable*) &g_uf_gb18030_4bytes, 4) {}
    1.56 +protected: 
    1.57 +};
    1.58 +//-----------------------------------------------------------------------
    1.59 +//  Private class used by nsUnicodeToGBK
    1.60 +//    nsUnicodeToGBKUniq2Bytes
    1.61 +//-----------------------------------------------------------------------
    1.62 +static const uint16_t g_uf_gbk_2bytes[] = {
    1.63 +#include "gbkuniq2b.uf"
    1.64 +};
    1.65 +class nsUnicodeToGBKUniq2Bytes : public nsTableEncoderSupport
    1.66 +{
    1.67 +public: 
    1.68 +  nsUnicodeToGBKUniq2Bytes()
    1.69 +    : nsTableEncoderSupport(u2BytesCharset, 
    1.70 +                             (uMappingTable*) &g_uf_gbk_2bytes, 2) {}
    1.71 +protected: 
    1.72 +};
    1.73 +//-----------------------------------------------------------------------
    1.74 +//  nsUnicodeToGB18030
    1.75 +//-----------------------------------------------------------------------
    1.76 +void nsUnicodeToGB18030::CreateExtensionEncoder()
    1.77 +{
    1.78 +  mExtensionEncoder = new nsUnicodeToGB18030Uniq2Bytes();
    1.79 +}
    1.80 +void nsUnicodeToGB18030::Create4BytesEncoder()
    1.81 +{
    1.82 +  m4BytesEncoder = new nsUnicodeTo4BytesGB18030();
    1.83 +}
    1.84 +
    1.85 +bool nsUnicodeToGB18030::EncodeSurrogate(
    1.86 +  char16_t aSurrogateHigh,
    1.87 +  char16_t aSurrogateLow,
    1.88 +  char* aOut)
    1.89 +{
    1.90 +  if( NS_IS_HIGH_SURROGATE(aSurrogateHigh) && 
    1.91 +      NS_IS_LOW_SURROGATE(aSurrogateLow) )
    1.92 +  {
    1.93 +    // notice that idx does not include the 0x10000 
    1.94 +    uint32_t idx = ((aSurrogateHigh - (char16_t)0xD800) << 10 ) |
    1.95 +                   (aSurrogateLow - (char16_t) 0xDC00);
    1.96 +
    1.97 +    unsigned char *out = (unsigned char*) aOut;
    1.98 +    // notice this is from 0x90 for supplment planes
    1.99 +    out[0] = (idx / (10*126*10)) + 0x90; 
   1.100 +    idx %= (10*126*10);
   1.101 +    out[1] = (idx / (10*126)) + 0x30;
   1.102 +    idx %= (10*126);
   1.103 +    out[2] = (idx / (10)) + 0x81;
   1.104 +    out[3] = (idx % 10) + 0x30;
   1.105 +    return true;
   1.106 +  } 
   1.107 +  return false; 
   1.108 +} 
   1.109 +
   1.110 +//----------------------------------------------------------------------
   1.111 +// Class nsUnicodeToGBK [implementation]
   1.112 +
   1.113 +nsUnicodeToGBK::nsUnicodeToGBK(uint32_t aMaxLength) :
   1.114 +  nsEncoderSupport(aMaxLength)
   1.115 +{
   1.116 +  mExtensionEncoder = nullptr;
   1.117 +  m4BytesEncoder = nullptr;
   1.118 +  mSurrogateHigh = 0;
   1.119 +}
   1.120 +void nsUnicodeToGBK::CreateExtensionEncoder()
   1.121 +{
   1.122 +  mExtensionEncoder = new nsUnicodeToGBKUniq2Bytes();
   1.123 +}
   1.124 +void nsUnicodeToGBK::Create4BytesEncoder()
   1.125 +{
   1.126 +  m4BytesEncoder = nullptr;
   1.127 +}
   1.128 +bool nsUnicodeToGBK::TryExtensionEncoder(
   1.129 +  char16_t aChar,
   1.130 +  char* aOut,
   1.131 +  int32_t *aOutLen
   1.132 +)
   1.133 +{
   1.134 +  if( NS_IS_HIGH_SURROGATE(aChar) || 
   1.135 +      NS_IS_LOW_SURROGATE(aChar) )
   1.136 +  {
   1.137 +    // performance tune for surrogate characters
   1.138 +    return false;
   1.139 +  }
   1.140 +  if(! mExtensionEncoder )
   1.141 +    CreateExtensionEncoder();
   1.142 +  if(mExtensionEncoder) 
   1.143 +  {
   1.144 +    int32_t len = 1;
   1.145 +    nsresult res = NS_OK;
   1.146 +    res = mExtensionEncoder->Convert(&aChar, &len, aOut, aOutLen);
   1.147 +    if(NS_SUCCEEDED(res) && (*aOutLen > 0))
   1.148 +      return true;
   1.149 +  }
   1.150 +  return false;
   1.151 +}
   1.152 +
   1.153 +bool nsUnicodeToGBK::Try4BytesEncoder(
   1.154 +  char16_t aChar,
   1.155 +  char* aOut,
   1.156 +  int32_t *aOutLen
   1.157 +)
   1.158 +{
   1.159 +  if( NS_IS_HIGH_SURROGATE(aChar) || 
   1.160 +      NS_IS_LOW_SURROGATE(aChar) )
   1.161 +  {
   1.162 +    // performance tune for surrogate characters
   1.163 +    return false;
   1.164 +  }
   1.165 +  if(! m4BytesEncoder )
   1.166 +    Create4BytesEncoder();
   1.167 +  if(m4BytesEncoder) 
   1.168 +  {
   1.169 +    int32_t len = 1;
   1.170 +    nsresult res = NS_OK;
   1.171 +    res = m4BytesEncoder->Convert(&aChar, &len, aOut, aOutLen);
   1.172 +    NS_ASSERTION(NS_FAILED(res) || ((1 == len) && (4 == *aOutLen)),
   1.173 +      "unexpect conversion length");
   1.174 +    if(NS_SUCCEEDED(res) && (*aOutLen > 0))
   1.175 +      return true;
   1.176 +  }
   1.177 +  return false;
   1.178 +}
   1.179 +bool nsUnicodeToGBK::EncodeSurrogate(
   1.180 +  char16_t aSurrogateHigh,
   1.181 +  char16_t aSurrogateLow,
   1.182 +  char* aOut)
   1.183 +{
   1.184 +  return false; // GBK cannot encode Surrogate, let the subclass encode it.
   1.185 +} 
   1.186 +
   1.187 +NS_IMETHODIMP nsUnicodeToGBK::ConvertNoBuff(
   1.188 +  const char16_t * aSrc, 
   1.189 +  int32_t * aSrcLength, 
   1.190 +  char * aDest, 
   1.191 +  int32_t * aDestLength)
   1.192 +{
   1.193 +  int32_t iSrcLength = 0;
   1.194 +  int32_t iDestLength = 0;
   1.195 +  char16_t unicode;
   1.196 +  nsresult res = NS_OK;
   1.197 +  while (iSrcLength < *aSrcLength )
   1.198 +  {
   1.199 +    unicode = *aSrc;
   1.200 +    //if unicode's hi byte has something, it is not ASCII, must be a GB
   1.201 +    if(IS_ASCII(unicode))
   1.202 +    {
   1.203 +      // this is an ASCII
   1.204 +      *aDest = CAST_UNICHAR_TO_CHAR(*aSrc);
   1.205 +      aDest++; // increment 1 byte
   1.206 +      iDestLength +=1;
   1.207 +    } else {
   1.208 +      char byte1, byte2;
   1.209 +      if(mUtil.UnicodeToGBKChar( unicode, false, &byte1, &byte2))
   1.210 +      {
   1.211 +        // make sure we still have 2 bytes for output first
   1.212 +        if(iDestLength+2 > *aDestLength)
   1.213 +        {
   1.214 +          res = NS_OK_UENC_MOREOUTPUT;
   1.215 +          break;
   1.216 +        }
   1.217 +        aDest[0] = byte1;
   1.218 +        aDest[1] = byte2;
   1.219 +        aDest += 2;	// increment 2 bytes
   1.220 +        iDestLength +=2;
   1.221 +      } else {
   1.222 +        int32_t aOutLen = 2;
   1.223 +        // make sure we still have 2 bytes for output first
   1.224 +        if(iDestLength+2 > *aDestLength)
   1.225 +        {
   1.226 +          res = NS_OK_UENC_MOREOUTPUT;
   1.227 +          break;
   1.228 +        }
   1.229 +        // we cannot map in the common mapping. Let's try to
   1.230 +        // call the delegated 2 byte converter for the gbk or gb18030
   1.231 +        // unique 2 byte mapping
   1.232 +        if(TryExtensionEncoder(unicode, aDest, &aOutLen))
   1.233 +        {
   1.234 +          iDestLength += aOutLen;
   1.235 +          aDest += aOutLen;
   1.236 +        } else {
   1.237 +          // make sure we still have 4 bytes for output first
   1.238 +          if(iDestLength+4 > *aDestLength)
   1.239 +          {
   1.240 +            res = NS_OK_UENC_MOREOUTPUT;
   1.241 +            break;
   1.242 +          }
   1.243 +          // we still cannot map. Let's try to
   1.244 +          // call the delegated GB18030 4 byte converter 
   1.245 +          aOutLen = 4;
   1.246 +          if( NS_IS_HIGH_SURROGATE(unicode) )
   1.247 +          {
   1.248 +            if((iSrcLength+1) < *aSrcLength ) {
   1.249 +              if(EncodeSurrogate(aSrc[0],aSrc[1], aDest)) {
   1.250 +                // since we got a surrogate pair, we need to increment src.
   1.251 +                iSrcLength++ ; 
   1.252 +                aSrc++;
   1.253 +                iDestLength += aOutLen;
   1.254 +                aDest += aOutLen;
   1.255 +              } else {
   1.256 +                // only get a high surrogate, but not a low surrogate
   1.257 +                res = NS_ERROR_UENC_NOMAPPING;
   1.258 +                iSrcLength++;   // include length of the unmapped character
   1.259 +                break;
   1.260 +              }
   1.261 +            } else {
   1.262 +              mSurrogateHigh = aSrc[0];
   1.263 +              break; // this will go to afterwhileloop
   1.264 +            }
   1.265 +          } else {
   1.266 +            if( NS_IS_LOW_SURROGATE(unicode) )
   1.267 +            {
   1.268 +              if(NS_IS_HIGH_SURROGATE(mSurrogateHigh)) {
   1.269 +                if(EncodeSurrogate(mSurrogateHigh, aSrc[0], aDest)) {
   1.270 +                  iDestLength += aOutLen;
   1.271 +                  aDest += aOutLen;
   1.272 +                } else {
   1.273 +                  // only get a high surrogate, but not a low surrogate
   1.274 +                  res = NS_ERROR_UENC_NOMAPPING;
   1.275 +                  iSrcLength++;   // include length of the unmapped character
   1.276 +                  break;
   1.277 +                }
   1.278 +              } else {
   1.279 +                // only get a low surrogate, but not a low surrogate
   1.280 +                res = NS_ERROR_UENC_NOMAPPING;
   1.281 +                iSrcLength++;   // include length of the unmapped character
   1.282 +                break;
   1.283 +              }
   1.284 +            } else {
   1.285 +              if(Try4BytesEncoder(unicode, aDest, &aOutLen))
   1.286 +              {
   1.287 +                NS_ASSERTION((aOutLen == 4), "we should always generate 4 bytes here");
   1.288 +                iDestLength += aOutLen;
   1.289 +                aDest += aOutLen;
   1.290 +              } else {
   1.291 +                res = NS_ERROR_UENC_NOMAPPING;
   1.292 +                iSrcLength++;   // include length of the unmapped character
   1.293 +                break;
   1.294 +              }
   1.295 +            }
   1.296 +          }
   1.297 +        }
   1.298 +      } 
   1.299 +    }
   1.300 +    iSrcLength++ ; // Each unicode char just count as one in char16_t string;  	  
   1.301 +    mSurrogateHigh = 0;
   1.302 +    aSrc++;
   1.303 +    if ( iDestLength >= (*aDestLength) && (iSrcLength < *aSrcLength) )
   1.304 +    {
   1.305 +      res = NS_OK_UENC_MOREOUTPUT;
   1.306 +      break;
   1.307 +    }
   1.308 +  }
   1.309 +//afterwhileloop:
   1.310 +  *aDestLength = iDestLength;
   1.311 +  *aSrcLength = iSrcLength;
   1.312 +  return res;
   1.313 +}

mercurial