intl/uconv/ucvcn/nsUnicodeToGBK.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5  /**
     6  * A character set converter from Unicode to GBK.
     7  * 
     8  *
     9  * @created         08/Sept/1999
    10  * @author  Yueheng Xu, Yueheng.Xu@intel.com
    11  * Revision History
    12  * 04/Oct/1999. Yueheng Xu: used table gUnicodeToGBKTable[0x5200] to make 
    13  *              Unicode to GB mapping fast 
    14  */
    16 #include "nsUnicodeToGBK.h"
    17 #include "gbku.h"
    18 #include "uconvutil.h"
    19 #include "nsCharTraits.h"
    21 //-------------------------------------------------------------
    22 // Global table initialization function defined in gbku.h
    23 //-------------------------------------------------------------
    25 //-----------------------------------------------------------------------
    26 //  Private class used by nsUnicodeToGB18030 and nsUnicodeToGB18030Font0
    27 //    nsUnicodeToGB18030Uniq2Bytes
    28 //-----------------------------------------------------------------------
    29 static const uint16_t g_uf_gb18030_2bytes[] = {
    30 #include "gb18030uniq2b.uf"
    31 };
    32 class nsUnicodeToGB18030Uniq2Bytes : public nsTableEncoderSupport
    33 {
    34 public: 
    35   nsUnicodeToGB18030Uniq2Bytes() 
    36     : nsTableEncoderSupport(u2BytesCharset,
    37                             (uMappingTable*) &g_uf_gb18030_2bytes, 2) {}
    38 protected: 
    39 };
    40 //-----------------------------------------------------------------------
    41 //  Private class used by nsUnicodeToGB18030
    42 //    nsUnicodeTo4BytesGB18030
    43 //-----------------------------------------------------------------------
    44 static const uint16_t g_uf_gb18030_4bytes[] = {
    45 #include "gb180304bytes.uf"
    46 };
    47 class nsUnicodeTo4BytesGB18030 : public nsTableEncoderSupport
    48 {
    49 public: 
    50   nsUnicodeTo4BytesGB18030()
    51     : nsTableEncoderSupport(u4BytesGB18030Charset, 
    52                              (uMappingTable*) &g_uf_gb18030_4bytes, 4) {}
    53 protected: 
    54 };
    55 //-----------------------------------------------------------------------
    56 //  Private class used by nsUnicodeToGBK
    57 //    nsUnicodeToGBKUniq2Bytes
    58 //-----------------------------------------------------------------------
    59 static const uint16_t g_uf_gbk_2bytes[] = {
    60 #include "gbkuniq2b.uf"
    61 };
    62 class nsUnicodeToGBKUniq2Bytes : public nsTableEncoderSupport
    63 {
    64 public: 
    65   nsUnicodeToGBKUniq2Bytes()
    66     : nsTableEncoderSupport(u2BytesCharset, 
    67                              (uMappingTable*) &g_uf_gbk_2bytes, 2) {}
    68 protected: 
    69 };
    70 //-----------------------------------------------------------------------
    71 //  nsUnicodeToGB18030
    72 //-----------------------------------------------------------------------
    73 void nsUnicodeToGB18030::CreateExtensionEncoder()
    74 {
    75   mExtensionEncoder = new nsUnicodeToGB18030Uniq2Bytes();
    76 }
    77 void nsUnicodeToGB18030::Create4BytesEncoder()
    78 {
    79   m4BytesEncoder = new nsUnicodeTo4BytesGB18030();
    80 }
    82 bool nsUnicodeToGB18030::EncodeSurrogate(
    83   char16_t aSurrogateHigh,
    84   char16_t aSurrogateLow,
    85   char* aOut)
    86 {
    87   if( NS_IS_HIGH_SURROGATE(aSurrogateHigh) && 
    88       NS_IS_LOW_SURROGATE(aSurrogateLow) )
    89   {
    90     // notice that idx does not include the 0x10000 
    91     uint32_t idx = ((aSurrogateHigh - (char16_t)0xD800) << 10 ) |
    92                    (aSurrogateLow - (char16_t) 0xDC00);
    94     unsigned char *out = (unsigned char*) aOut;
    95     // notice this is from 0x90 for supplment planes
    96     out[0] = (idx / (10*126*10)) + 0x90; 
    97     idx %= (10*126*10);
    98     out[1] = (idx / (10*126)) + 0x30;
    99     idx %= (10*126);
   100     out[2] = (idx / (10)) + 0x81;
   101     out[3] = (idx % 10) + 0x30;
   102     return true;
   103   } 
   104   return false; 
   105 } 
   107 //----------------------------------------------------------------------
   108 // Class nsUnicodeToGBK [implementation]
   110 nsUnicodeToGBK::nsUnicodeToGBK(uint32_t aMaxLength) :
   111   nsEncoderSupport(aMaxLength)
   112 {
   113   mExtensionEncoder = nullptr;
   114   m4BytesEncoder = nullptr;
   115   mSurrogateHigh = 0;
   116 }
   117 void nsUnicodeToGBK::CreateExtensionEncoder()
   118 {
   119   mExtensionEncoder = new nsUnicodeToGBKUniq2Bytes();
   120 }
   121 void nsUnicodeToGBK::Create4BytesEncoder()
   122 {
   123   m4BytesEncoder = nullptr;
   124 }
   125 bool nsUnicodeToGBK::TryExtensionEncoder(
   126   char16_t aChar,
   127   char* aOut,
   128   int32_t *aOutLen
   129 )
   130 {
   131   if( NS_IS_HIGH_SURROGATE(aChar) || 
   132       NS_IS_LOW_SURROGATE(aChar) )
   133   {
   134     // performance tune for surrogate characters
   135     return false;
   136   }
   137   if(! mExtensionEncoder )
   138     CreateExtensionEncoder();
   139   if(mExtensionEncoder) 
   140   {
   141     int32_t len = 1;
   142     nsresult res = NS_OK;
   143     res = mExtensionEncoder->Convert(&aChar, &len, aOut, aOutLen);
   144     if(NS_SUCCEEDED(res) && (*aOutLen > 0))
   145       return true;
   146   }
   147   return false;
   148 }
   150 bool nsUnicodeToGBK::Try4BytesEncoder(
   151   char16_t aChar,
   152   char* aOut,
   153   int32_t *aOutLen
   154 )
   155 {
   156   if( NS_IS_HIGH_SURROGATE(aChar) || 
   157       NS_IS_LOW_SURROGATE(aChar) )
   158   {
   159     // performance tune for surrogate characters
   160     return false;
   161   }
   162   if(! m4BytesEncoder )
   163     Create4BytesEncoder();
   164   if(m4BytesEncoder) 
   165   {
   166     int32_t len = 1;
   167     nsresult res = NS_OK;
   168     res = m4BytesEncoder->Convert(&aChar, &len, aOut, aOutLen);
   169     NS_ASSERTION(NS_FAILED(res) || ((1 == len) && (4 == *aOutLen)),
   170       "unexpect conversion length");
   171     if(NS_SUCCEEDED(res) && (*aOutLen > 0))
   172       return true;
   173   }
   174   return false;
   175 }
   176 bool nsUnicodeToGBK::EncodeSurrogate(
   177   char16_t aSurrogateHigh,
   178   char16_t aSurrogateLow,
   179   char* aOut)
   180 {
   181   return false; // GBK cannot encode Surrogate, let the subclass encode it.
   182 } 
   184 NS_IMETHODIMP nsUnicodeToGBK::ConvertNoBuff(
   185   const char16_t * aSrc, 
   186   int32_t * aSrcLength, 
   187   char * aDest, 
   188   int32_t * aDestLength)
   189 {
   190   int32_t iSrcLength = 0;
   191   int32_t iDestLength = 0;
   192   char16_t unicode;
   193   nsresult res = NS_OK;
   194   while (iSrcLength < *aSrcLength )
   195   {
   196     unicode = *aSrc;
   197     //if unicode's hi byte has something, it is not ASCII, must be a GB
   198     if(IS_ASCII(unicode))
   199     {
   200       // this is an ASCII
   201       *aDest = CAST_UNICHAR_TO_CHAR(*aSrc);
   202       aDest++; // increment 1 byte
   203       iDestLength +=1;
   204     } else {
   205       char byte1, byte2;
   206       if(mUtil.UnicodeToGBKChar( unicode, false, &byte1, &byte2))
   207       {
   208         // make sure we still have 2 bytes for output first
   209         if(iDestLength+2 > *aDestLength)
   210         {
   211           res = NS_OK_UENC_MOREOUTPUT;
   212           break;
   213         }
   214         aDest[0] = byte1;
   215         aDest[1] = byte2;
   216         aDest += 2;	// increment 2 bytes
   217         iDestLength +=2;
   218       } else {
   219         int32_t aOutLen = 2;
   220         // make sure we still have 2 bytes for output first
   221         if(iDestLength+2 > *aDestLength)
   222         {
   223           res = NS_OK_UENC_MOREOUTPUT;
   224           break;
   225         }
   226         // we cannot map in the common mapping. Let's try to
   227         // call the delegated 2 byte converter for the gbk or gb18030
   228         // unique 2 byte mapping
   229         if(TryExtensionEncoder(unicode, aDest, &aOutLen))
   230         {
   231           iDestLength += aOutLen;
   232           aDest += aOutLen;
   233         } else {
   234           // make sure we still have 4 bytes for output first
   235           if(iDestLength+4 > *aDestLength)
   236           {
   237             res = NS_OK_UENC_MOREOUTPUT;
   238             break;
   239           }
   240           // we still cannot map. Let's try to
   241           // call the delegated GB18030 4 byte converter 
   242           aOutLen = 4;
   243           if( NS_IS_HIGH_SURROGATE(unicode) )
   244           {
   245             if((iSrcLength+1) < *aSrcLength ) {
   246               if(EncodeSurrogate(aSrc[0],aSrc[1], aDest)) {
   247                 // since we got a surrogate pair, we need to increment src.
   248                 iSrcLength++ ; 
   249                 aSrc++;
   250                 iDestLength += aOutLen;
   251                 aDest += aOutLen;
   252               } else {
   253                 // only get a high surrogate, but not a low surrogate
   254                 res = NS_ERROR_UENC_NOMAPPING;
   255                 iSrcLength++;   // include length of the unmapped character
   256                 break;
   257               }
   258             } else {
   259               mSurrogateHigh = aSrc[0];
   260               break; // this will go to afterwhileloop
   261             }
   262           } else {
   263             if( NS_IS_LOW_SURROGATE(unicode) )
   264             {
   265               if(NS_IS_HIGH_SURROGATE(mSurrogateHigh)) {
   266                 if(EncodeSurrogate(mSurrogateHigh, aSrc[0], aDest)) {
   267                   iDestLength += aOutLen;
   268                   aDest += aOutLen;
   269                 } else {
   270                   // only get a high surrogate, but not a low surrogate
   271                   res = NS_ERROR_UENC_NOMAPPING;
   272                   iSrcLength++;   // include length of the unmapped character
   273                   break;
   274                 }
   275               } else {
   276                 // only get a low surrogate, but not a low surrogate
   277                 res = NS_ERROR_UENC_NOMAPPING;
   278                 iSrcLength++;   // include length of the unmapped character
   279                 break;
   280               }
   281             } else {
   282               if(Try4BytesEncoder(unicode, aDest, &aOutLen))
   283               {
   284                 NS_ASSERTION((aOutLen == 4), "we should always generate 4 bytes here");
   285                 iDestLength += aOutLen;
   286                 aDest += aOutLen;
   287               } else {
   288                 res = NS_ERROR_UENC_NOMAPPING;
   289                 iSrcLength++;   // include length of the unmapped character
   290                 break;
   291               }
   292             }
   293           }
   294         }
   295       } 
   296     }
   297     iSrcLength++ ; // Each unicode char just count as one in char16_t string;  	  
   298     mSurrogateHigh = 0;
   299     aSrc++;
   300     if ( iDestLength >= (*aDestLength) && (iSrcLength < *aSrcLength) )
   301     {
   302       res = NS_OK_UENC_MOREOUTPUT;
   303       break;
   304     }
   305   }
   306 //afterwhileloop:
   307   *aDestLength = iDestLength;
   308   *aSrcLength = iSrcLength;
   309   return res;
   310 }

mercurial