intl/uconv/ucvcn/nsGBKToUnicode.cpp

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 /**
     6  * A character set converter from GBK to Unicode.
     7  * 
     8  *
     9  * @created         07/Sept/1999
    10  * @author  Yueheng Xu, Yueheng.Xu@intel.com
    11  */
    13 #include "nsGBKToUnicode.h"
    14 #include "gbku.h"
    17 //------------------------------------------------------------
    18 // nsGBKUnique2BytesToUnicode
    19 //------------------------------------------------------------
    20 class nsGBKUnique2BytesToUnicode : public nsTableDecoderSupport 
    21 {
    22 public:
    23   nsGBKUnique2BytesToUnicode();
    24   virtual ~nsGBKUnique2BytesToUnicode() 
    25     { }
    26 protected:
    27 };
    29 static const uint16_t g_utGBKUnique2Bytes[] = {
    30 #include "gbkuniq2b.ut"
    31 };
    32 nsGBKUnique2BytesToUnicode::nsGBKUnique2BytesToUnicode() 
    33   : nsTableDecoderSupport(u2BytesCharset, nullptr,
    34         (uMappingTable*) &g_utGBKUnique2Bytes, 1) 
    35 {
    36 }
    38 //------------------------------------------------------------
    39 // nsGB18030Unique2BytesToUnicode
    40 //------------------------------------------------------------
    41 class nsGB18030Unique2BytesToUnicode : public nsTableDecoderSupport 
    42 {
    43 public:
    44   nsGB18030Unique2BytesToUnicode();
    45   virtual ~nsGB18030Unique2BytesToUnicode() 
    46     { }
    47 protected:
    48 };
    50 static const uint16_t g_utGB18030Unique2Bytes[] = {
    51 #include "gb18030uniq2b.ut"
    52 };
    53 nsGB18030Unique2BytesToUnicode::nsGB18030Unique2BytesToUnicode() 
    54   : nsTableDecoderSupport(u2BytesCharset, nullptr,
    55         (uMappingTable*) &g_utGB18030Unique2Bytes, 1) 
    56 {
    57 }
    59 //------------------------------------------------------------
    60 // nsGB18030Unique4BytesToUnicode
    61 //------------------------------------------------------------
    62 class nsGB18030Unique4BytesToUnicode : public nsTableDecoderSupport 
    63 {
    64 public:
    65   nsGB18030Unique4BytesToUnicode();
    66   virtual ~nsGB18030Unique4BytesToUnicode() 
    67     { }
    68 protected:
    69 };
    71 static const uint16_t g_utGB18030Unique4Bytes[] = {
    72 #include "gb180304bytes.ut"
    73 };
    74 nsGB18030Unique4BytesToUnicode::nsGB18030Unique4BytesToUnicode() 
    75   : nsTableDecoderSupport(u4BytesGB18030Charset, nullptr,
    76         (uMappingTable*) &g_utGB18030Unique4Bytes, 1) 
    77 {
    78 }
    81 //----------------------------------------------------------------------
    82 // Class nsGBKToUnicode [implementation]
    84 //----------------------------------------------------------------------
    85 // Subclassing of nsTablesDecoderSupport class [implementation]
    87 #define LEGAL_GBK_MULTIBYTE_FIRST_BYTE(c)  \
    88       (UINT8_IN_RANGE(0x81, (c), 0xFE))
    89 #define FIRST_BYTE_IS_SURROGATE(c)  \
    90       (UINT8_IN_RANGE(0x90, (c), 0xFE))
    91 #define LEGAL_GBK_2BYTE_SECOND_BYTE(c) \
    92       (UINT8_IN_RANGE(0x40, (c), 0x7E)|| UINT8_IN_RANGE(0x80, (c), 0xFE))
    93 #define LEGAL_GBK_4BYTE_SECOND_BYTE(c) \
    94       (UINT8_IN_RANGE(0x30, (c), 0x39))
    95 #define LEGAL_GBK_4BYTE_THIRD_BYTE(c)  \
    96       (UINT8_IN_RANGE(0x81, (c), 0xFE))
    97 #define LEGAL_GBK_4BYTE_FORTH_BYTE(c) \
    98       (UINT8_IN_RANGE(0x30, (c), 0x39))
   100 NS_IMETHODIMP nsGBKToUnicode::ConvertNoBuff(const char* aSrc,
   101                                             int32_t * aSrcLength,
   102                                             char16_t *aDest,
   103                                             int32_t * aDestLength)
   104 {
   105   int32_t i=0;
   106   int32_t iSrcLength = (*aSrcLength);
   107   int32_t iDestlen = 0;
   108   nsresult rv=NS_OK;
   109   *aSrcLength = 0;
   111   for (i=0;i<iSrcLength;i++)
   112   {
   113     if ( iDestlen >= (*aDestLength) )
   114     {
   115       rv = NS_OK_UDEC_MOREOUTPUT;
   116       break;
   117     }
   118     // The valid range for the 1st byte is [0x81,0xFE] 
   119     if(LEGAL_GBK_MULTIBYTE_FIRST_BYTE(*aSrc))
   120     {
   121       if(i+1 >= iSrcLength) 
   122       {
   123         rv = NS_OK_UDEC_MOREINPUT;
   124         break;
   125       }
   126       // To make sure, the second byte has to be checked as well.
   127       // In GBK, the second byte range is [0x40,0x7E] and [0x80,0XFE]
   128       if(LEGAL_GBK_2BYTE_SECOND_BYTE(aSrc[1]))
   129       {
   130         // Valid GBK code
   131         *aDest = mUtil.GBKCharToUnicode(aSrc[0], aSrc[1]);
   132         if(UCS2_NO_MAPPING == *aDest)
   133         { 
   134           // We cannot map in the common mapping, let's call the
   135           // delegate 2 byte decoder to decode the gbk or gb18030 unique 
   136           // 2 byte mapping
   137           if(! TryExtensionDecoder(aSrc, aDest))
   138           {
   139             *aDest = UCS2_NO_MAPPING;
   140           }
   141         }
   142         aSrc += 2;
   143         i++;
   144       }
   145       else if (LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
   146       {
   147         // from the first 2 bytes, it looks like a 4 byte GB18030
   148         if(i+3 >= iSrcLength)  // make sure we got 4 bytes
   149         {
   150           rv = NS_OK_UDEC_MOREINPUT;
   151           break;
   152         }
   153         // 4 bytes patten
   154         // [0x81-0xfe][0x30-0x39][0x81-0xfe][0x30-0x39]
   155         // preset the 
   157         if (LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]) &&
   158             LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
   159         {
   160            if ( ! FIRST_BYTE_IS_SURROGATE(aSrc[0])) 
   161            {
   162              // let's call the delegated 4 byte gb18030 converter to convert it
   163              if(! Try4BytesDecoder(aSrc, aDest))
   164                *aDest = UCS2_NO_MAPPING;
   165            } else {
   166               // let's try supplement mapping
   167              if ( (iDestlen+1) < (*aDestLength) )
   168              {
   169                if(DecodeToSurrogate(aSrc, aDest))
   170                {
   171                  // surrogte two char16_t
   172                  iDestlen++;
   173                  aDest++;
   174                }  else {
   175                  *aDest = UCS2_NO_MAPPING;
   176               }
   177              } else {
   178                if (*aDestLength < 2) {
   179                  NS_ERROR("insufficient space in output buffer");
   180                  *aDest = UCS2_NO_MAPPING;
   181                } else {
   182                  rv = NS_OK_UDEC_MOREOUTPUT;
   183                  break;
   184                }
   185              }
   186            }
   187            aSrc += 4;
   188            i += 3;
   189         } else {
   190           *aDest = UCS2_NO_MAPPING; 
   191           // If the third and fourth bytes are not in the legal ranges for
   192           // a four-byte sequnce, resynchronize on the second byte
   193           // (which we know is in the range of LEGAL_GBK_4BYTE_SECOND_BYTE,
   194           //  0x30-0x39)
   195           aSrc++;
   196         }
   197       }
   198       else if ((uint8_t) aSrc[0] == (uint8_t)0xA0 )
   199       {
   200         // stand-alone (not followed by a valid second byte) 0xA0 !
   201         // treat it as valid a la Netscape 4.x
   202         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
   203         aSrc++;
   204       } else {
   205         // Invalid GBK code point (second byte should be 0x40 or higher)
   206         *aDest = UCS2_NO_MAPPING;
   207         aSrc++;
   208       }
   209     } else {
   210       if(IS_ASCII(*aSrc))
   211       {
   212         // The source is an ASCII
   213         *aDest = CAST_CHAR_TO_UNICHAR(*aSrc);
   214         aSrc++;
   215       } else {
   216         if(IS_GBK_EURO(*aSrc)) {
   217           *aDest = UCS2_EURO;
   218         } else {
   219           *aDest = UCS2_NO_MAPPING;
   220         }
   221         aSrc++;
   222       }
   223     }
   224     iDestlen++;
   225     aDest++;
   226     *aSrcLength = i+1;
   227   }
   228   *aDestLength = iDestlen;
   229   return rv;
   230 }
   233 void nsGBKToUnicode::CreateExtensionDecoder()
   234 {
   235   mExtensionDecoder = new nsGBKUnique2BytesToUnicode();
   236 }
   237 void nsGBKToUnicode::Create4BytesDecoder()
   238 {
   239   m4BytesDecoder =  nullptr;
   240 }
   241 void nsGB18030ToUnicode::CreateExtensionDecoder()
   242 {
   243   mExtensionDecoder = new nsGB18030Unique2BytesToUnicode();
   244 }
   245 void nsGB18030ToUnicode::Create4BytesDecoder()
   246 {
   247   m4BytesDecoder = new nsGB18030Unique4BytesToUnicode();
   248 }
   249 bool nsGB18030ToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
   250 {
   251   NS_ASSERTION(FIRST_BYTE_IS_SURROGATE(aSrc[0]),       "illegal first byte");
   252   NS_ASSERTION(LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]),   "illegal second byte");
   253   NS_ASSERTION(LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]),    "illegal third byte");
   254   NS_ASSERTION(LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]),    "illegal forth byte");
   255   if(! FIRST_BYTE_IS_SURROGATE(aSrc[0]))
   256     return false;
   257   if(! LEGAL_GBK_4BYTE_SECOND_BYTE(aSrc[1]))
   258     return false;
   259   if(! LEGAL_GBK_4BYTE_THIRD_BYTE(aSrc[2]))
   260     return false;
   261   if(! LEGAL_GBK_4BYTE_FORTH_BYTE(aSrc[3]))
   262     return false;
   264   uint8_t a1 = (uint8_t) aSrc[0];
   265   uint8_t a2 = (uint8_t) aSrc[1];
   266   uint8_t a3 = (uint8_t) aSrc[2];
   267   uint8_t a4 = (uint8_t) aSrc[3];
   268   a1 -= (uint8_t)0x90;
   269   a2 -= (uint8_t)0x30;
   270   a3 -= (uint8_t)0x81;
   271   a4 -= (uint8_t)0x30;
   272   uint32_t idx = (((a1 * 10 + a2 ) * 126 + a3) * 10) + a4;
   273   // idx == ucs4Codepoint - 0x10000
   274   if (idx > 0x000FFFFF)
   275     return false;
   277   *aOut++ = 0xD800 | (idx >> 10);
   278   *aOut = 0xDC00 | (0x000003FF & idx);
   280   return true;
   281 }
   282 bool nsGBKToUnicode::TryExtensionDecoder(const char* aSrc, char16_t* aOut)
   283 {
   284   if(!mExtensionDecoder)
   285     CreateExtensionDecoder();
   286   NS_ASSERTION(mExtensionDecoder, "cannot creqte 2 bytes unique converter");
   287   if(mExtensionDecoder)
   288   {
   289     nsresult res = mExtensionDecoder->Reset();
   290     NS_ASSERTION(NS_SUCCEEDED(res), "2 bytes unique conversoin reset failed");
   291     int32_t len = 2;
   292     int32_t dstlen = 1;
   293     res = mExtensionDecoder->Convert(aSrc,&len, aOut, &dstlen); 
   294     NS_ASSERTION(NS_FAILED(res) || ((len==2) && (dstlen == 1)), 
   295        "some strange conversion result");
   296      // if we failed, we then just use the 0xfffd 
   297      // therefore, we ignore the res here. 
   298     if(NS_SUCCEEDED(res)) 
   299       return true;
   300   }
   301   return  false;
   302 }
   303 bool nsGBKToUnicode::DecodeToSurrogate(const char* aSrc, char16_t* aOut)
   304 {
   305   return false;
   306 }
   307 bool nsGBKToUnicode::Try4BytesDecoder(const char* aSrc, char16_t* aOut)
   308 {
   309   if(!m4BytesDecoder)
   310     Create4BytesDecoder();
   311   if(m4BytesDecoder)
   312   {
   313     nsresult res = m4BytesDecoder->Reset();
   314     NS_ASSERTION(NS_SUCCEEDED(res), "4 bytes unique conversoin reset failed");
   315     int32_t len = 4;
   316     int32_t dstlen = 1;
   317     res = m4BytesDecoder->Convert(aSrc,&len, aOut, &dstlen); 
   318     NS_ASSERTION(NS_FAILED(res) || ((len==4) && (dstlen == 1)), 
   319        "some strange conversion result");
   320      // if we failed, we then just use the 0xfffd 
   321      // therefore, we ignore the res here. 
   322     if(NS_SUCCEEDED(res)) 
   323       return true;
   324   }
   325   return  false;
   326 }

mercurial