intl/uconv/ucvcn/nsISO2022CNToUnicode.cpp

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 #include "nsISO2022CNToUnicode.h"
     6 #include "nsUCSupport.h"
     7 #include "nsICharsetConverterManager.h"
     8 #include "nsServiceManagerUtils.h"
       // Class ID handed to do_GetService() by the delegate-decoder helpers in
       // this file to obtain the nsICharsetConverterManager service.
    10 static NS_DEFINE_CID(kCharsetConverterManagerCID, NS_ICHARSETCONVERTERMANAGER_CID);
    12 NS_IMETHODIMP nsISO2022CNToUnicode::GB2312_To_Unicode(unsigned char *aSrc, int32_t aSrcLength, char16_t * aDest, int32_t * aDestLength)
    13 {
    14     nsresult rv;
    16     if(!mGB2312_Decoder) {
    17        // creating a delegate converter (GB2312)
    18        nsCOMPtr<nsICharsetConverterManager> ccm =
    19               do_GetService(kCharsetConverterManagerCID, &rv);
    20        if(NS_FAILED(rv))
    21           return NS_ERROR_UNEXPECTED;
    23        rv = ccm->GetUnicodeDecoderRaw("GB2312", getter_AddRefs(mGB2312_Decoder));
    24        if(NS_FAILED(rv))
    25           return NS_ERROR_UNEXPECTED;
    26     }
    28     if(!mGB2312_Decoder) // failed creating a delegate converter
    29        return NS_ERROR_UNEXPECTED;
    31     rv = mGB2312_Decoder->Convert((const char *)aSrc, &aSrcLength, aDest, aDestLength);
    32     return rv;
    33 }
    35 NS_IMETHODIMP nsISO2022CNToUnicode::EUCTW_To_Unicode(unsigned char *aSrc, int32_t aSrcLength, char16_t * aDest, int32_t * aDestLength)
    36 {
    37     nsresult rv;
    39     if(!mEUCTW_Decoder) {
    40        // creating a delegate converter (x-euc-tw)
    41        nsCOMPtr<nsICharsetConverterManager> ccm =
    42               do_GetService(kCharsetConverterManagerCID, &rv);
    43        if(NS_FAILED(rv))
    44           return NS_ERROR_UNEXPECTED;
    46        rv = ccm->GetUnicodeDecoderRaw("x-euc-tw", getter_AddRefs(mEUCTW_Decoder));
    47        if(NS_FAILED(rv))
    48           return NS_ERROR_UNEXPECTED;
    49     }
    51     if(!mEUCTW_Decoder) // failed creating a delegate converter
    52        return NS_ERROR_UNEXPECTED;
    54     rv = mEUCTW_Decoder->Convert((const char *)aSrc, &aSrcLength, aDest, aDestLength);
    55     return(rv);
    56 }
    58 NS_IMETHODIMP nsISO2022CNToUnicode::Convert(const char * aSrc, int32_t * aSrcLen, char16_t * aDest, int32_t * aDestLen)
    59 {
    60   const unsigned char * srcEnd = (unsigned char *)aSrc + *aSrcLen;
    61   const unsigned char * src = (unsigned char *) aSrc;
    62   char16_t* destEnd = aDest + *aDestLen;
    63   char16_t* dest = aDest;
    64   nsresult rv;
    65   int32_t aLen; 
    67   while ((src < srcEnd))
    68   {
    69     switch (mState)
    70     {
    71       case eState_ASCII:
    72         if(ESC == *src) {
    73            mState = eState_ESC;
    74         } else {
    75            if (CHECK_OVERRUN(dest, destEnd, 1))
    76               goto error1;
    77            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
    79            mState = eState_ASCII;
    80         }
    81         break;
    83       case eState_ESC:    // ESC
    84         if('$' == *src) {
    85            mState = eState_ESC_24;
    86         } else {
    87            if (CHECK_OVERRUN(dest, destEnd, 2))
    88               goto error1;
    89            *dest++ = (char16_t) ESC;
    90            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
    92            mState = eState_ASCII;
    93         }
    94         break;
    96       case eState_ESC_24: // ESC $
    97         if(')' == *src) {
    98            mState = eState_ESC_24_29;
    99         } else if('*' == *src) {
   100            mState = eState_ESC_24_2A;
   101         } else if('+' == *src) {
   102            mState = eState_ESC_24_2B;
   103         } else {
   104            if (CHECK_OVERRUN(dest, destEnd, 3))
   105               goto error1;
   106            *dest++ = (char16_t) ESC;
   107            *dest++ = (char16_t) '$';
   108            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   110            mState = eState_ASCII;
   111         }
   112         break;
   114       case eState_ESC_24_29: // ESC $ )
   115         if('A' == *src) {
   116            mState = eState_ESC_24_29_A;
   117         } else if('G' == *src) {
   118            mState = eState_ESC_24_29_G;
   119         } else {
   120            if (CHECK_OVERRUN(dest, destEnd, 4))
   121               goto error1;
   122            *dest++ = (char16_t) ESC;
   123            *dest++ = (char16_t) '$';
   124            *dest++ = (char16_t) ')';
   125            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   127            mState = eState_ASCII;
   128         }
   129         break;
   131       case eState_ESC_24_29_A:  // ESC $ ) A
   132         if(SO == *src) {
   133            mState = eState_GB2312_1980;
   134            mRunLength = 0;
   135         } else {
   136            if (CHECK_OVERRUN(dest, destEnd, 5))
   137               goto error1;
   138            *dest++ = (char16_t) ESC;
   139            *dest++ = (char16_t) '$';
   140            *dest++ = (char16_t) ')';
   141            *dest++ = (char16_t) 'A';
   142            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   144            mState = eState_ASCII;
   145         }
   146         break;
   148       case eState_GB2312_1980:   // ESC $ ) A SO
   149         if(SI == *src) { // Shift-In (SI)
   150            mState = eState_ESC_24_29_A_SO_SI;
   151            if (mRunLength == 0) {
   152               if (CHECK_OVERRUN(dest, destEnd, 1))
   153                  goto error1;
   154               *dest++ = 0xFFFD;
   155            }
   156            mRunLength = 0;
   157         } else if(ESC == *src) {
   158            mState = eState_ESC;
   159         } else {
   160            if(0x20 < *src && *src < 0x7f) {
   161               mData = *src;
   162               mState = eState_GB2312_1980_2ndbyte;
   163            } else {
   164               if (CHECK_OVERRUN(dest, destEnd, 1))
   165                  goto error1;
   166               *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   167            }
   168         }
   169         break; 
   171       case eState_GB2312_1980_2ndbyte:  // ESC $ ) A SO
   172         if(0x20 < *src && *src < 0x7f) {
   173            unsigned char gb[2];
   174            int32_t gbLen = 2;
   176            gb[0] = mData | 0x80;
   177            gb[1] = *src | 0x80;
   179            aLen = destEnd - dest;
   180            rv = GB2312_To_Unicode(gb, gbLen, dest, &aLen);
   181            ++mRunLength;
   182            if(rv == NS_OK_UDEC_MOREOUTPUT) {
   183               goto error1;
   184            } else if(NS_FAILED(rv)) {
   185               goto error2;
   186            }
   188            dest += aLen;
   189         } else {
   190            if (CHECK_OVERRUN(dest, destEnd, 2))
   191               goto error1;
   192            *dest++ = (char16_t) mData;
   193            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   194         }
   195         mState = eState_GB2312_1980;
   196         break;
   198       case eState_ESC_24_29_A_SO_SI:  // ESC $ ) A SO SI
   199         if(SO == *src) {
   200            mState = eState_GB2312_1980;
   201            mRunLength = 0;
   202         } else if(ESC == *src) {
   203            mState = eState_ESC;
   204         } else {
   205            if (CHECK_OVERRUN(dest, destEnd, 1))
   206               goto error1;
   207            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   209            mState = eState_ESC_24_29_A_SO_SI;
   210         }
   211         break;
   213       case eState_ESC_24_29_G:   // ESC $ ) G
   214         if(SO == *src) {
   215            mState = eState_CNS11643_1;
   216            mRunLength = 0;
   217         } else {
   218            if (CHECK_OVERRUN(dest, destEnd, 5))
   219               goto error1;
   220            *dest++ = (char16_t) ESC;
   221            *dest++ = (char16_t) '$';
   222            *dest++ = (char16_t) ')';
   223            *dest++ = (char16_t) 'G';
   224            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   226            mState = eState_ASCII;
   227         }
   228         break;
   230       case eState_CNS11643_1:   // ESC $ ) G SO
   231         if(SI == *src) { // Shift-In (SI)
   232            mState = eState_ESC_24_29_G_SO_SI;
   233            if (mRunLength == 0) {
   234               if (CHECK_OVERRUN(dest, destEnd, 1))
   235                  goto error1;
   236               *dest++ = 0xFFFD;
   237            }
   238            mRunLength = 0;
   239         } else if(ESC == *src) {
   240            mState = eState_ESC;
   241         } else {
   242            if(0x20 < *src && *src < 0x7f) {
   243               mData = *src;
   244               mState = eState_CNS11643_1_2ndbyte;
   245            } else {
   246               if (CHECK_OVERRUN(dest, destEnd, 1))
   247                  goto error1;
   248               *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   249            }
   250         }
   251         break;
   253       case eState_CNS11643_1_2ndbyte:  // ESC $ ) G SO
   254         if(0x20 < *src && *src < 0x7f) {
   255            unsigned char cns[4];
   256            int32_t cnsLen = 2;
   258            cns[0] = mData | 0x80;
   259            cns[1] = *src | 0x80;
   261            aLen = destEnd - dest;
   262            rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
   263            ++mRunLength;
   264            if(rv == NS_OK_UDEC_MOREOUTPUT) {
   265               goto error1;
   266            } else if(NS_FAILED(rv)) {
   267               goto error2;
   268            }
   270            dest += aLen;
   271         } else {
   272            if (CHECK_OVERRUN(dest, destEnd, 2))
   273               goto error1;
   274            *dest++ = (char16_t) mData;
   275            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   276         }
   277         mState = eState_CNS11643_1;
   278         break;
   280       case eState_ESC_24_29_G_SO_SI: // ESC $ ) G SO SI
   281         if(SO == *src) {
   282            mState = eState_CNS11643_1;
   283            mRunLength = 0;
   284         } else if(ESC == *src) {
   285            mState = eState_ESC;
   286         } else {
   287            if (CHECK_OVERRUN(dest, destEnd, 1))
   288               goto error1;
   289            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   291            mState = eState_ESC_24_29_G_SO_SI;
   292         }
   293         break;
   295       case eState_ESC_24_2A: // ESC $ *
   296         if('H' == *src) {
   297            mState = eState_ESC_24_2A_H;
   298         } else {
   299            if (CHECK_OVERRUN(dest, destEnd, 4))
   300               goto error1;
   301            *dest++ = (char16_t) ESC;
   302            *dest++ = (char16_t) '$';
   303            *dest++ = (char16_t) '*';
   304            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   306            mState = eState_ASCII;
   307         }
   308         break;
   310       case eState_ESC_24_2A_H:  // ESC $ * H
   311         if(ESC == *src) {
   312            mState = eState_ESC_24_2A_H_ESC;
   313         } else {
   314            if (CHECK_OVERRUN(dest, destEnd, 5))
   315               goto error1;
   316            *dest++ = (char16_t) ESC;
   317            *dest++ = (char16_t) '$';
   318            *dest++ = (char16_t) '*';
   319            *dest++ = (char16_t) 'H';
   320            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   322            mState = eState_ASCII;
   323         } 
   324         break;
   326       case eState_ESC_24_2A_H_ESC:  // ESC $ * H ESC
   327         if(SS2 == *src) {
   328            mState = eState_CNS11643_2;
   329            mRunLength = 0;
   330         } else if('$' == *src) {
   331            mState = eState_ESC_24;
   332         } else {
   333            if (CHECK_OVERRUN(dest, destEnd, 6))
   334               goto error1;
   335            *dest++ = (char16_t) ESC;
   336            *dest++ = (char16_t) '$';
   337            *dest++ = (char16_t) '*';
   338            *dest++ = (char16_t) 'H';
   339            *dest++ = (char16_t) ESC;
   340            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   342            mState = eState_ASCII;
   343         }
   344         break;
   346       case eState_CNS11643_2:  // ESC $ * H ESC SS2
   347         if(SI == *src) { // Shift-In (SI)
   348            mState = eState_ESC_24_2A_H_ESC_SS2_SI;
   349            if (mRunLength == 0) {
   350               if (CHECK_OVERRUN(dest, destEnd, 1))
   351                  goto error1;
   352               *dest++ = 0xFFFD;
   353            }
   354            mRunLength = 0;
   355         } else if(ESC == *src) {
   356            mState = eState_ESC_24_2A_H_ESC;
   357         } else {
   358            if(0x20 < *src && *src < 0x7f) {
   359               mData = *src;
   360               mState = eState_CNS11643_2_2ndbyte;
   361            } else {
   362               if (CHECK_OVERRUN(dest, destEnd, 1))
   363                  goto error1;
   364               *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   365            }
   366         }
   367         break;
   369       case eState_CNS11643_2_2ndbyte:   // ESC $ * H ESC SS2
   370         if(0x20 < *src && *src < 0x7f) {
   371            unsigned char cns[4];
   372            int32_t cnsLen = 4;
   374            cns[0] = (unsigned char) MBYTE;
   375            cns[1] = (unsigned char) (PMASK + 2);
   376            cns[2] = mData | 0x80;
   377            cns[3] = *src | 0x80;
   379            aLen = destEnd - dest;
   380            rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
   381            ++mRunLength;
   382            if(rv == NS_OK_UDEC_MOREOUTPUT) {
   383               goto error1;
   384            } else if(NS_FAILED(rv)) {
   385               goto error2;
   386            }
   388            dest += aLen;
   389         } else {
   390            if (CHECK_OVERRUN(dest, destEnd, 2))
   391               goto error1;
   392            *dest++ = (char16_t) mData;
   393            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   394         }
   395         mState = eState_CNS11643_2;
   396         break;
   398       case eState_ESC_24_2A_H_ESC_SS2_SI:  // ESC $ * H ESC SS2 SI
   399         if(ESC == *src) {
   400            mState = eState_ESC_24_2A_H_ESC_SS2_SI_ESC;
   401         } else {
   402            if (CHECK_OVERRUN(dest, destEnd, 1))
   403               goto error1;
   404            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   406            mState = eState_ESC_24_2A_H_ESC_SS2_SI;
   407         }
   408         break;
   410       case eState_ESC_24_2A_H_ESC_SS2_SI_ESC:  // ESC $ * H ESC SS2 SI ESC
   411         if(SS2 == *src) {
   412            mState = eState_CNS11643_2;
   413            mRunLength = 0;
   414         } else if('$' == *src) {
   415            mState = eState_ESC_24;
   416         } else {
   417            if (CHECK_OVERRUN(dest, destEnd, 1))
   418               goto error1;
   419            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   421            mState = eState_ESC_24_2A_H_ESC_SS2_SI;
   422         }
   423         break;
   425       case eState_ESC_24_2B: // ESC $ +
   426         if('I' <= *src && *src <= 'M') {
   427             mState = eState_ESC_24_2B_I;
   428             mPlaneID = *src - 'I' + 3;
   429         } else {
   430            if (CHECK_OVERRUN(dest, destEnd, 4))
   431               goto error1;
   432            *dest++ = (char16_t) ESC;
   433            *dest++ = (char16_t) '$';
   434            *dest++ = (char16_t) '+';
   435            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   437            mState = eState_ASCII;
   438         }
   439         break;
   441       case eState_ESC_24_2B_I:  // ESC $ + I
   442         if(ESC == *src) {
   443            mState = eState_ESC_24_2B_I_ESC;
   444         } else {
   445            if (CHECK_OVERRUN(dest, destEnd, 5))
   446               goto error1;
   447            *dest++ = (char16_t) ESC;
   448            *dest++ = (char16_t) '$';
   449            *dest++ = (char16_t) '+';
   450            *dest++ = (char16_t) 'I' + mPlaneID - 3;
   451            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   453            mState = eState_ASCII;
   454         }
   455         break;
   457       case eState_ESC_24_2B_I_ESC:  // ESC $ + I ESC
   458         if(SS3 == *src) {
   459            mState = eState_CNS11643_3;
   460            mRunLength = 0;
   461         } else if('$' == *src) {
   462            mState = eState_ESC_24;
   463         } else {
   464            if (CHECK_OVERRUN(dest, destEnd, 6))
   465               goto error1;
   466            *dest++ = (char16_t) ESC;
   467            *dest++ = (char16_t) '$';
   468            *dest++ = (char16_t) '+';
   469            *dest++ = (char16_t) 'I' + mPlaneID - 3;
   470            *dest++ = (char16_t) ESC;
   471            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   473            mState = eState_ASCII;
   474         }
   475         break;
   477       case eState_CNS11643_3:   // ESC $ + I ESC SS3
   478         if(SI == *src) { // Shift-In (SI)
   479            mState = eState_ESC_24_2B_I_ESC_SS3_SI;
   480            if (mRunLength == 0) {
   481               if (CHECK_OVERRUN(dest, destEnd, 1))
   482                  goto error1;
   483               *dest++ = 0xFFFD;
   484            }
   485            mRunLength = 0;
   486         } else if(ESC == *src) {
   487            mState = eState_ESC_24_2B_I_ESC;
   488         } else {
   489            if(0x20 < *src && *src < 0x7f) {
   490               mData = *src;
   491               mState = eState_CNS11643_3_2ndbyte;
   492            } else {
   493               if (CHECK_OVERRUN(dest, destEnd, 1))
   494                  goto error1;
   495               *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   496            }
   497         }
   499         break;
   501       case eState_CNS11643_3_2ndbyte:  // ESC $ + I ESC SS3
   502         if(0x20 < *src && *src < 0x7f) {
   503            unsigned char cns[4];
   504            int32_t cnsLen = 4;
   506            cns[0] = (unsigned char) MBYTE;
   507            cns[1] = (unsigned char) (PMASK + mPlaneID);
   508            cns[2] = mData | 0x80;
   509            cns[3] = *src | 0x80;
   511            aLen = destEnd - dest;
   512            rv = EUCTW_To_Unicode(cns, cnsLen, dest, &aLen);
   513            ++mRunLength;
   514            if(rv == NS_OK_UDEC_MOREOUTPUT) {
   515               goto error1;
   516            } else if(NS_FAILED(rv)) {
   517               goto error2;
   518            }
   520            dest += aLen;
   521         } else {
   522            if (CHECK_OVERRUN(dest, destEnd, 2))
   523               goto error1;
   524            *dest++ = (char16_t) mData;
   525            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   526         }
   527         mState = eState_CNS11643_3;
   528         break;
   530       case eState_ESC_24_2B_I_ESC_SS3_SI:  // ESC $ + I ESC SS3 SI
   531         if(ESC == *src) {
   532            mState = eState_ESC_24_2B_I_ESC_SS3_SI_ESC;
   533         } else {
   534            if (CHECK_OVERRUN(dest, destEnd, 1))
   535               goto error1;
   536            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   538            mState = eState_ESC_24_2B_I_ESC_SS3_SI;
   539         }
   540         break;
   542       case eState_ESC_24_2B_I_ESC_SS3_SI_ESC:  // ESC $ + I ESC SS3 SI ESC
   543         if(SS3 == *src) {
   544            mState = eState_CNS11643_3;
   545            mRunLength = 0;
   546         } else if('$' == *src) {
   547            mState = eState_ESC_24;
   548         } else {
   549            if (CHECK_OVERRUN(dest, destEnd, 1))
   550               goto error1;
   551            *dest++ = (0x80 & *src) ? 0xFFFD : (char16_t) *src;
   553            mState = eState_ESC_24_2B_I_ESC_SS3_SI;
   554         }
   555         break;
   557       case eState_ERROR:
   558         NS_NOTREACHED("unhandled case");
   559         goto error2;
   561     } // switch
   562     src++;
   563   }
   565   *aDestLen = dest- aDest;
   566   return NS_OK;
   568 error1:
   569   *aDestLen = dest-aDest;
   570   *aSrcLen = src - (const unsigned char*)aSrc;
   571   return NS_OK_UDEC_MOREOUTPUT;
   573 error2:
   574   *aSrcLen = src - (const unsigned char*)aSrc;
   575   *aDestLen = dest-aDest;
   576   mState = eState_ASCII;
   577   return NS_ERROR_UNEXPECTED;
   578 }

mercurial