intl/uconv/ucvlatin/nsUTF16ToUnicode.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double-key logic, depending on whether
private browsing mode or the privacy.thirdparty.isolate preference is
active, and implement it in GetCookieStringCommon and FindCookie, where it
counts. One reservation remains: how to convince FindCookie callers to test
the condition and pass a nullptr when double-key logic is disabled.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "nsUTF16ToUnicode.h"
     7 #include "nsCharTraits.h"
     8 #include "mozilla/Endian.h"
    10 enum {
    11   STATE_NORMAL = 0,
    12   STATE_HALF_CODE_POINT = 1,
    13   STATE_FIRST_CALL = 2,
    14   STATE_SECOND_BYTE = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
    15   STATE_ODD_SURROGATE_PAIR = 4
    16 };
    18 nsresult
    19 nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
    20                                             int32_t * aSrcLength,
    21                                             char16_t * aDest,
    22                                             int32_t * aDestLength,
    23                                             bool aSwapBytes)
    24 {
    25   const char* src = aSrc;
    26   const char* srcEnd = aSrc + *aSrcLength;
    27   char16_t* dest = aDest;
    28   char16_t* destEnd = aDest + *aDestLength;
    29   char16_t oddHighSurrogate;
    31   switch(mState) {
    32     case STATE_FIRST_CALL:
    33       NS_ASSERTION(*aSrcLength > 1, "buffer too short");
    34       src+=2;
    35       mState = STATE_NORMAL;
    36       break;
    38     case STATE_SECOND_BYTE:
    39       NS_ASSERTION(*aSrcLength > 0, "buffer too short");
    40       src++;
    41       mState = STATE_NORMAL;
    42       break;
    44     case STATE_ODD_SURROGATE_PAIR:
    45       if (*aDestLength < 2)
    46         goto error;
    47       else {
    48         *dest++ = mOddHighSurrogate;
    49         *dest++ = mOddLowSurrogate;
    50         mOddHighSurrogate = mOddLowSurrogate = 0;
    51         mState = STATE_NORMAL;
    52       }
    53       break;
    55     case STATE_NORMAL:
    56     case STATE_HALF_CODE_POINT:
    57     default:
    58       break;
    59   }
    61   oddHighSurrogate = mOddHighSurrogate;
    63   if (src == srcEnd) {
    64     *aDestLength = dest - aDest;
    65     return (mState != STATE_NORMAL || oddHighSurrogate) ?
    66            NS_OK_UDEC_MOREINPUT : NS_OK;
    67   }
    69   const char* srcEvenEnd;
    71   char16_t u;
    72   if (mState == STATE_HALF_CODE_POINT) {
    73     if (dest == destEnd)
    74       goto error;
    76     // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
    77     // previous run while the 2nd byte has to come from |*src|.
    78     mState = STATE_NORMAL;
    79 #if MOZ_BIG_ENDIAN
    80     u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
    81 #else
    82     u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
    83 #endif
    84     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    85     goto have_codepoint;
    86   } else {
    87     srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
    88   }
    90   while (src != srcEvenEnd) {
    91     if (dest == destEnd)
    92       goto error;
    94 #if !defined(__sparc__) && !defined(__arm__)
    95     u = *(const char16_t*)src;
    96 #else
    97     memcpy(&u, src, 2);
    98 #endif
    99     src += 2;
   101 have_codepoint:
   102     if (aSwapBytes)
   103       u = u << 8 | u >> 8;
   105     if (!IS_SURROGATE(u)) {
   106       if (oddHighSurrogate) {
   107         if (mErrBehavior == kOnError_Signal) {
   108           goto error2;
   109         }
   110         *dest++ = UCS2_REPLACEMENT_CHAR;
   111         if (dest == destEnd)
   112           goto error;
   113         oddHighSurrogate = 0;
   114       }
   115       *dest++ = u;
   116     } else if (NS_IS_HIGH_SURROGATE(u)) {
   117       if (oddHighSurrogate) {
   118         if (mErrBehavior == kOnError_Signal) {
   119           goto error2;
   120         }
   121         *dest++ = UCS2_REPLACEMENT_CHAR;
   122         if (dest == destEnd)
   123           goto error;
   124       }
   125       oddHighSurrogate = u;
   126     }
   127     else /* if (NS_IS_LOW_SURROGATE(u)) */ {
   128       if (oddHighSurrogate && *aDestLength > 1) {
   129         if (dest + 1 >= destEnd) {
   130           mOddLowSurrogate = u;
   131           mOddHighSurrogate = oddHighSurrogate;
   132           mState = STATE_ODD_SURROGATE_PAIR;
   133           goto error;
   134         }
   135         *dest++ = oddHighSurrogate;
   136         *dest++ = u;
   137       } else {
   138         if (mErrBehavior == kOnError_Signal) {
   139           goto error2;
   140         }
   141         *dest++ = UCS2_REPLACEMENT_CHAR;
   142       }
   143       oddHighSurrogate = 0;
   144     }
   145   }
   146   if (src != srcEnd) {
   147     // store the lead byte of a 16-bit unit for the next run.
   148     mOddByte = *src++;
   149     mState = STATE_HALF_CODE_POINT;
   150   }
   152   mOddHighSurrogate = oddHighSurrogate;
   154   *aDestLength = dest - aDest;
   155   *aSrcLength =  src  - aSrc; 
   156   return (mState != STATE_NORMAL || oddHighSurrogate) ?
   157          NS_OK_UDEC_MOREINPUT : NS_OK;
   159 error:
   160   *aDestLength = dest - aDest;
   161   *aSrcLength =  src  - aSrc; 
   162   return  NS_OK_UDEC_MOREOUTPUT;
   164 error2:
   165   *aDestLength = dest - aDest;
   166   *aSrcLength = --src - aSrc; 
   167   return  NS_ERROR_ILLEGAL_INPUT;
   168 }
   170 NS_IMETHODIMP
   171 nsUTF16ToUnicodeBase::Reset()
   172 {
   173   mState = STATE_FIRST_CALL;
   174   mOddByte = 0;
   175   mOddHighSurrogate = 0;
   176   mOddLowSurrogate = 0;
   177   return NS_OK;
   178 }
   180 NS_IMETHODIMP
   181 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, 
   182                                    int32_t * aDestLength)
   183 {
   184   // the left-over data of the previous run have to be taken into account.
   185   *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2;
   186   if (mOddHighSurrogate)
   187     (*aDestLength)++;
   188   if (mOddLowSurrogate)
   189     (*aDestLength)++;
   190   return NS_OK;
   191 }
   194 NS_IMETHODIMP
   195 nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   196                             char16_t * aDest, int32_t * aDestLength)
   197 {
   198   switch (mState) {
   199     case STATE_FIRST_CALL:
   200       if (*aSrcLength < 2) {
   201         if (*aSrcLength < 1) {
   202           *aDestLength = 0;
   203           return NS_OK;
   204         }
   205         if (uint8_t(*aSrc) != 0xFE) {
   206           mState = STATE_NORMAL;
   207           break;
   208         }
   209         *aDestLength = 0;
   210         mState = STATE_SECOND_BYTE;
   211         return NS_OK_UDEC_MOREINPUT;
   212       }
   213 #if MOZ_LITTLE_ENDIAN
   214       // on LE machines, BE BOM is 0xFFFE
   215       if (0xFFFE != *((char16_t*)aSrc)) {
   216         mState = STATE_NORMAL;
   217       }
   218 #else
   219       if (0xFEFF != *((char16_t*)aSrc)) {
   220         mState = STATE_NORMAL;
   221       }
   222 #endif
   223       break;
   225     case STATE_SECOND_BYTE:
   226       if (*aSrcLength < 1) {
   227         *aDestLength = 0;
   228         return NS_OK_UDEC_MOREINPUT;
   229       }
   230       if (uint8_t(*aSrc) != 0xFF) {
   231         mOddByte = 0xFE;
   232         mState = STATE_HALF_CODE_POINT;
   233       }
   234       break;
   235   }
   237   return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   238                                bool(MOZ_LITTLE_ENDIAN));
   239 }
   241 NS_IMETHODIMP
   242 nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   243                             char16_t * aDest, int32_t * aDestLength)
   244 {
   245   switch (mState) {
   246     case STATE_FIRST_CALL:
   247       if (*aSrcLength < 2) {
   248         if (*aSrcLength < 1) {
   249           *aDestLength = 0;
   250           return NS_OK;
   251         }
   252         if (uint8_t(*aSrc) != 0xFF) {
   253           mState = STATE_NORMAL;
   254           break;
   255         }
   256         *aDestLength = 0;
   257         mState = STATE_SECOND_BYTE;
   258         return NS_OK_UDEC_MOREINPUT;
   259       }
   260 #if MOZ_BIG_ENDIAN
   261       // on BE machines, LE BOM is 0xFFFE
   262       if (0xFFFE != *((char16_t*)aSrc)) {
   263         mState = STATE_NORMAL;
   264       }
   265 #else
   266       if (0xFEFF != *((char16_t*)aSrc)) {
   267         mState = STATE_NORMAL;
   268       }
   269 #endif
   270       break;
   272     case STATE_SECOND_BYTE:
   273       if (*aSrcLength < 1) {
   274         *aDestLength = 0;
   275         return NS_OK_UDEC_MOREINPUT;
   276       }
   277       if (uint8_t(*aSrc) != 0xFE) {
   278         mOddByte = 0xFF;
   279         mState = STATE_HALF_CODE_POINT;
   280       }
   281       break;
   282   }
   284   return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   285                                bool(MOZ_BIG_ENDIAN));
   286 }
   288 NS_IMETHODIMP
   289 nsUTF16ToUnicode::Reset()
   290 {
   291   mEndian = kUnknown;
   292   mFoundBOM = false;
   293   return nsUTF16ToUnicodeBase::Reset();
   294 }
   296 NS_IMETHODIMP
   297 nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
   298                           char16_t * aDest, int32_t * aDestLength)
   299 {
   300     if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
   301     {
   302       nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
   303       *aSrcLength=0;
   304       *aDestLength=0;
   305       return res;
   306     }
   307     if(STATE_FIRST_CALL == mState) // first time called
   308     {
   309       // check if BOM (0xFEFF) is at the beginning, remove it if found, and
   310       // set mEndian accordingly.
   311       if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
   312         mEndian = kLittleEndian;
   313         mFoundBOM = true;
   314       }
   315       else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
   316         mEndian = kBigEndian;
   317         mFoundBOM = true;
   318       }
   319       // BOM is not found, but we can use a simple heuristic to determine
   320       // the endianness. Assume the first character is [U+0001, U+00FF].
   321       // Not always valid, but it's very likely to hold for html/xml/css. 
   322       else if(!aSrc[0] && aSrc[1]) {  // 0x00 0xhh (hh != 00)
   323         mState = STATE_NORMAL;
   324         mEndian = kBigEndian;
   325       }
   326       else if(aSrc[0] && !aSrc[1]) {  // 0xhh 0x00 (hh != 00)
   327         mState = STATE_NORMAL;
   328         mEndian = kLittleEndian;
   329       }
   330       else { // Neither BOM nor 'plausible' byte patterns at the beginning.
   331              // Just assume it's BE (following Unicode standard)
   332              // and let the garbage show up in the browser. (security concern?)
   333              // (bug 246194)
   334         mState = STATE_NORMAL;
   335         mEndian = kBigEndian;
   336       }
   337     }
   339     nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
   340 #if MOZ_BIG_ENDIAN
   341                                         (mEndian == kLittleEndian)
   342 #else
   343                                         (mEndian == kBigEndian)
   344 #endif
   345                                         );
   347     // If BOM is not found and we're to return NS_OK, signal that BOM
   348     // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
   349     return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
   350 }

mercurial