xpcom/string/public/nsUTF8Utils.h

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     2 /* This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     5 #ifndef nsUTF8Utils_h_
     6 #define nsUTF8Utils_h_
     8 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
     9 // file will provide signatures for the Mozilla abstract string types. It will
    10 // use XPCOM assertion/debugging macros, etc.
    12 #include "nscore.h"
    13 #include "mozilla/SSE.h"
    15 #include "nsCharTraits.h"
    17 class UTF8traits
    18   {
    19     public:
    20       static bool isASCII(char c) { return (c & 0x80) == 0x00; }
    21       static bool isInSeq(char c) { return (c & 0xC0) == 0x80; }
    22       static bool is2byte(char c) { return (c & 0xE0) == 0xC0; }
    23       static bool is3byte(char c) { return (c & 0xF0) == 0xE0; }
    24       static bool is4byte(char c) { return (c & 0xF8) == 0xF0; }
    25       static bool is5byte(char c) { return (c & 0xFC) == 0xF8; }
    26       static bool is6byte(char c) { return (c & 0xFE) == 0xFC; }
    27   };
/**
 * Extract the next UCS-4 character from the buffer and return it.  The
 * pointer passed in is advanced to the start of the next character in the
 * buffer.  The err parameter must be non-null: it is set to true if an
 * error occurred (in which case 0 is returned and the pointer is left
 * unchanged) and to false otherwise.  Overlong sequences, surrogates and
 * code points outside the Unicode range are not reported through err; they
 * are returned as UCS2_REPLACEMENT_CHAR.
 */
    37 class UTF8CharEnumerator
    38 {
    39 public:
    40   static uint32_t NextChar(const char **buffer, const char *end,
    41                            bool *err)
    42   {
    43     NS_ASSERTION(buffer && *buffer, "null buffer!");
    45     const char *p = *buffer;
    46     *err = false;
    48     if (p >= end)
    49       {
    50         *err = true;
    52         return 0;
    53       }
    55     char c = *p++;
    57     if ( UTF8traits::isASCII(c) )
    58       {
    59         *buffer = p;
    60         return c;
    61       }
    63     uint32_t ucs4;
    64     uint32_t minUcs4;
    65     int32_t state = 0;
    67     if (!CalcState(c, ucs4, minUcs4, state)) {
    68         NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
    69         *err = true;
    71         return 0;
    72     }
    74     while ( state-- )
    75       {
    76         if (p == end)
    77           {
    78             *err = true;
    80             return 0;
    81           }
    83         c = *p++;
    85         if (!AddByte(c, state, ucs4))
    86           {
    87             *err = true;
    89             return 0;
    90           }
    91       }
    93       if ( ucs4 < minUcs4 )
    94         {
    95           // Overlong sequence
    96           ucs4 = UCS2_REPLACEMENT_CHAR;
    97         }
    98       else if ( ucs4 >= 0xD800 &&
    99                 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
   100         {
   101           // Surrogates and code points outside the Unicode range.
   102           ucs4 = UCS2_REPLACEMENT_CHAR;
   103         }
   105     *buffer = p;
   106     return ucs4;
   107   }
   109 private:
   110   static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
   111                           int32_t& state)
   112   {
   113     if ( UTF8traits::is2byte(c) )
   114       {
   115         ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
   116         state = 1;
   117         minUcs4 = 0x00000080;
   118       }
   119     else if ( UTF8traits::is3byte(c) )
   120       {
   121         ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
   122         state = 2;
   123         minUcs4 = 0x00000800;
   124       }
   125     else if ( UTF8traits::is4byte(c) )
   126       {
   127         ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
   128         state = 3;
   129         minUcs4 = 0x00010000;
   130       }
   131     else if ( UTF8traits::is5byte(c) )
   132       {
   133         ucs4 = (uint32_t(c) << 24) & 0x03000000L;
   134         state = 4;
   135         minUcs4 = 0x00200000;
   136       }
   137     else if ( UTF8traits::is6byte(c) )
   138       {
   139         ucs4 = (uint32_t(c) << 30) & 0x40000000L;
   140         state = 5;
   141         minUcs4 = 0x04000000;
   142       }
   143     else
   144       {
   145         return false;
   146       }
   148     return true;
   149   }
   151   static bool AddByte(char c, int32_t state, uint32_t& ucs4)
   152   {
   153     if ( UTF8traits::isInSeq(c) )
   154       {
   155         int32_t shift = state * 6;
   156         ucs4 |= (uint32_t(c) & 0x3F) << shift;
   157         return true;
   158       }
   160     return false;
   161   }
   162 };
   165 /**
   166  * Extract the next UCS-4 character from the buffer and return it.  The
   167  * pointer passed in is advanced to the start of the next character in the
   168  * buffer.  If non-null, the err parameter is filled in if an error occurs.
   169  */
   172 class UTF16CharEnumerator
   173 {
   174 public:
   175   static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
   176                            bool *err = nullptr)
   177   {
   178     NS_ASSERTION(buffer && *buffer, "null buffer!");
   180     const char16_t *p = *buffer;
   182     if (p >= end)
   183       {
   184         NS_ERROR("No input to work with");
   185         if (err)
   186           *err = true;
   188         return 0;
   189       }
   191     char16_t c = *p++;
   193     if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
   194       {
   195         if (err)
   196           *err = false;
   197         *buffer = p;
   198         return c;
   199       }
   200     else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
   201       {
   202         if (p == end)
   203           {
   204             // Found a high surrogate the end of the buffer. Flag this
   205             // as an error and return the Unicode replacement
   206             // character 0xFFFD.
   208             NS_WARNING("Unexpected end of buffer after high surrogate");
   210             if (err)
   211               *err = true;
   212             *buffer = p;
   213             return 0xFFFD;
   214           }
   216         // D800- DBFF - High Surrogate
   217         char16_t h = c;
   219         c = *p++;
   221         if (NS_IS_LOW_SURROGATE(c))
   222           {
   223             // DC00- DFFF - Low Surrogate
   224             // N = (H - D800) *400 + 10000 + (L - DC00)
   225             uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
   226             if (err)
   227               *err = false;
   228             *buffer = p;
   229             return ucs4;
   230           }
   231         else
   232           {
   233             // Found a high surrogate followed by something other than
   234             // a low surrogate. Flag this as an error and return the
   235             // Unicode replacement character 0xFFFD.  Note that the
   236             // pointer to the next character points to the second 16-bit
   237             // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
   238             // only the first code unit of an illegal sequence must be
   239             // treated as an illegally terminated code unit sequence
   240             // (also Chapter 3 D91, "isolated [not paired and ill-formed]
   241             // UTF-16 code units in the range D800..DFFF are ill-formed").
   242             NS_WARNING("got a High Surrogate but no low surrogate");
   244             if (err)
   245               *err = true;
   246             *buffer = p - 1;
   247             return 0xFFFD;
   248           }
   249       }
   250     else // U+DC00 - U+DFFF
   251       {
   252         // DC00- DFFF - Low Surrogate
   254         // Found a low surrogate w/o a preceding high surrogate. Flag
   255         // this as an error and return the Unicode replacement
   256         // character 0xFFFD.
   258         NS_WARNING("got a low Surrogate but no high surrogate");
   259         if (err)
   260           *err = true;
   261         *buffer = p;
   262         return 0xFFFD;
   263       }
   265     if (err)
   266       *err = true;
   267     return 0;
   268   }
   269 };
   272 /**
   273  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
   274  * UTF-8 to UTF-16
   275  */
   276 class ConvertUTF8toUTF16
   277   {
   278     public:
   279       typedef char      value_type;
   280       typedef char16_t buffer_type;
   282     ConvertUTF8toUTF16( buffer_type* aBuffer )
   283         : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
   285     size_t Length() const { return mBuffer - mStart; }
   287     bool ErrorEncountered() const { return mErrorEncountered; }
   289     void write( const value_type* start, uint32_t N )
   290       {
   291         if ( mErrorEncountered )
   292           return;
   294         // algorithm assumes utf8 units won't
   295         // be spread across fragments
   296         const value_type* p = start;
   297         const value_type* end = start + N;
   298         buffer_type* out = mBuffer;
   299         for ( ; p != end /* && *p */; )
   300           {
   301             bool err;
   302             uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
   304             if ( err )
   305               {
   306                 mErrorEncountered = true;
   307                 mBuffer = out;
   308                 return;
   309               }
   311             if ( ucs4 >= PLANE1_BASE )
   312               {
   313                 *out++ = (buffer_type)H_SURROGATE(ucs4);
   314                 *out++ = (buffer_type)L_SURROGATE(ucs4);
   315               }
   316             else
   317               {
   318                 *out++ = ucs4;
   319               }
   320           }
   321         mBuffer = out;
   322       }
   324     void write_terminator()
   325       {
   326         *mBuffer = buffer_type(0);
   327       }
   329     private:
   330       buffer_type* const mStart;
   331       buffer_type* mBuffer;
   332       bool mErrorEncountered;
   333   };
   335 /**
   336  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
   337  * the length of the UTF-16 string equivalent to a UTF-8 string.
   338  */
   339 class CalculateUTF8Length
   340   {
   341     public:
   342       typedef char value_type;
   344     CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
   346     size_t Length() const { return mLength; }
   348     void write( const value_type* start, uint32_t N )
   349       {
   350           // ignore any further requests
   351         if ( mErrorEncountered )
   352             return;
   354         // algorithm assumes utf8 units won't
   355         // be spread across fragments
   356         const value_type* p = start;
   357         const value_type* end = start + N;
   358         for ( ; p < end /* && *p */; ++mLength )
   359           {
   360             if ( UTF8traits::isASCII(*p) )
   361                 p += 1;
   362             else if ( UTF8traits::is2byte(*p) )
   363                 p += 2;
   364             else if ( UTF8traits::is3byte(*p) )
   365                 p += 3;
   366             else if ( UTF8traits::is4byte(*p) ) {
   367                 // Because a UTF-8 sequence of 4 bytes represents a codepoint
   368                 // greater than 0xFFFF, it will become a surrogate pair in the
   369                 // UTF-16 string, so add 1 more to mLength.
   370                 // This doesn't happen with is5byte and is6byte because they
   371                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
   372                 // converted to a single replacement character.
   374                 // However, there is one case when a 4 byte UTF-8 sequence will
   375                 // only generate 2 UTF-16 bytes. If we have a properly encoded
   376                 // sequence, but with an invalid value (too small or too big),
   377                 // that will result in a replacement character being written
   378                 // This replacement character is encoded as just 1 single
   379                 // UTF-16 character, which is 2 bytes.
   381                 // The below code therefore only adds 1 to mLength if the UTF8
   382                 // data will produce a decoded character which is greater than
   383                 // or equal to 0x010000 and less than 0x0110000.
   385                 // A 4byte UTF8 character is encoded as
   386                 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   387                 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
   388                 // map to bit 17-21 in the final result. If these bits are
   389                 // between 0x01 and 0x11, that means that the final result is
   390                 // between 0x010000 and 0x110000. The below code reads these
   391                 // bits out and assigns them to c, but shifted up 4 bits to
   392                 // avoid having to shift twice.
   394                 // It doesn't matter what to do in the case where p + 4 > end
   395                 // since no UTF16 characters will be written in that case by
   396                 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
   397                 // any of the surrogate bits are wrong since no UTF16
   398                 // characters will be written in that case either.
   400                 if (p + 4 <= end) {
   401                   uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
   402                                ((uint32_t)(p[1] & 0x30));
   403                   if (c >= 0x010 && c < 0x110)
   404                     ++mLength;
   405                 }
   407                 p += 4;
   408             }
   409             else if ( UTF8traits::is5byte(*p) )
   410                 p += 5;
   411             else if ( UTF8traits::is6byte(*p) )
   412                 p += 6;
   413             else // error
   414               {
   415                 ++mLength; // to account for the decrement below
   416                 break;
   417               }
   418           }
   419         if ( p != end )
   420           {
   421             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
   422             --mLength; // The last multi-byte char wasn't complete, discard it.
   423             mErrorEncountered = true;
   424           }
   425       }
   427     private:
   428       size_t mLength;
   429       bool mErrorEncountered;
   430   };
   432 /**
   433  * A character sink (see |copy_string| in nsAlgorithm.h) for
   434  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
   435  * (0xEFBFBD in UTF-8).
   436  */
   437 class ConvertUTF16toUTF8
   438   {
   439     public:
   440       typedef char16_t value_type;
   441       typedef char      buffer_type;
   443     // The error handling here is more lenient than that in
   444     // |ConvertUTF8toUTF16|, but it's that way for backwards
   445     // compatibility.
   447     ConvertUTF16toUTF8( buffer_type* aBuffer )
   448         : mStart(aBuffer), mBuffer(aBuffer) {}
   450     size_t Size() const { return mBuffer - mStart; }
   452     void write( const value_type* start, uint32_t N )
   453       {
   454         buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
   456         for (const value_type *p = start, *end = start + N; p < end; ++p )
   457           {
   458             value_type c = *p;
   459             if (! (c & 0xFF80)) // U+0000 - U+007F
   460               {
   461                 *out++ = (char)c;
   462               }
   463             else if (! (c & 0xF800)) // U+0100 - U+07FF
   464               {
   465                 *out++ = 0xC0 | (char)(c >> 6);
   466                 *out++ = 0x80 | (char)(0x003F & c);
   467               }
   468             else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
   469               {
   470                 *out++ = 0xE0 | (char)(c >> 12);
   471                 *out++ = 0x80 | (char)(0x003F & (c >> 6));
   472                 *out++ = 0x80 | (char)(0x003F & c );
   473               }
   474             else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
   475               {
   476                 // D800- DBFF - High Surrogate
   477                 value_type h = c;
   479                 ++p;
   480                 if (p == end)
   481                   {
   482                     // Treat broken characters as the Unicode
   483                     // replacement character 0xFFFD (0xEFBFBD in
   484                     // UTF-8)
   485                     *out++ = '\xEF';
   486                     *out++ = '\xBF';
   487                     *out++ = '\xBD';
   489                     NS_WARNING("String ending in half a surrogate pair!");
   491                     break;
   492                   }
   493                 c = *p;
   495                 if (NS_IS_LOW_SURROGATE(c))
   496                   {
   497                     // DC00- DFFF - Low Surrogate
   498                     // N = (H - D800) *400 + 10000 + ( L - DC00 )
   499                     uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
   501                     // 0001 0000-001F FFFF
   502                     *out++ = 0xF0 | (char)(ucs4 >> 18);
   503                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
   504                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
   505                     *out++ = 0x80 | (char)(0x003F & ucs4);
   506                   }
   507                 else
   508                   {
   509                     // Treat broken characters as the Unicode
   510                     // replacement character 0xFFFD (0xEFBFBD in
   511                     // UTF-8)
   512                     *out++ = '\xEF';
   513                     *out++ = '\xBF';
   514                     *out++ = '\xBD';
   516                     // The pointer to the next character points to the second
   517                     // 16-bit value, not beyond it, as per Unicode 5.0.0
   518                     // Chapter 3 C10, only the first code unit of an illegal
   519                     // sequence must be treated as an illegally terminated
   520                     // code unit sequence (also Chapter 3 D91, "isolated [not
   521                     // paired and ill-formed] UTF-16 code units in the range
   522                     // D800..DFFF are ill-formed").
   523                     p--;
   525                     NS_WARNING("got a High Surrogate but no low surrogate");
   526                   }
   527               }
   528             else // U+DC00 - U+DFFF
   529               {
   530                 // Treat broken characters as the Unicode replacement
   531                 // character 0xFFFD (0xEFBFBD in UTF-8)
   532                 *out++ = '\xEF';
   533                 *out++ = '\xBF';
   534                 *out++ = '\xBD';
   536                 // DC00- DFFF - Low Surrogate
   537                 NS_WARNING("got a low Surrogate but no high surrogate");
   538               }
   539           }
   541         mBuffer = out;
   542       }
   544     void write_terminator()
   545       {
   546         *mBuffer = buffer_type(0);
   547       }
   549     private:
   550       buffer_type* const mStart;
   551       buffer_type* mBuffer;
   552   };
   554 /**
   555  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
   556  * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
   557  * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
   558  */
   559 class CalculateUTF8Size
   560   {
   561     public:
   562       typedef char16_t value_type;
   564     CalculateUTF8Size()
   565       : mSize(0) { }
   567     size_t Size() const { return mSize; }
   569     void write( const value_type* start, uint32_t N )
   570       {
   571         // Assume UCS2 surrogate pairs won't be spread across fragments.
   572         for (const value_type *p = start, *end = start + N; p < end; ++p )
   573           {
   574             value_type c = *p;
   575             if (! (c & 0xFF80)) // U+0000 - U+007F
   576               mSize += 1;
   577             else if (! (c & 0xF800)) // U+0100 - U+07FF
   578               mSize += 2;
   579             else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
   580               mSize += 3;
   581             else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
   582               {
   583                 ++p;
   584                 if (p == end)
   585                   {
   586                     // Treat broken characters as the Unicode
   587                     // replacement character 0xFFFD (0xEFBFBD in
   588                     // UTF-8)
   589                     mSize += 3;
   591                     NS_WARNING("String ending in half a surrogate pair!");
   593                     break;
   594                   }
   595                 c = *p;
   597                 if (0xDC00 == (0xFC00 & c))
   598                   mSize += 4;
   599                 else
   600                   {
   601                     // Treat broken characters as the Unicode
   602                     // replacement character 0xFFFD (0xEFBFBD in
   603                     // UTF-8)
   604                     mSize += 3;
   606                     // The next code unit is the second 16-bit value, not
   607                     // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
   608                     // only the first code unit of an illegal sequence must
   609                     // be treated as an illegally terminated code unit
   610                     // sequence (also Chapter 3 D91, "isolated [not paired and
   611                     // ill-formed] UTF-16 code units in the range D800..DFFF
   612                     // are ill-formed").
   613                     p--;
   615                     NS_WARNING("got a high Surrogate but no low surrogate");
   616                   }
   617               }
   618             else // U+DC00 - U+DFFF
   619               {
   620                 // Treat broken characters as the Unicode replacement
   621                 // character 0xFFFD (0xEFBFBD in UTF-8)
   622                 mSize += 3;
   624                 NS_WARNING("got a low Surrogate but no high surrogate");
   625               }
   626           }
   627       }
   629     private:
   630       size_t mSize;
   631   };
   633 #ifdef MOZILLA_INTERNAL_API
   634 /**
   635  * A character sink that performs a |reinterpret_cast|-style conversion
   636  * from char to char16_t.
   637  */
   638 class LossyConvertEncoding8to16
   639   {
   640     public:
   641       typedef char      value_type;
   642       typedef char      input_type;
   643       typedef char16_t output_type;
   645     public:
   646       LossyConvertEncoding8to16( char16_t* aDestination ) :
   647         mDestination(aDestination) { }
   649       void
   650       write( const char* aSource, uint32_t aSourceLength )
   651         {
   652 #ifdef MOZILLA_MAY_SUPPORT_SSE2
   653           if (mozilla::supports_sse2())
   654             {
   655               write_sse2(aSource, aSourceLength);
   656               return;
   657             }
   658 #endif
   659           const char* done_writing = aSource + aSourceLength;
   660           while ( aSource < done_writing )
   661             *mDestination++ = (char16_t)(unsigned char)(*aSource++);
   662         }
   664       void
   665       write_sse2( const char* aSource, uint32_t aSourceLength );
   667       void
   668       write_terminator()
   669         {
   670           *mDestination = (char16_t)(0);
   671         }
   673     private:
   674       char16_t* mDestination;
   675   };
   677 /**
   678  * A character sink that performs a |reinterpret_cast|-style conversion
   679  * from char16_t to char.
   680  */
   681 class LossyConvertEncoding16to8
   682   {
   683     public:
   684       typedef char16_t value_type;
   685       typedef char16_t input_type;
   686       typedef char      output_type;
   688       LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }
   690       void
   691       write( const char16_t* aSource, uint32_t aSourceLength)
   692         {
   693 #ifdef MOZILLA_MAY_SUPPORT_SSE2
   694           if (mozilla::supports_sse2())
   695             {
   696               write_sse2(aSource, aSourceLength);
   697               return;
   698             }
   699 #endif
   700             const char16_t* done_writing = aSource + aSourceLength;
   701             while ( aSource < done_writing )
   702               *mDestination++ = (char)(*aSource++);
   703         }
   705 #ifdef MOZILLA_MAY_SUPPORT_SSE2
   706       void
   707       write_sse2( const char16_t* aSource, uint32_t aSourceLength );
   708 #endif
   710       void
   711       write_terminator()
   712         {
   713           *mDestination = '\0';
   714         }
   716     private:
   717       char *mDestination;
   718   };
   719 #endif // MOZILLA_INTERNAL_API
   721 #endif /* !defined(nsUTF8Utils_h_) */

mercurial