The Tor Browser: xpcom/string/public/nsUTF8Utils.h@129ffea94266

Conditionally enable double-key logic according to private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts. There are still
some reservations about how to convince FindCookie users to test the
condition and pass a nullptr when double-key logic is disabled.

     1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */

     2 /* This Source Code Form is subject to the terms of the Mozilla Public

     3  * License, v. 2.0. If a copy of the MPL was not distributed with this

     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

     5 #ifndef nsUTF8Utils_h_

     6 #define nsUTF8Utils_h_

     8 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this

     9 // file will provide signatures for the Mozilla abstract string types. It will

    10 // use XPCOM assertion/debugging macros, etc.

    12 #include "nscore.h"

    13 #include "mozilla/SSE.h"

    15 #include "nsCharTraits.h"

    17 class UTF8traits

    18   {

    19     public:

    20       static bool isASCII(char c) { return (c & 0x80) == 0x00; }

    21       static bool isInSeq(char c) { return (c & 0xC0) == 0x80; }

    22       static bool is2byte(char c) { return (c & 0xE0) == 0xC0; }

    23       static bool is3byte(char c) { return (c & 0xF0) == 0xE0; }

    24       static bool is4byte(char c) { return (c & 0xF8) == 0xF0; }

    25       static bool is5byte(char c) { return (c & 0xFC) == 0xF8; }

    26       static bool is6byte(char c) { return (c & 0xFE) == 0xFC; }

    27   };

    29 /**

    30  * Extract the next UCS-4 character from the buffer and return it.  The

    31  * pointer passed in is advanced to the start of the next character in the

    32  * buffer.  If non-null, the parameters err and overlong are filled in to

    33  * indicate that the character was represented by an overlong sequence, or

    34  * that an error occurred.

    35  */

    37 class UTF8CharEnumerator

    38 {

    39 public:

    40   static uint32_t NextChar(const char **buffer, const char *end,

    41                            bool *err)

    42   {

    43     NS_ASSERTION(buffer && *buffer, "null buffer!");

    45     const char *p = *buffer;

    46     *err = false;

    48     if (p >= end)

    49       {

    50         *err = true;

    52         return 0;

    53       }

    55     char c = *p++;

    57     if ( UTF8traits::isASCII(c) )

    58       {

    59         *buffer = p;

    60         return c;

    61       }

    63     uint32_t ucs4;

    64     uint32_t minUcs4;

    65     int32_t state = 0;

    67     if (!CalcState(c, ucs4, minUcs4, state)) {

    68         NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");

    69         *err = true;

    71         return 0;

    72     }

    74     while ( state-- )

    75       {

    76         if (p == end)

    77           {

    78             *err = true;

    80             return 0;

    81           }

    83         c = *p++;

    85         if (!AddByte(c, state, ucs4))

    86           {

    87             *err = true;

    89             return 0;

    90           }

    91       }

    93       if ( ucs4 < minUcs4 )

    94         {

    95           // Overlong sequence

    96           ucs4 = UCS2_REPLACEMENT_CHAR;

    97         }

    98       else if ( ucs4 >= 0xD800 &&

    99                 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))

   100         {

   101           // Surrogates and code points outside the Unicode range.

   102           ucs4 = UCS2_REPLACEMENT_CHAR;

   103         }

   105     *buffer = p;

   106     return ucs4;

   107   }

   109 private:

   110   static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,

   111                           int32_t& state)

   112   {

   113     if ( UTF8traits::is2byte(c) )

   114       {

   115         ucs4 = (uint32_t(c) << 6) & 0x000007C0L;

   116         state = 1;

   117         minUcs4 = 0x00000080;

   118       }

   119     else if ( UTF8traits::is3byte(c) )

   120       {

   121         ucs4 = (uint32_t(c) << 12) & 0x0000F000L;

   122         state = 2;

   123         minUcs4 = 0x00000800;

   124       }

   125     else if ( UTF8traits::is4byte(c) )

   126       {

   127         ucs4 = (uint32_t(c) << 18) & 0x001F0000L;

   128         state = 3;

   129         minUcs4 = 0x00010000;

   130       }

   131     else if ( UTF8traits::is5byte(c) )

   132       {

   133         ucs4 = (uint32_t(c) << 24) & 0x03000000L;

   134         state = 4;

   135         minUcs4 = 0x00200000;

   136       }

   137     else if ( UTF8traits::is6byte(c) )

   138       {

   139         ucs4 = (uint32_t(c) << 30) & 0x40000000L;

   140         state = 5;

   141         minUcs4 = 0x04000000;

   142       }

   143     else

   144       {

   145         return false;

   146       }

   148     return true;

   149   }

   151   static bool AddByte(char c, int32_t state, uint32_t& ucs4)

   152   {

   153     if ( UTF8traits::isInSeq(c) )

   154       {

   155         int32_t shift = state * 6;

   156         ucs4 |= (uint32_t(c) & 0x3F) << shift;

   157         return true;

   158       }

   160     return false;

   161   }

   162 };

   165 /**

   166  * Extract the next UCS-4 character from the buffer and return it.  The

   167  * pointer passed in is advanced to the start of the next character in the

   168  * buffer.  If non-null, the err parameter is filled in if an error occurs.

   169  */

   172 class UTF16CharEnumerator

   173 {

   174 public:

   175   static uint32_t NextChar(const char16_t **buffer, const char16_t *end,

   176                            bool *err = nullptr)

   177   {

   178     NS_ASSERTION(buffer && *buffer, "null buffer!");

   180     const char16_t *p = *buffer;

   182     if (p >= end)

   183       {

   184         NS_ERROR("No input to work with");

   185         if (err)

   186           *err = true;

   188         return 0;

   189       }

   191     char16_t c = *p++;

   193     if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF

   194       {

   195         if (err)

   196           *err = false;

   197         *buffer = p;

   198         return c;

   199       }

   200     else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF

   201       {

   202         if (p == end)

   203           {

   204             // Found a high surrogate the end of the buffer. Flag this

   205             // as an error and return the Unicode replacement

   206             // character 0xFFFD.

   208             NS_WARNING("Unexpected end of buffer after high surrogate");

   210             if (err)

   211               *err = true;

   212             *buffer = p;

   213             return 0xFFFD;

   214           }

   216         // D800- DBFF - High Surrogate

   217         char16_t h = c;

   219         c = *p++;

   221         if (NS_IS_LOW_SURROGATE(c))

   222           {

   223             // DC00- DFFF - Low Surrogate

   224             // N = (H - D800) *400 + 10000 + (L - DC00)

   225             uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);

   226             if (err)

   227               *err = false;

   228             *buffer = p;

   229             return ucs4;

   230           }

   231         else

   232           {

   233             // Found a high surrogate followed by something other than

   234             // a low surrogate. Flag this as an error and return the

   235             // Unicode replacement character 0xFFFD.  Note that the

   236             // pointer to the next character points to the second 16-bit

   237             // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,

   238             // only the first code unit of an illegal sequence must be

   239             // treated as an illegally terminated code unit sequence

   240             // (also Chapter 3 D91, "isolated [not paired and ill-formed]

   241             // UTF-16 code units in the range D800..DFFF are ill-formed").

   242             NS_WARNING("got a High Surrogate but no low surrogate");

   244             if (err)

   245               *err = true;

   246             *buffer = p - 1;

   247             return 0xFFFD;

   248           }

   249       }

   250     else // U+DC00 - U+DFFF

   251       {

   252         // DC00- DFFF - Low Surrogate

   254         // Found a low surrogate w/o a preceding high surrogate. Flag

   255         // this as an error and return the Unicode replacement

   256         // character 0xFFFD.

   258         NS_WARNING("got a low Surrogate but no high surrogate");

   259         if (err)

   260           *err = true;

   261         *buffer = p;

   262         return 0xFFFD;

   263       }

   265     if (err)

   266       *err = true;

   267     return 0;

   268   }

   269 };

   272 /**

   273  * A character sink (see |copy_string| in nsAlgorithm.h) for converting

   274  * UTF-8 to UTF-16

   275  */

   276 class ConvertUTF8toUTF16

   277   {

   278     public:

   279       typedef char      value_type;

   280       typedef char16_t buffer_type;

   282     ConvertUTF8toUTF16( buffer_type* aBuffer )

   283         : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}

   285     size_t Length() const { return mBuffer - mStart; }

   287     bool ErrorEncountered() const { return mErrorEncountered; }

   289     void write( const value_type* start, uint32_t N )

   290       {

   291         if ( mErrorEncountered )

   292           return;

   294         // algorithm assumes utf8 units won't

   295         // be spread across fragments

   296         const value_type* p = start;

   297         const value_type* end = start + N;

   298         buffer_type* out = mBuffer;

   299         for ( ; p != end /* && *p */; )

   300           {

   301             bool err;

   302             uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);

   304             if ( err )

   305               {

   306                 mErrorEncountered = true;

   307                 mBuffer = out;

   308                 return;

   309               }

   311             if ( ucs4 >= PLANE1_BASE )

   312               {

   313                 *out++ = (buffer_type)H_SURROGATE(ucs4);

   314                 *out++ = (buffer_type)L_SURROGATE(ucs4);

   315               }

   316             else

   317               {

   318                 *out++ = ucs4;

   319               }

   320           }

   321         mBuffer = out;

   322       }

   324     void write_terminator()

   325       {

   326         *mBuffer = buffer_type(0);

   327       }

   329     private:

   330       buffer_type* const mStart;

   331       buffer_type* mBuffer;

   332       bool mErrorEncountered;

   333   };

   335 /**

   336  * A character sink (see |copy_string| in nsAlgorithm.h) for computing

   337  * the length of the UTF-16 string equivalent to a UTF-8 string.

   338  */

   339 class CalculateUTF8Length

   340   {

   341     public:

   342       typedef char value_type;

   344     CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }

   346     size_t Length() const { return mLength; }

   348     void write( const value_type* start, uint32_t N )

   349       {

   350           // ignore any further requests

   351         if ( mErrorEncountered )

   352             return;

   354         // algorithm assumes utf8 units won't

   355         // be spread across fragments

   356         const value_type* p = start;

   357         const value_type* end = start + N;

   358         for ( ; p < end /* && *p */; ++mLength )

   359           {

   360             if ( UTF8traits::isASCII(*p) )

   361                 p += 1;

   362             else if ( UTF8traits::is2byte(*p) )

   363                 p += 2;

   364             else if ( UTF8traits::is3byte(*p) )

   365                 p += 3;

   366             else if ( UTF8traits::is4byte(*p) ) {

   367                 // Because a UTF-8 sequence of 4 bytes represents a codepoint

   368                 // greater than 0xFFFF, it will become a surrogate pair in the

   369                 // UTF-16 string, so add 1 more to mLength.

   370                 // This doesn't happen with is5byte and is6byte because they

   371                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get

   372                 // converted to a single replacement character.

   374                 // However, there is one case when a 4 byte UTF-8 sequence will

   375                 // only generate 2 UTF-16 bytes. If we have a properly encoded

   376                 // sequence, but with an invalid value (too small or too big),

   377                 // that will result in a replacement character being written

   378                 // This replacement character is encoded as just 1 single

   379                 // UTF-16 character, which is 2 bytes.

   381                 // The below code therefore only adds 1 to mLength if the UTF8

   382                 // data will produce a decoded character which is greater than

   383                 // or equal to 0x010000 and less than 0x0110000.

   385                 // A 4byte UTF8 character is encoded as

   386                 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

   387                 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,

   388                 // map to bit 17-21 in the final result. If these bits are

   389                 // between 0x01 and 0x11, that means that the final result is

   390                 // between 0x010000 and 0x110000. The below code reads these

   391                 // bits out and assigns them to c, but shifted up 4 bits to

   392                 // avoid having to shift twice.

   394                 // It doesn't matter what to do in the case where p + 4 > end

   395                 // since no UTF16 characters will be written in that case by

   396                 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if

   397                 // any of the surrogate bits are wrong since no UTF16

   398                 // characters will be written in that case either.

   400                 if (p + 4 <= end) {

   401                   uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |

   402                                ((uint32_t)(p[1] & 0x30));

   403                   if (c >= 0x010 && c < 0x110)

   404                     ++mLength;

   405                 }

   407                 p += 4;

   408             }

   409             else if ( UTF8traits::is5byte(*p) )

   410                 p += 5;

   411             else if ( UTF8traits::is6byte(*p) )

   412                 p += 6;

   413             else // error

   414               {

   415                 ++mLength; // to account for the decrement below

   416                 break;

   417               }

   418           }

   419         if ( p != end )

   420           {

   421             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");

   422             --mLength; // The last multi-byte char wasn't complete, discard it.

   423             mErrorEncountered = true;

   424           }

   425       }

   427     private:

   428       size_t mLength;

   429       bool mErrorEncountered;

   430   };

   432 /**

   433  * A character sink (see |copy_string| in nsAlgorithm.h) for

   434  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD

   435  * (0xEFBFBD in UTF-8).

   436  */

   437 class ConvertUTF16toUTF8

   438   {

   439     public:

   440       typedef char16_t value_type;

   441       typedef char      buffer_type;

   443     // The error handling here is more lenient than that in

   444     // |ConvertUTF8toUTF16|, but it's that way for backwards

   445     // compatibility.

   447     ConvertUTF16toUTF8( buffer_type* aBuffer )

   448         : mStart(aBuffer), mBuffer(aBuffer) {}

   450     size_t Size() const { return mBuffer - mStart; }

   452     void write( const value_type* start, uint32_t N )

   453       {

   454         buffer_type *out = mBuffer; // gcc isn't smart enough to do this!

   456         for (const value_type *p = start, *end = start + N; p < end; ++p )

   457           {

   458             value_type c = *p;

   459             if (! (c & 0xFF80)) // U+0000 - U+007F

   460               {

   461                 *out++ = (char)c;

   462               }

   463             else if (! (c & 0xF800)) // U+0100 - U+07FF

   464               {

   465                 *out++ = 0xC0 | (char)(c >> 6);

   466                 *out++ = 0x80 | (char)(0x003F & c);

   467               }

   468             else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF

   469               {

   470                 *out++ = 0xE0 | (char)(c >> 12);

   471                 *out++ = 0x80 | (char)(0x003F & (c >> 6));

   472                 *out++ = 0x80 | (char)(0x003F & c );

   473               }

   474             else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF

   475               {

   476                 // D800- DBFF - High Surrogate

   477                 value_type h = c;

   479                 ++p;

   480                 if (p == end)

   481                   {

   482                     // Treat broken characters as the Unicode

   483                     // replacement character 0xFFFD (0xEFBFBD in

   484                     // UTF-8)

   485                     *out++ = '\xEF';

   486                     *out++ = '\xBF';

   487                     *out++ = '\xBD';

   489                     NS_WARNING("String ending in half a surrogate pair!");

   491                     break;

   492                   }

   493                 c = *p;

   495                 if (NS_IS_LOW_SURROGATE(c))

   496                   {

   497                     // DC00- DFFF - Low Surrogate

   498                     // N = (H - D800) *400 + 10000 + ( L - DC00 )

   499                     uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);

   501                     // 0001 0000-001F FFFF

   502                     *out++ = 0xF0 | (char)(ucs4 >> 18);

   503                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));

   504                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));

   505                     *out++ = 0x80 | (char)(0x003F & ucs4);

   506                   }

   507                 else

   508                   {

   509                     // Treat broken characters as the Unicode

   510                     // replacement character 0xFFFD (0xEFBFBD in

   511                     // UTF-8)

   512                     *out++ = '\xEF';

   513                     *out++ = '\xBF';

   514                     *out++ = '\xBD';

   516                     // The pointer to the next character points to the second

   517                     // 16-bit value, not beyond it, as per Unicode 5.0.0

   518                     // Chapter 3 C10, only the first code unit of an illegal

   519                     // sequence must be treated as an illegally terminated

   520                     // code unit sequence (also Chapter 3 D91, "isolated [not

   521                     // paired and ill-formed] UTF-16 code units in the range

   522                     // D800..DFFF are ill-formed").

   523                     p--;

   525                     NS_WARNING("got a High Surrogate but no low surrogate");

   526                   }

   527               }

   528             else // U+DC00 - U+DFFF

   529               {

   530                 // Treat broken characters as the Unicode replacement

   531                 // character 0xFFFD (0xEFBFBD in UTF-8)

   532                 *out++ = '\xEF';

   533                 *out++ = '\xBF';

   534                 *out++ = '\xBD';

   536                 // DC00- DFFF - Low Surrogate

   537                 NS_WARNING("got a low Surrogate but no high surrogate");

   538               }

   539           }

   541         mBuffer = out;

   542       }

   544     void write_terminator()

   545       {

   546         *mBuffer = buffer_type(0);

   547       }

   549     private:

   550       buffer_type* const mStart;

   551       buffer_type* mBuffer;

   552   };

   554 /**

   555  * A character sink (see |copy_string| in nsAlgorithm.h) for computing

   556  * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid

   557  * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).

   558  */

   559 class CalculateUTF8Size

   560   {

   561     public:

   562       typedef char16_t value_type;

   564     CalculateUTF8Size()

   565       : mSize(0) { }

   567     size_t Size() const { return mSize; }

   569     void write( const value_type* start, uint32_t N )

   570       {

   571         // Assume UCS2 surrogate pairs won't be spread across fragments.

   572         for (const value_type *p = start, *end = start + N; p < end; ++p )

   573           {

   574             value_type c = *p;

   575             if (! (c & 0xFF80)) // U+0000 - U+007F

   576               mSize += 1;

   577             else if (! (c & 0xF800)) // U+0100 - U+07FF

   578               mSize += 2;

   579             else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF

   580               mSize += 3;

   581             else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF

   582               {

   583                 ++p;

   584                 if (p == end)

   585                   {

   586                     // Treat broken characters as the Unicode

   587                     // replacement character 0xFFFD (0xEFBFBD in

   588                     // UTF-8)

   589                     mSize += 3;

   591                     NS_WARNING("String ending in half a surrogate pair!");

   593                     break;

   594                   }

   595                 c = *p;

   597                 if (0xDC00 == (0xFC00 & c))

   598                   mSize += 4;

   599                 else

   600                   {

   601                     // Treat broken characters as the Unicode

   602                     // replacement character 0xFFFD (0xEFBFBD in

   603                     // UTF-8)

   604                     mSize += 3;

   606                     // The next code unit is the second 16-bit value, not

   607                     // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,

   608                     // only the first code unit of an illegal sequence must

   609                     // be treated as an illegally terminated code unit

   610                     // sequence (also Chapter 3 D91, "isolated [not paired and

   611                     // ill-formed] UTF-16 code units in the range D800..DFFF

   612                     // are ill-formed").

   613                     p--;

   615                     NS_WARNING("got a high Surrogate but no low surrogate");

   616                   }

   617               }

   618             else // U+DC00 - U+DFFF

   619               {

   620                 // Treat broken characters as the Unicode replacement

   621                 // character 0xFFFD (0xEFBFBD in UTF-8)

   622                 mSize += 3;

   624                 NS_WARNING("got a low Surrogate but no high surrogate");

   625               }

   626           }

   627       }

   629     private:

   630       size_t mSize;

   631   };

   633 #ifdef MOZILLA_INTERNAL_API

   634 /**

   635  * A character sink that performs a |reinterpret_cast|-style conversion

   636  * from char to char16_t.

   637  */

   638 class LossyConvertEncoding8to16

   639   {

   640     public:

   641       typedef char      value_type;

   642       typedef char      input_type;

   643       typedef char16_t output_type;

   645     public:

   646       LossyConvertEncoding8to16( char16_t* aDestination ) :

   647         mDestination(aDestination) { }

   649       void

   650       write( const char* aSource, uint32_t aSourceLength )

   651         {

   652 #ifdef MOZILLA_MAY_SUPPORT_SSE2

   653           if (mozilla::supports_sse2())

   654             {

   655               write_sse2(aSource, aSourceLength);

   656               return;

   657             }

   658 #endif

   659           const char* done_writing = aSource + aSourceLength;

   660           while ( aSource < done_writing )

   661             *mDestination++ = (char16_t)(unsigned char)(*aSource++);

   662         }

   664       void

   665       write_sse2( const char* aSource, uint32_t aSourceLength );

   667       void

   668       write_terminator()

   669         {

   670           *mDestination = (char16_t)(0);

   671         }

   673     private:

   674       char16_t* mDestination;

   675   };

   677 /**

   678  * A character sink that performs a |reinterpret_cast|-style conversion

   679  * from char16_t to char.

   680  */

   681 class LossyConvertEncoding16to8

   682   {

   683     public:

   684       typedef char16_t value_type;

   685       typedef char16_t input_type;

   686       typedef char      output_type;

   688       LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

   690       void

   691       write( const char16_t* aSource, uint32_t aSourceLength)

   692         {

   693 #ifdef MOZILLA_MAY_SUPPORT_SSE2

   694           if (mozilla::supports_sse2())

   695             {

   696               write_sse2(aSource, aSourceLength);

   697               return;

   698             }

   699 #endif

   700             const char16_t* done_writing = aSource + aSourceLength;

   701             while ( aSource < done_writing )

   702               *mDestination++ = (char)(*aSource++);

   703         }

   705 #ifdef MOZILLA_MAY_SUPPORT_SSE2

   706       void

   707       write_sse2( const char16_t* aSource, uint32_t aSourceLength );

   708 #endif

   710       void

   711       write_terminator()

   712         {

   713           *mDestination = '\0';

   714         }

   716     private:

   717       char *mDestination;

   718   };

   719 #endif // MOZILLA_INTERNAL_API

   721 #endif /* !defined(nsUTF8Utils_h_) */

The Tor Browser / file revision

xpcom/string/public/nsUTF8Utils.h@129ffea94266

xpcom/string/public/nsUTF8Utils.h