The Tor Browser: xpcom/string/public/nsUTF8Utils.h@6474c204b198 (annotated)

xpcom/string/public/nsUTF8Utils.h@6474c204b198 (annotated)

xpcom/string/public/nsUTF8Utils.h

Wed, 31 Dec 2014 06:09:35 +0100

author: Michael Schloh von Bennewitz <michael@schloh.com>
date: Wed, 31 Dec 2014 06:09:35 +0100
changeset 0: 6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 /* This Source Code Form is subject to the terms of the Mozilla Public
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 #ifndef nsUTF8Utils_h_
 #define nsUTF8Utils_h_
 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
 // file will provide signatures for the Mozilla abstract string types. It will
 // use XPCOM assertion/debugging macros, etc.
 #include "nscore.h"
 #include "mozilla/SSE.h"
 #include "nsCharTraits.h"
 /**
  * Predicates that classify a single byte of a UTF-8 stream by looking at
  * its high-order bits only.
  */
 class UTF8traits
 {
 public:
   // 0xxxxxxx: a complete one-byte character.
   static bool isASCII(char c) { return topBitsAre(c, 0x80, 0x00); }
   // 10xxxxxx: a trailing byte of a multi-byte sequence.
   static bool isInSeq(char c) { return topBitsAre(c, 0xC0, 0x80); }
   // 110xxxxx: lead byte of a two-byte sequence.
   static bool is2byte(char c) { return topBitsAre(c, 0xE0, 0xC0); }
   // 1110xxxx: lead byte of a three-byte sequence.
   static bool is3byte(char c) { return topBitsAre(c, 0xF0, 0xE0); }
   // 11110xxx: lead byte of a four-byte sequence.
   static bool is4byte(char c) { return topBitsAre(c, 0xF8, 0xF0); }
   // 111110xx: lead byte of a five-byte sequence.
   static bool is5byte(char c) { return topBitsAre(c, 0xFC, 0xF8); }
   // 1111110x: lead byte of a six-byte sequence.
   static bool is6byte(char c) { return topBitsAre(c, 0xFE, 0xFC); }

 private:
   // True when the bits of |c| selected by |mask| are exactly |pattern|.
   static bool topBitsAre(char c, int mask, int pattern)
   {
     return (c & mask) == pattern;
   }
 };
 /**
  * Extract the next UCS-4 character from the buffer and return it.  The
  * pointer passed in is advanced to the start of the next character in the
  * buffer.  The err parameter must not be null: it is set to true if an
  * error occurred (in which case 0 is returned and the pointer is not
  * advanced) and to false otherwise.  Overlong sequences, surrogates and
  * code points at or above UCS_END are not reported as errors; they are
  * returned as UCS2_REPLACEMENT_CHAR.
  */
 class UTF8CharEnumerator
 {
 public:
   /**
    * Decode the UTF-8 sequence that starts at |*buffer|.
    *
    * @param buffer in/out cursor; moved past the decoded sequence on
    *               success, left where it was when |*err| is set.
    * @param end    one past the last readable byte.
    * @param err    must not be null.  Set to true when there is no input,
    *               the lead byte is not a UTF-8 lead byte, the sequence is
    *               cut off by |end|, or a trailing byte is not a
    *               continuation byte; set to false otherwise.
    * @return the decoded code point; UCS2_REPLACEMENT_CHAR for overlong
    *         encodings, surrogates and values at or above UCS_END; 0 when
    *         |*err| is true.
    */
   static uint32_t NextChar(const char **buffer, const char *end,
                            bool *err)
   {
     NS_ASSERTION(buffer && *buffer, "null buffer!");

     const char *cursor = *buffer;
     *err = false;

     // Nothing left to read.
     if (cursor >= end)
       {
         *err = true;
         return 0;
       }

     const char lead = *cursor++;

     // Fast path: a single ASCII byte is its own code point.
     if ( UTF8traits::isASCII(lead) )
       {
         *buffer = cursor;
         return lead;
       }

     uint32_t codePoint;
     uint32_t smallestLegal;
     int32_t pending = 0;
     if (!CalcState(lead, codePoint, smallestLegal, pending)) {
         NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
         *err = true;
         return 0;
     }

     // Fold in the trailing bytes.  |remaining| is the number of bytes that
     // still follow the one being consumed, which is also the number of
     // 6-bit groups its payload has to be shifted by.
     for (int32_t remaining = pending - 1; remaining >= 0; --remaining)
       {
         if (cursor == end)
           {
             *err = true;
             return 0;
           }

         if (!AddByte(*cursor++, remaining, codePoint))
           {
             *err = true;
             return 0;
           }
       }

     // Overlong encodings, surrogates and code points outside the Unicode
     // range are all mapped to the replacement character.
     const bool overlong = codePoint < smallestLegal;
     if ( overlong ||
          ( codePoint >= 0xD800 &&
            (codePoint <= 0xDFFF || codePoint >= UCS_END) ) )
       {
         codePoint = UCS2_REPLACEMENT_CHAR;
       }

     *buffer = cursor;
     return codePoint;
   }

 private:
   /**
    * Start decoding a multi-byte sequence from its lead byte |c|.
    *
    * On success stores the lead byte's payload bits (already shifted into
    * position) in |ucs4|, the smallest value that legitimately needs this
    * sequence length in |minUcs4| and the number of trailing bytes in
    * |state|, then returns true.  Returns false without touching the
    * outputs when |c| is not a 2- to 6-byte lead byte.
    */
   static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
                           int32_t& state)
   {
     if ( UTF8traits::is2byte(c) )
       {
         ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
         minUcs4 = 0x00000080;
         state = 1;
         return true;
       }

     if ( UTF8traits::is3byte(c) )
       {
         ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
         minUcs4 = 0x00000800;
         state = 2;
         return true;
       }

     if ( UTF8traits::is4byte(c) )
       {
         ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
         minUcs4 = 0x00010000;
         state = 3;
         return true;
       }

     if ( UTF8traits::is5byte(c) )
       {
         ucs4 = (uint32_t(c) << 24) & 0x03000000L;
         minUcs4 = 0x00200000;
         state = 4;
         return true;
       }

     if ( UTF8traits::is6byte(c) )
       {
         ucs4 = (uint32_t(c) << 30) & 0x40000000L;
         minUcs4 = 0x04000000;
         state = 5;
         return true;
       }

     return false;
   }

   /**
    * Merge the six payload bits of trailing byte |c| into |ucs4|, shifted
    * left by 6 * |state| bits.  Returns false, leaving |ucs4| untouched,
    * when |c| is not a continuation byte.
    */
   static bool AddByte(char c, int32_t state, uint32_t& ucs4)
   {
     if ( !UTF8traits::isInSeq(c) )
       return false;

     ucs4 |= (uint32_t(c) & 0x3F) << (state * 6);
     return true;
   }
 };
 /**
  * Extract the next UCS-4 character from the buffer and return it.  The
  * pointer passed in is advanced to the start of the next character in the
  * buffer.  If non-null, the err parameter is filled in if an error occurs.
  */
 class UTF16CharEnumerator
 {
 public:
   /**
    * Decode one code point starting at |*buffer|.
    *
    * @param buffer in/out cursor.  It is moved past whatever was consumed,
    *               except when there is no input at all, in which case it
    *               is left untouched.
    * @param end    one past the last readable code unit.
    * @param err    optional; when non-null it receives true if the input
    *               was empty or contained an unpaired surrogate, and false
    *               otherwise.
    * @return the decoded code point, 0xFFFD for an unpaired surrogate, or
    *         0 when there was no input.
    */
   static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
                            bool *err = nullptr)
   {
     NS_ASSERTION(buffer && *buffer, "null buffer!");

     const char16_t *cursor = *buffer;

     if (cursor >= end)
       {
         NS_ERROR("No input to work with");
         SetErr(err, true);
         return 0;
       }

     const char16_t first = *cursor++;

     // U+0000 - U+D7FF, U+E000 - U+FFFF: the code unit is the code point.
     if (!IS_SURROGATE(first))
       {
         SetErr(err, false);
         *buffer = cursor;
         return first;
       }

     // U+DC00 - U+DFFF: a low surrogate with no high surrogate in front of
     // it.  Consume it and hand back the replacement character.
     if (!NS_IS_HIGH_SURROGATE(first))
       {
         NS_WARNING("got a low Surrogate but no high surrogate");
         SetErr(err, true);
         *buffer = cursor;
         return 0xFFFD;
       }

     // From here on |first| is a high surrogate (U+D800 - U+DBFF).

     if (cursor == end)
       {
         // The buffer ends right after the high surrogate.
         NS_WARNING("Unexpected end of buffer after high surrogate");
         SetErr(err, true);
         *buffer = cursor;
         return 0xFFFD;
       }

     const char16_t second = *cursor;

     if (!NS_IS_LOW_SURROGATE(second))
       {
         // Only the high surrogate is consumed; the cursor is left on the
         // following code unit so that it is decoded on its own by the
         // next call (Unicode 5.0.0 Chapter 3 C10 and D91: only the first
         // code unit of an ill-formed sequence is treated as illegally
         // terminated).
         NS_WARNING("got a High Surrogate but no low surrogate");
         SetErr(err, true);
         *buffer = cursor;
         return 0xFFFD;
       }

     // A well-formed pair:
     // N = (H - D800) * 400 + 10000 + (L - DC00)   (hexadecimal)
     SetErr(err, false);
     *buffer = cursor + 1;
     return SURROGATE_TO_UCS4(first, second);
   }

 private:
   // Store |value| through |err| when the caller supplied a flag.
   static void SetErr(bool *err, bool value)
   {
     if (err)
       *err = value;
   }
 };
 /**
  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
  * UTF-8 to UTF-16
  */
 class ConvertUTF8toUTF16
 {
 public:
   typedef char      value_type;
   typedef char16_t buffer_type;

   // |aBuffer| is the destination the UTF-16 code units are written to.
   // Nothing in this class checks its capacity.
   ConvertUTF8toUTF16( buffer_type* aBuffer )
     : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}

   // Number of UTF-16 code units written so far.
   size_t Length() const { return mBuffer - mStart; }

   // True once a write() call has hit undecodable input.
   bool ErrorEncountered() const { return mErrorEncountered; }

   /**
    * Decode the |N| bytes at |start| and append the result to the output.
    *
    * Once an error has been seen every further call is a no-op.  When the
    * decoder reports an error in the middle of a fragment, the characters
    * decoded before it are kept and the rest of the fragment is dropped.
    * The algorithm assumes a UTF-8 sequence is never split across two
    * fragments.
    */
   void write( const value_type* start, uint32_t N )
   {
     if ( mErrorEncountered )
       return;

     const value_type* cursor = start;
     const value_type* const end = start + N;
     buffer_type* dest = mBuffer;

     while ( cursor != end )
       {
         bool failed;
         const uint32_t codePoint =
           UTF8CharEnumerator::NextChar(&cursor, end, &failed);

         if ( failed )
           {
             mErrorEncountered = true;
             break;
           }

         if ( codePoint < PLANE1_BASE )
           {
             // Fits in a single UTF-16 code unit.
             *dest++ = codePoint;
             continue;
           }

         // Needs a surrogate pair.
         *dest++ = (buffer_type)H_SURROGATE(codePoint);
         *dest++ = (buffer_type)L_SURROGATE(codePoint);
       }

     mBuffer = dest;
   }

   // Store a terminating 0 at the current output position without
   // advancing it, so Length() is unaffected.
   void write_terminator()
   {
     *mBuffer = buffer_type(0);
   }

 private:
   buffer_type* const mStart;   // first output position
   buffer_type* mBuffer;        // next output position
   bool mErrorEncountered;
 };
 /**
  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
  * the length of the UTF-16 string equivalent to a UTF-8 string.
  */
 class CalculateUTF8Length
 {
 public:
   typedef char value_type;

   CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }

   // Number of UTF-16 code units counted so far.
   size_t Length() const { return mLength; }

   /**
    * Count the UTF-16 code units that the |N| bytes at |start| decode to.
    *
    * Only lead bytes are examined (plus the second byte of a four-byte
    * sequence); trailing bytes are not validated.  Counting stops and an
    * error is latched at the first byte that is not a valid lead byte, or
    * at a sequence that does not fit in the fragment; the offending
    * sequence is not counted and all later calls are ignored.  The
    * algorithm assumes a UTF-8 sequence is never split across two
    * fragments.
    */
   void write( const value_type* start, uint32_t N )
   {
     // ignore any further requests
     if ( mErrorEncountered )
       return;

     const value_type* p = start;
     const value_type* const end = start + N;

     while ( p < end )
       {
         const size_t seqLen = SequenceLength(*p);

         // Compare against the bytes actually left instead of advancing
         // first and checking afterwards: forming a pointer more than one
         // past the end of the fragment is undefined behaviour.
         const size_t available = static_cast<size_t>(end - p);

         if ( seqLen == 0 || seqLen > available )
           {
             // Either not a lead byte at all, or the last multi-byte
             // character is incomplete.  In both cases it is discarded.
             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
             mErrorEncountered = true;
             return;
           }

         ++mLength;

         if ( seqLen == 4 )
           {
             // A four-byte sequence
             //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             // normally decodes to a code point above 0xFFFF, which takes
             // a surrogate pair, i.e. one more UTF-16 code unit.  If the
             // decoded value is too small or too big, a single replacement
             // character is written instead, so the extra unit is counted
             // only for values in [0x010000, 0x110000).
             //
             // The low three bits of the first byte and bits 4-5 of the
             // second byte hold bits 16-20 of the code point.  They are
             // collected here still shifted up by four bits, so the range
             // check is against 0x010 and 0x110.
             //
             // Five- and six-byte sequences never need this: they are
             // always above 0x10FFFF and become one replacement character.
             const uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
                                ((uint32_t)(p[1] & 0x30));
             if ( c >= 0x010 && c < 0x110 )
               ++mLength;
           }

         p += seqLen;
       }
   }

 private:
   // Length in bytes of the sequence introduced by |lead|, or 0 when
   // |lead| cannot start a sequence.
   static size_t SequenceLength(value_type lead)
   {
     if ( UTF8traits::isASCII(lead) )
       return 1;
     if ( UTF8traits::is2byte(lead) )
       return 2;
     if ( UTF8traits::is3byte(lead) )
       return 3;
     if ( UTF8traits::is4byte(lead) )
       return 4;
     if ( UTF8traits::is5byte(lead) )
       return 5;
     if ( UTF8traits::is6byte(lead) )
       return 6;
     return 0;
   }

   size_t mLength;
   bool mErrorEncountered;
 };
 /**
  * A character sink (see |copy_string| in nsAlgorithm.h) for
  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
  * (0xEFBFBD in UTF-8).
  */
 class ConvertUTF16toUTF8
 {
 public:
   typedef char16_t value_type;
   typedef char      buffer_type;

   // The error handling here is more lenient than that in
   // |ConvertUTF8toUTF16|, but it's that way for backwards
   // compatibility.

   // |aBuffer| is the destination the UTF-8 bytes are written to.  Nothing
   // in this class checks its capacity.
   ConvertUTF16toUTF8( buffer_type* aBuffer )
     : mStart(aBuffer), mBuffer(aBuffer) {}

   // Number of bytes written so far.
   size_t Size() const { return mBuffer - mStart; }

   /**
    * Encode the |N| UTF-16 code units at |start| as UTF-8 and append them
    * to the output.  Every unpaired surrogate is written as the
    * replacement character U+FFFD (EF BF BD).
    */
   void write( const value_type* start, uint32_t N )
   {
     // Work on a local copy of the output position and store it back once
     // at the end.
     buffer_type *dest = mBuffer;
     const value_type *cursor = start;
     const value_type *const end = start + N;

     while ( cursor < end )
       {
         const value_type unit = *cursor++;

         if (! (unit & 0xFF80)) // U+0000 - U+007F
           {
             *dest++ = (char)unit;
             continue;
           }

         if (! (unit & 0xF800)) // U+0080 - U+07FF
           {
             *dest++ = 0xC0 | (char)(unit >> 6);
             *dest++ = 0x80 | (char)(0x003F & unit);
             continue;
           }

         if (!IS_SURROGATE(unit)) // U+0800 - U+D7FF,U+E000 - U+FFFF
           {
             *dest++ = 0xE0 | (char)(unit >> 12);
             *dest++ = 0x80 | (char)(0x003F & (unit >> 6));
             *dest++ = 0x80 | (char)(0x003F & unit );
             continue;
           }

         if (!NS_IS_HIGH_SURROGATE(unit)) // U+DC00 - U+DFFF
           {
             // A low surrogate with nothing in front of it.
             dest = AppendReplacement(dest);
             NS_WARNING("got a low Surrogate but no high surrogate");
             continue;
           }

         // |unit| is a high surrogate (U+D800 - U+DBFF).

         if (cursor == end)
           {
             // The fragment stops in the middle of a pair.
             dest = AppendReplacement(dest);
             NS_WARNING("String ending in half a surrogate pair!");
             break;
           }

         const value_type next = *cursor;

         if (!NS_IS_LOW_SURROGATE(next))
           {
             // Only the high surrogate is replaced.  |cursor| is left on
             // |next| so that it is encoded on its own in the next
             // iteration (Unicode 5.0.0 Chapter 3 C10 and D91: only the
             // first code unit of an ill-formed sequence is treated as
             // illegally terminated).
             dest = AppendReplacement(dest);
             NS_WARNING("got a High Surrogate but no low surrogate");
             continue;
           }

         // A well-formed pair; consume the low surrogate as well.
         // N = (H - D800) * 400 + 10000 + (L - DC00)   (hexadecimal)
         ++cursor;
         const uint32_t ucs4 = SURROGATE_TO_UCS4(unit, next);

         // 0001 0000-001F FFFF
         *dest++ = 0xF0 | (char)(ucs4 >> 18);
         *dest++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
         *dest++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
         *dest++ = 0x80 | (char)(0x003F & ucs4);
       }

     mBuffer = dest;
   }

   // Store a terminating 0 at the current output position without
   // advancing it, so Size() is unaffected.
   void write_terminator()
   {
     *mBuffer = buffer_type(0);
   }

 private:
   // Write U+FFFD as UTF-8 at |out| and return the position after it.
   static buffer_type* AppendReplacement(buffer_type* out)
   {
     *out++ = '\xEF';
     *out++ = '\xBF';
     *out++ = '\xBD';
     return out;
   }

   buffer_type* const mStart;   // first output position
   buffer_type* mBuffer;        // next output position
 };
 /**
  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
  * the number of bytes a UTF-16 string would occupy in UTF-8. Treats
  * invalid UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
  */
 class CalculateUTF8Size
 {
 public:
   typedef char16_t value_type;

   CalculateUTF8Size()
     : mSize(0) { }

   // Number of UTF-8 bytes counted so far.
   size_t Size() const { return mSize; }

   /**
    * Add the number of UTF-8 bytes needed for the |N| UTF-16 code units at
    * |start|.  Every unpaired surrogate is counted as the three bytes of
    * U+FFFD.  Assumes surrogate pairs are never split across fragments.
    */
   void write( const value_type* start, uint32_t N )
   {
     const value_type *cursor = start;
     const value_type *const end = start + N;

     while ( cursor < end )
       {
         const value_type unit = *cursor++;

         if (! (unit & 0xFF80)) // U+0000 - U+007F
           {
             mSize += 1;
             continue;
           }

         if (! (unit & 0xF800)) // U+0080 - U+07FF
           {
             mSize += 2;
             continue;
           }

         if (0xD800 != (0xF800 & unit)) // U+0800 - U+D7FF,U+E000 - U+FFFF
           {
             mSize += 3;
             continue;
           }

         if (0xD800 != (0xFC00 & unit)) // U+DC00 - U+DFFF
           {
             // A low surrogate with nothing in front of it: counted as
             // the replacement character.
             mSize += 3;
             NS_WARNING("got a low Surrogate but no high surrogate");
             continue;
           }

         // |unit| is a high surrogate (U+D800 - U+DBFF).

         if (cursor == end)
           {
             // The fragment stops in the middle of a pair: counted as the
             // replacement character.
             mSize += 3;
             NS_WARNING("String ending in half a surrogate pair!");
             break;
           }

         if (0xDC00 == (0xFC00 & *cursor))
           {
             // A well-formed pair takes four bytes; consume the low
             // surrogate too.
             mSize += 4;
             ++cursor;
             continue;
           }

         // Only the high surrogate is counted as a replacement character.
         // |cursor| stays on the following code unit so that it is sized
         // on its own in the next iteration (Unicode 5.0.0 Chapter 3 C10
         // and D91: only the first code unit of an ill-formed sequence is
         // treated as illegally terminated).
         mSize += 3;
         NS_WARNING("got a high Surrogate but no low surrogate");
       }
   }

 private:
   size_t mSize;
 };
 #ifdef MOZILLA_INTERNAL_API
 /**
  * A character sink that performs a |reinterpret_cast|-style conversion
  * from char to char16_t.
  */
 class LossyConvertEncoding8to16
 {
 public:
   typedef char      value_type;
   typedef char      input_type;
   typedef char16_t output_type;

   // |aDestination| is where the widened code units are written.  Nothing
   // in this class checks its capacity.
   LossyConvertEncoding8to16( char16_t* aDestination )
     : mDestination(aDestination) { }

   /**
    * Widen |aSourceLength| bytes from |aSource|, each taken as an unsigned
    * value, to one char16_t apiece and append them to the destination.
    * When SSE2 support is compiled in and available at run time the work
    * is handed to write_sse2() instead.
    */
   void write( const char* aSource, uint32_t aSourceLength )
   {
 #ifdef MOZILLA_MAY_SUPPORT_SSE2
     if (mozilla::supports_sse2())
       {
         write_sse2(aSource, aSourceLength);
         return;
       }
 #endif
     for ( uint32_t i = 0; i < aSourceLength; ++i )
       {
         // Go through unsigned char so bytes >= 0x80 are not sign-extended.
         *mDestination++ = (char16_t)(unsigned char)aSource[i];
       }
   }

   // Declared here; the definition is not part of this header.
   void write_sse2( const char* aSource, uint32_t aSourceLength );

   // Store a terminating 0 at the current output position without
   // advancing it.
   void write_terminator()
   {
     *mDestination = (char16_t)(0);
   }

 private:
   char16_t* mDestination;   // next output position
 };
 /**
  * A character sink that performs a |reinterpret_cast|-style conversion
  * from char16_t to char.
  */
 class LossyConvertEncoding16to8
 {
 public:
   typedef char16_t value_type;
   typedef char16_t input_type;
   typedef char      output_type;

   // |aDestination| is where the narrowed bytes are written.  Nothing in
   // this class checks its capacity.
   LossyConvertEncoding16to8( char* aDestination )
     : mDestination(aDestination) { }

   /**
    * Narrow |aSourceLength| code units from |aSource| to one char apiece
    * with a plain cast and append them to the destination.  When SSE2
    * support is compiled in and available at run time the work is handed
    * to write_sse2() instead.
    */
   void write( const char16_t* aSource, uint32_t aSourceLength)
   {
 #ifdef MOZILLA_MAY_SUPPORT_SSE2
     if (mozilla::supports_sse2())
       {
         write_sse2(aSource, aSourceLength);
         return;
       }
 #endif
     for ( uint32_t i = 0; i < aSourceLength; ++i )
       *mDestination++ = (char)aSource[i];
   }

 #ifdef MOZILLA_MAY_SUPPORT_SSE2
   // Declared here; the definition is not part of this header.
   void write_sse2( const char16_t* aSource, uint32_t aSourceLength );
 #endif

   // Store a terminating '\0' at the current output position without
   // advancing it.
   void write_terminator()
   {
     *mDestination = '\0';
   }

 private:
   char *mDestination;   // next output position
 };
 #endif // MOZILLA_INTERNAL_API
 #endif /* !defined(nsUTF8Utils_h_) */

The Tor Browser / annotate

xpcom/string/public/nsUTF8Utils.h@6474c204b198 (annotated)

xpcom/string/public/nsUTF8Utils.h