michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0:  * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0:  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0: #ifndef nsUTF8Utils_h_
michael@0: #define nsUTF8Utils_h_
michael@0: 
michael@0: // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
michael@0: // file will provide signatures for the Mozilla abstract string types. It will
michael@0: // use XPCOM assertion/debugging macros, etc.
michael@0: 
michael@0: #include "nscore.h"
michael@0: #include "mozilla/SSE.h"
michael@0: 
michael@0: #include "nsCharTraits.h"
michael@0: 
/**
 * Predicates that classify a single UTF-8 code unit by its high-order bits.
 */
class UTF8traits
  {
    public:
      // 0xxxxxxx: a complete one-byte (ASCII) character.
      static bool isASCII(char c) { return matches(c, 0x80, 0x00); }
      // 10xxxxxx: a continuation byte inside a multi-byte sequence.
      static bool isInSeq(char c) { return matches(c, 0xC0, 0x80); }
      // 110xxxxx: lead byte of a 2-byte sequence.
      static bool is2byte(char c) { return matches(c, 0xE0, 0xC0); }
      // 1110xxxx: lead byte of a 3-byte sequence.
      static bool is3byte(char c) { return matches(c, 0xF0, 0xE0); }
      // 11110xxx: lead byte of a 4-byte sequence.
      static bool is4byte(char c) { return matches(c, 0xF8, 0xF0); }
      // 111110xx: lead byte of a 5-byte sequence.
      static bool is5byte(char c) { return matches(c, 0xFC, 0xF8); }
      // 1111110x: lead byte of a 6-byte sequence.
      static bool is6byte(char c) { return matches(c, 0xFE, 0xFC); }

    private:
      // True when the bits of |c| selected by |mask| are exactly |pattern|.
      static bool matches(char c, unsigned char mask, unsigned char pattern)
        {
          return (static_cast<unsigned char>(c) & mask) == pattern;
        }
  };
michael@0: 
michael@0: /**
michael@0:  * Extract the next UCS-4 character from the buffer and return it.  The
michael@0:  * pointer passed in is advanced to the start of the next character in the
michael@0:  * buffer.  If non-null, the parameters err and overlong are filled in to
michael@0:  * indicate that the character was represented by an overlong sequence, or
michael@0:  * that an error occurred.
michael@0:  */
michael@0: 
michael@0: class UTF8CharEnumerator
michael@0: {
michael@0: public:
michael@0:   static uint32_t NextChar(const char **buffer, const char *end,
michael@0:                            bool *err)
michael@0:   {
michael@0:     NS_ASSERTION(buffer && *buffer, "null buffer!");
michael@0: 
michael@0:     const char *p = *buffer;
michael@0:     *err = false;
michael@0: 
michael@0:     if (p >= end)
michael@0:       {
michael@0:         *err = true;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     char c = *p++;
michael@0: 
michael@0:     if ( UTF8traits::isASCII(c) )
michael@0:       {
michael@0:         *buffer = p;
michael@0:         return c;
michael@0:       }
michael@0: 
michael@0:     uint32_t ucs4;
michael@0:     uint32_t minUcs4;
michael@0:     int32_t state = 0;
michael@0: 
michael@0:     if (!CalcState(c, ucs4, minUcs4, state)) {
michael@0:         NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
michael@0:         *err = true;
michael@0: 
michael@0:         return 0;
michael@0:     }
michael@0: 
michael@0:     while ( state-- )
michael@0:       {
michael@0:         if (p == end)
michael@0:           {
michael@0:             *err = true;
michael@0: 
michael@0:             return 0;
michael@0:           }
michael@0: 
michael@0:         c = *p++;
michael@0: 
michael@0:         if (!AddByte(c, state, ucs4))
michael@0:           {
michael@0:             *err = true;
michael@0: 
michael@0:             return 0;
michael@0:           }
michael@0:       }
michael@0: 
michael@0:       if ( ucs4 < minUcs4 )
michael@0:         {
michael@0:           // Overlong sequence
michael@0:           ucs4 = UCS2_REPLACEMENT_CHAR;
michael@0:         }
michael@0:       else if ( ucs4 >= 0xD800 &&
michael@0:                 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
michael@0:         {
michael@0:           // Surrogates and code points outside the Unicode range.
michael@0:           ucs4 = UCS2_REPLACEMENT_CHAR;
michael@0:         }
michael@0: 
michael@0:     *buffer = p;
michael@0:     return ucs4;
michael@0:   }
michael@0: 
michael@0: private:
michael@0:   static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
michael@0:                           int32_t& state)
michael@0:   {
michael@0:     if ( UTF8traits::is2byte(c) )
michael@0:       {
michael@0:         ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
michael@0:         state = 1;
michael@0:         minUcs4 = 0x00000080;
michael@0:       }
michael@0:     else if ( UTF8traits::is3byte(c) )
michael@0:       {
michael@0:         ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
michael@0:         state = 2;
michael@0:         minUcs4 = 0x00000800;
michael@0:       }
michael@0:     else if ( UTF8traits::is4byte(c) )
michael@0:       {
michael@0:         ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
michael@0:         state = 3;
michael@0:         minUcs4 = 0x00010000;
michael@0:       }
michael@0:     else if ( UTF8traits::is5byte(c) )
michael@0:       {
michael@0:         ucs4 = (uint32_t(c) << 24) & 0x03000000L;
michael@0:         state = 4;
michael@0:         minUcs4 = 0x00200000;
michael@0:       }
michael@0:     else if ( UTF8traits::is6byte(c) )
michael@0:       {
michael@0:         ucs4 = (uint32_t(c) << 30) & 0x40000000L;
michael@0:         state = 5;
michael@0:         minUcs4 = 0x04000000;
michael@0:       }
michael@0:     else
michael@0:       {
michael@0:         return false;
michael@0:       }
michael@0: 
michael@0:     return true;
michael@0:   }
michael@0: 
michael@0:   static bool AddByte(char c, int32_t state, uint32_t& ucs4)
michael@0:   {
michael@0:     if ( UTF8traits::isInSeq(c) )
michael@0:       {
michael@0:         int32_t shift = state * 6;
michael@0:         ucs4 |= (uint32_t(c) & 0x3F) << shift;
michael@0:         return true;
michael@0:       }
michael@0: 
michael@0:     return false;
michael@0:   }
michael@0: };
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * Extract the next UCS-4 character from the buffer and return it.  The
michael@0:  * pointer passed in is advanced to the start of the next character in the
michael@0:  * buffer.  If non-null, the err parameter is filled in if an error occurs.
michael@0:  */
michael@0: 
michael@0: 
michael@0: class UTF16CharEnumerator
michael@0: {
michael@0: public:
michael@0:   static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
michael@0:                            bool *err = nullptr)
michael@0:   {
michael@0:     NS_ASSERTION(buffer && *buffer, "null buffer!");
michael@0: 
michael@0:     const char16_t *p = *buffer;
michael@0: 
michael@0:     if (p >= end)
michael@0:       {
michael@0:         NS_ERROR("No input to work with");
michael@0:         if (err)
michael@0:           *err = true;
michael@0: 
michael@0:         return 0;
michael@0:       }
michael@0: 
michael@0:     char16_t c = *p++;
michael@0: 
michael@0:     if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
michael@0:       {
michael@0:         if (err)
michael@0:           *err = false;
michael@0:         *buffer = p;
michael@0:         return c;
michael@0:       }
michael@0:     else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
michael@0:       {
michael@0:         if (p == end)
michael@0:           {
michael@0:             // Found a high surrogate the end of the buffer. Flag this
michael@0:             // as an error and return the Unicode replacement
michael@0:             // character 0xFFFD.
michael@0: 
michael@0:             NS_WARNING("Unexpected end of buffer after high surrogate");
michael@0: 
michael@0:             if (err)
michael@0:               *err = true;
michael@0:             *buffer = p;
michael@0:             return 0xFFFD;
michael@0:           }
michael@0: 
michael@0:         // D800- DBFF - High Surrogate
michael@0:         char16_t h = c;
michael@0: 
michael@0:         c = *p++;
michael@0: 
michael@0:         if (NS_IS_LOW_SURROGATE(c))
michael@0:           {
michael@0:             // DC00- DFFF - Low Surrogate
michael@0:             // N = (H - D800) *400 + 10000 + (L - DC00)
michael@0:             uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
michael@0:             if (err)
michael@0:               *err = false;
michael@0:             *buffer = p;
michael@0:             return ucs4;
michael@0:           }
michael@0:         else
michael@0:           {
michael@0:             // Found a high surrogate followed by something other than
michael@0:             // a low surrogate. Flag this as an error and return the
michael@0:             // Unicode replacement character 0xFFFD.  Note that the
michael@0:             // pointer to the next character points to the second 16-bit
michael@0:             // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
michael@0:             // only the first code unit of an illegal sequence must be
michael@0:             // treated as an illegally terminated code unit sequence
michael@0:             // (also Chapter 3 D91, "isolated [not paired and ill-formed]
michael@0:             // UTF-16 code units in the range D800..DFFF are ill-formed").
michael@0:             NS_WARNING("got a High Surrogate but no low surrogate");
michael@0: 
michael@0:             if (err)
michael@0:               *err = true;
michael@0:             *buffer = p - 1;
michael@0:             return 0xFFFD;
michael@0:           }
michael@0:       }
michael@0:     else // U+DC00 - U+DFFF
michael@0:       {
michael@0:         // DC00- DFFF - Low Surrogate
michael@0: 
michael@0:         // Found a low surrogate w/o a preceding high surrogate. Flag
michael@0:         // this as an error and return the Unicode replacement
michael@0:         // character 0xFFFD.
michael@0: 
michael@0:         NS_WARNING("got a low Surrogate but no high surrogate");
michael@0:         if (err)
michael@0:           *err = true;
michael@0:         *buffer = p;
michael@0:         return 0xFFFD;
michael@0:       }
michael@0: 
michael@0:     if (err)
michael@0:       *err = true;
michael@0:     return 0;
michael@0:   }
michael@0: };
michael@0: 
michael@0: 
michael@0: /**
michael@0:  * A character sink (see |copy_string| in nsAlgorithm.h) for converting
michael@0:  * UTF-8 to UTF-16
michael@0:  */
michael@0: class ConvertUTF8toUTF16
michael@0:   {
michael@0:     public:
michael@0:       typedef char      value_type;
michael@0:       typedef char16_t buffer_type;
michael@0: 
michael@0:     ConvertUTF8toUTF16( buffer_type* aBuffer )
michael@0:         : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
michael@0: 
michael@0:     size_t Length() const { return mBuffer - mStart; }
michael@0: 
michael@0:     bool ErrorEncountered() const { return mErrorEncountered; }
michael@0: 
michael@0:     void write( const value_type* start, uint32_t N )
michael@0:       {
michael@0:         if ( mErrorEncountered )
michael@0:           return;
michael@0: 
michael@0:         // algorithm assumes utf8 units won't
michael@0:         // be spread across fragments
michael@0:         const value_type* p = start;
michael@0:         const value_type* end = start + N;
michael@0:         buffer_type* out = mBuffer;
michael@0:         for ( ; p != end /* && *p */; )
michael@0:           {
michael@0:             bool err;
michael@0:             uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
michael@0: 
michael@0:             if ( err )
michael@0:               {
michael@0:                 mErrorEncountered = true;
michael@0:                 mBuffer = out;
michael@0:                 return;
michael@0:               }
michael@0: 
michael@0:             if ( ucs4 >= PLANE1_BASE )
michael@0:               {
michael@0:                 *out++ = (buffer_type)H_SURROGATE(ucs4);
michael@0:                 *out++ = (buffer_type)L_SURROGATE(ucs4);
michael@0:               }
michael@0:             else
michael@0:               {
michael@0:                 *out++ = ucs4;
michael@0:               }
michael@0:           }
michael@0:         mBuffer = out;
michael@0:       }
michael@0: 
michael@0:     void write_terminator()
michael@0:       {
michael@0:         *mBuffer = buffer_type(0);
michael@0:       }
michael@0: 
michael@0:     private:
michael@0:       buffer_type* const mStart;
michael@0:       buffer_type* mBuffer;
michael@0:       bool mErrorEncountered;
michael@0:   };
michael@0: 
michael@0: /**
michael@0:  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
michael@0:  * the length of the UTF-16 string equivalent to a UTF-8 string.
michael@0:  */
michael@0: class CalculateUTF8Length
michael@0:   {
michael@0:     public:
michael@0:       typedef char value_type;
michael@0: 
michael@0:     CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
michael@0: 
michael@0:     size_t Length() const { return mLength; }
michael@0: 
michael@0:     void write( const value_type* start, uint32_t N )
michael@0:       {
michael@0:           // ignore any further requests
michael@0:         if ( mErrorEncountered )
michael@0:             return;
michael@0: 
michael@0:         // algorithm assumes utf8 units won't
michael@0:         // be spread across fragments
michael@0:         const value_type* p = start;
michael@0:         const value_type* end = start + N;
michael@0:         for ( ; p < end /* && *p */; ++mLength )
michael@0:           {
michael@0:             if ( UTF8traits::isASCII(*p) )
michael@0:                 p += 1;
michael@0:             else if ( UTF8traits::is2byte(*p) )
michael@0:                 p += 2;
michael@0:             else if ( UTF8traits::is3byte(*p) )
michael@0:                 p += 3;
michael@0:             else if ( UTF8traits::is4byte(*p) ) {
michael@0:                 // Because a UTF-8 sequence of 4 bytes represents a codepoint
michael@0:                 // greater than 0xFFFF, it will become a surrogate pair in the
michael@0:                 // UTF-16 string, so add 1 more to mLength.
michael@0:                 // This doesn't happen with is5byte and is6byte because they
michael@0:                 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
michael@0:                 // converted to a single replacement character.
michael@0: 
michael@0:                 // However, there is one case when a 4 byte UTF-8 sequence will
michael@0:                 // only generate 2 UTF-16 bytes. If we have a properly encoded
michael@0:                 // sequence, but with an invalid value (too small or too big),
michael@0:                 // that will result in a replacement character being written
michael@0:                 // This replacement character is encoded as just 1 single
michael@0:                 // UTF-16 character, which is 2 bytes.
michael@0: 
michael@0:                 // The below code therefore only adds 1 to mLength if the UTF8
michael@0:                 // data will produce a decoded character which is greater than
michael@0:                 // or equal to 0x010000 and less than 0x0110000.
michael@0: 
michael@0:                 // A 4byte UTF8 character is encoded as
michael@0:                 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
michael@0:                 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
michael@0:                 // map to bit 17-21 in the final result. If these bits are
michael@0:                 // between 0x01 and 0x11, that means that the final result is
michael@0:                 // between 0x010000 and 0x110000. The below code reads these
michael@0:                 // bits out and assigns them to c, but shifted up 4 bits to
michael@0:                 // avoid having to shift twice.
michael@0: 
michael@0:                 // It doesn't matter what to do in the case where p + 4 > end
michael@0:                 // since no UTF16 characters will be written in that case by
michael@0:                 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
michael@0:                 // any of the surrogate bits are wrong since no UTF16
michael@0:                 // characters will be written in that case either.
michael@0: 
michael@0:                 if (p + 4 <= end) {
michael@0:                   uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
michael@0:                                ((uint32_t)(p[1] & 0x30));
michael@0:                   if (c >= 0x010 && c < 0x110)
michael@0:                     ++mLength;
michael@0:                 }
michael@0: 
michael@0:                 p += 4;
michael@0:             }
michael@0:             else if ( UTF8traits::is5byte(*p) )
michael@0:                 p += 5;
michael@0:             else if ( UTF8traits::is6byte(*p) )
michael@0:                 p += 6;
michael@0:             else // error
michael@0:               {
michael@0:                 ++mLength; // to account for the decrement below
michael@0:                 break;
michael@0:               }
michael@0:           }
michael@0:         if ( p != end )
michael@0:           {
michael@0:             NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
michael@0:             --mLength; // The last multi-byte char wasn't complete, discard it.
michael@0:             mErrorEncountered = true;
michael@0:           }
michael@0:       }
michael@0: 
michael@0:     private:
michael@0:       size_t mLength;
michael@0:       bool mErrorEncountered;
michael@0:   };
michael@0: 
michael@0: /**
michael@0:  * A character sink (see |copy_string| in nsAlgorithm.h) for
michael@0:  * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
michael@0:  * (0xEFBFBD in UTF-8).
michael@0:  */
michael@0: class ConvertUTF16toUTF8
michael@0:   {
michael@0:     public:
michael@0:       typedef char16_t value_type;
michael@0:       typedef char      buffer_type;
michael@0: 
michael@0:     // The error handling here is more lenient than that in
michael@0:     // |ConvertUTF8toUTF16|, but it's that way for backwards
michael@0:     // compatibility.
michael@0: 
michael@0:     ConvertUTF16toUTF8( buffer_type* aBuffer )
michael@0:         : mStart(aBuffer), mBuffer(aBuffer) {}
michael@0: 
michael@0:     size_t Size() const { return mBuffer - mStart; }
michael@0: 
michael@0:     void write( const value_type* start, uint32_t N )
michael@0:       {
michael@0:         buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
michael@0: 
michael@0:         for (const value_type *p = start, *end = start + N; p < end; ++p )
michael@0:           {
michael@0:             value_type c = *p;
michael@0:             if (! (c & 0xFF80)) // U+0000 - U+007F
michael@0:               {
michael@0:                 *out++ = (char)c;
michael@0:               }
michael@0:             else if (! (c & 0xF800)) // U+0100 - U+07FF
michael@0:               {
michael@0:                 *out++ = 0xC0 | (char)(c >> 6);
michael@0:                 *out++ = 0x80 | (char)(0x003F & c);
michael@0:               }
michael@0:             else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
michael@0:               {
michael@0:                 *out++ = 0xE0 | (char)(c >> 12);
michael@0:                 *out++ = 0x80 | (char)(0x003F & (c >> 6));
michael@0:                 *out++ = 0x80 | (char)(0x003F & c );
michael@0:               }
michael@0:             else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
michael@0:               {
michael@0:                 // D800- DBFF - High Surrogate
michael@0:                 value_type h = c;
michael@0: 
michael@0:                 ++p;
michael@0:                 if (p == end)
michael@0:                   {
michael@0:                     // Treat broken characters as the Unicode
michael@0:                     // replacement character 0xFFFD (0xEFBFBD in
michael@0:                     // UTF-8)
michael@0:                     *out++ = '\xEF';
michael@0:                     *out++ = '\xBF';
michael@0:                     *out++ = '\xBD';
michael@0: 
michael@0:                     NS_WARNING("String ending in half a surrogate pair!");
michael@0: 
michael@0:                     break;
michael@0:                   }
michael@0:                 c = *p;
michael@0: 
michael@0:                 if (NS_IS_LOW_SURROGATE(c))
michael@0:                   {
michael@0:                     // DC00- DFFF - Low Surrogate
michael@0:                     // N = (H - D800) *400 + 10000 + ( L - DC00 )
michael@0:                     uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
michael@0: 
michael@0:                     // 0001 0000-001F FFFF
michael@0:                     *out++ = 0xF0 | (char)(ucs4 >> 18);
michael@0:                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
michael@0:                     *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
michael@0:                     *out++ = 0x80 | (char)(0x003F & ucs4);
michael@0:                   }
michael@0:                 else
michael@0:                   {
michael@0:                     // Treat broken characters as the Unicode
michael@0:                     // replacement character 0xFFFD (0xEFBFBD in
michael@0:                     // UTF-8)
michael@0:                     *out++ = '\xEF';
michael@0:                     *out++ = '\xBF';
michael@0:                     *out++ = '\xBD';
michael@0: 
michael@0:                     // The pointer to the next character points to the second
michael@0:                     // 16-bit value, not beyond it, as per Unicode 5.0.0
michael@0:                     // Chapter 3 C10, only the first code unit of an illegal
michael@0:                     // sequence must be treated as an illegally terminated
michael@0:                     // code unit sequence (also Chapter 3 D91, "isolated [not
michael@0:                     // paired and ill-formed] UTF-16 code units in the range
michael@0:                     // D800..DFFF are ill-formed").
michael@0:                     p--;
michael@0: 
michael@0:                     NS_WARNING("got a High Surrogate but no low surrogate");
michael@0:                   }
michael@0:               }
michael@0:             else // U+DC00 - U+DFFF
michael@0:               {
michael@0:                 // Treat broken characters as the Unicode replacement
michael@0:                 // character 0xFFFD (0xEFBFBD in UTF-8)
michael@0:                 *out++ = '\xEF';
michael@0:                 *out++ = '\xBF';
michael@0:                 *out++ = '\xBD';
michael@0: 
michael@0:                 // DC00- DFFF - Low Surrogate
michael@0:                 NS_WARNING("got a low Surrogate but no high surrogate");
michael@0:               }
michael@0:           }
michael@0: 
michael@0:         mBuffer = out;
michael@0:       }
michael@0: 
michael@0:     void write_terminator()
michael@0:       {
michael@0:         *mBuffer = buffer_type(0);
michael@0:       }
michael@0: 
michael@0:     private:
michael@0:       buffer_type* const mStart;
michael@0:       buffer_type* mBuffer;
michael@0:   };
michael@0: 
michael@0: /**
michael@0:  * A character sink (see |copy_string| in nsAlgorithm.h) for computing
michael@0:  * the number of bytes a UTF-16 string would occupy in UTF-8. Treats invalid
michael@0:  * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
michael@0:  */
michael@0: class CalculateUTF8Size
michael@0:   {
michael@0:     public:
michael@0:       typedef char16_t value_type;
michael@0: 
michael@0:     CalculateUTF8Size()
michael@0:       : mSize(0) { }
michael@0: 
michael@0:     size_t Size() const { return mSize; }
michael@0: 
michael@0:     void write( const value_type* start, uint32_t N )
michael@0:       {
michael@0:         // Assume UCS2 surrogate pairs won't be spread across fragments.
michael@0:         for (const value_type *p = start, *end = start + N; p < end; ++p )
michael@0:           {
michael@0:             value_type c = *p;
michael@0:             if (! (c & 0xFF80)) // U+0000 - U+007F
michael@0:               mSize += 1;
michael@0:             else if (! (c & 0xF800)) // U+0100 - U+07FF
michael@0:               mSize += 2;
michael@0:             else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
michael@0:               mSize += 3;
michael@0:             else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
michael@0:               {
michael@0:                 ++p;
michael@0:                 if (p == end)
michael@0:                   {
michael@0:                     // Treat broken characters as the Unicode
michael@0:                     // replacement character 0xFFFD (0xEFBFBD in
michael@0:                     // UTF-8)
michael@0:                     mSize += 3;
michael@0: 
michael@0:                     NS_WARNING("String ending in half a surrogate pair!");
michael@0: 
michael@0:                     break;
michael@0:                   }
michael@0:                 c = *p;
michael@0: 
michael@0:                 if (0xDC00 == (0xFC00 & c))
michael@0:                   mSize += 4;
michael@0:                 else
michael@0:                   {
michael@0:                     // Treat broken characters as the Unicode
michael@0:                     // replacement character 0xFFFD (0xEFBFBD in
michael@0:                     // UTF-8)
michael@0:                     mSize += 3;
michael@0: 
michael@0:                     // The next code unit is the second 16-bit value, not
michael@0:                     // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
michael@0:                     // only the first code unit of an illegal sequence must
michael@0:                     // be treated as an illegally terminated code unit
michael@0:                     // sequence (also Chapter 3 D91, "isolated [not paired and
michael@0:                     // ill-formed] UTF-16 code units in the range D800..DFFF
michael@0:                     // are ill-formed").
michael@0:                     p--;
michael@0: 
michael@0:                     NS_WARNING("got a high Surrogate but no low surrogate");
michael@0:                   }
michael@0:               }
michael@0:             else // U+DC00 - U+DFFF
michael@0:               {
michael@0:                 // Treat broken characters as the Unicode replacement
michael@0:                 // character 0xFFFD (0xEFBFBD in UTF-8)
michael@0:                 mSize += 3;
michael@0: 
michael@0:                 NS_WARNING("got a low Surrogate but no high surrogate");
michael@0:               }
michael@0:           }
michael@0:       }
michael@0: 
michael@0:     private:
michael@0:       size_t mSize;
michael@0:   };
michael@0: 
michael@0: #ifdef MOZILLA_INTERNAL_API
michael@0: /**
michael@0:  * A character sink that performs a |reinterpret_cast|-style conversion
michael@0:  * from char to char16_t.
michael@0:  */
class LossyConvertEncoding8to16
  {
    public:
      typedef char      value_type;
      typedef char      input_type;
      typedef char16_t output_type;

    public:
      // |aDestination| is where the widened output is written; this class
      // performs no bounds checking on it.
      LossyConvertEncoding8to16( char16_t* aDestination ) :
        mDestination(aDestination) { }

      // Append |aSourceLength| bytes from |aSource| to the destination,
      // zero-extending each byte (treated as unsigned) to one char16_t.
      // Hands the work to write_sse2() when SSE2 is available at runtime.
      void
      write( const char* aSource, uint32_t aSourceLength )
        {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
          if (mozilla::supports_sse2())
            {
              write_sse2(aSource, aSourceLength);
              return;
            }
#endif
          const char* done_writing = aSource + aSourceLength;
          while ( aSource < done_writing )
            *mDestination++ = (char16_t)(unsigned char)(*aSource++);
        }

      // SSE2 variant of write(), defined out of line.  Declared only when
      // MOZILLA_MAY_SUPPORT_SSE2 is defined, since that is the only
      // configuration in which write() calls it; this matches the
      // declaration in LossyConvertEncoding16to8.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
      void
      write_sse2( const char* aSource, uint32_t aSourceLength );
#endif

      // Store a terminating zero at the current output position without
      // advancing past it.
      void
      write_terminator()
        {
          *mDestination = (char16_t)(0);
        }

    private:
      char16_t* mDestination;
  };
michael@0: 
michael@0: /**
michael@0:  * A character sink that performs a |reinterpret_cast|-style conversion
michael@0:  * from char16_t to char.
michael@0:  */
class LossyConvertEncoding16to8
  {
    public:
      typedef char16_t value_type;
      typedef char16_t input_type;
      typedef char      output_type;

      // |aDestination| is where the narrowed output is written; this class
      // performs no bounds checking on it.
      LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

      // Append |aSourceLength| code units from |aSource| to the
      // destination, converting each one to a single char with a plain
      // cast.  Hands the work to write_sse2() when SSE2 is available at
      // runtime.
      void
      write( const char16_t* aSource, uint32_t aSourceLength)
        {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
          if (mozilla::supports_sse2())
            {
              write_sse2(aSource, aSourceLength);
              return;
            }
#endif
          for ( uint32_t i = 0; i < aSourceLength; ++i )
            {
              *mDestination = (char)(aSource[i]);
              ++mDestination;
            }
        }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
      // SSE2 variant of write(), defined out of line.
      void
      write_sse2( const char16_t* aSource, uint32_t aSourceLength );
#endif

      // Store a terminating NUL at the current output position without
      // advancing past it.
      void
      write_terminator()
        {
          *mDestination = '\0';
        }

    private:
      char *mDestination;
  };
michael@0: #endif // MOZILLA_INTERNAL_API
michael@0: 
michael@0: #endif /* !defined(nsUTF8Utils_h_) */