michael@0: /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ michael@0: /* This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: #ifndef nsUTF8Utils_h_ michael@0: #define nsUTF8Utils_h_ michael@0: michael@0: // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this michael@0: // file will provide signatures for the Mozilla abstract string types. It will michael@0: // use XPCOM assertion/debugging macros, etc. michael@0: michael@0: #include "nscore.h" michael@0: #include "mozilla/SSE.h" michael@0: michael@0: #include "nsCharTraits.h" michael@0: michael@0: class UTF8traits michael@0: { michael@0: public: michael@0: static bool isASCII(char c) { return (c & 0x80) == 0x00; } michael@0: static bool isInSeq(char c) { return (c & 0xC0) == 0x80; } michael@0: static bool is2byte(char c) { return (c & 0xE0) == 0xC0; } michael@0: static bool is3byte(char c) { return (c & 0xF0) == 0xE0; } michael@0: static bool is4byte(char c) { return (c & 0xF8) == 0xF0; } michael@0: static bool is5byte(char c) { return (c & 0xFC) == 0xF8; } michael@0: static bool is6byte(char c) { return (c & 0xFE) == 0xFC; } michael@0: }; michael@0: michael@0: /** michael@0: * Extract the next UCS-4 character from the buffer and return it. The michael@0: * pointer passed in is advanced to the start of the next character in the michael@0: * buffer. If non-null, the parameters err and overlong are filled in to michael@0: * indicate that the character was represented by an overlong sequence, or michael@0: * that an error occurred. michael@0: */ michael@0: michael@0: class UTF8CharEnumerator michael@0: { michael@0: public: michael@0: static uint32_t NextChar(const char **buffer, const char *end, michael@0: bool *err) michael@0: { michael@0: NS_ASSERTION(buffer && *buffer, "null buffer!"); michael@0: michael@0: const char *p = *buffer; michael@0: *err = false; michael@0: michael@0: if (p >= end) michael@0: { michael@0: *err = true; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: char c = *p++; michael@0: michael@0: if ( UTF8traits::isASCII(c) ) michael@0: { michael@0: *buffer = p; michael@0: return c; michael@0: } michael@0: michael@0: uint32_t ucs4; michael@0: uint32_t minUcs4; michael@0: int32_t state = 0; michael@0: michael@0: if (!CalcState(c, ucs4, minUcs4, state)) { michael@0: NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); michael@0: *err = true; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: while ( state-- ) michael@0: { michael@0: if (p == end) michael@0: { michael@0: *err = true; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: c = *p++; michael@0: michael@0: if (!AddByte(c, state, ucs4)) michael@0: { michael@0: *err = true; michael@0: michael@0: return 0; michael@0: } michael@0: } michael@0: michael@0: if ( ucs4 < minUcs4 ) michael@0: { michael@0: // Overlong sequence michael@0: ucs4 = UCS2_REPLACEMENT_CHAR; michael@0: } michael@0: else if ( ucs4 >= 0xD800 && michael@0: (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) michael@0: { michael@0: // Surrogates and code points outside the Unicode range. michael@0: ucs4 = UCS2_REPLACEMENT_CHAR; michael@0: } michael@0: michael@0: *buffer = p; michael@0: return ucs4; michael@0: } michael@0: michael@0: private: michael@0: static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4, michael@0: int32_t& state) michael@0: { michael@0: if ( UTF8traits::is2byte(c) ) michael@0: { michael@0: ucs4 = (uint32_t(c) << 6) & 0x000007C0L; michael@0: state = 1; michael@0: minUcs4 = 0x00000080; michael@0: } michael@0: else if ( UTF8traits::is3byte(c) ) michael@0: { michael@0: ucs4 = (uint32_t(c) << 12) & 0x0000F000L; michael@0: state = 2; michael@0: minUcs4 = 0x00000800; michael@0: } michael@0: else if ( UTF8traits::is4byte(c) ) michael@0: { michael@0: ucs4 = (uint32_t(c) << 18) & 0x001F0000L; michael@0: state = 3; michael@0: minUcs4 = 0x00010000; michael@0: } michael@0: else if ( UTF8traits::is5byte(c) ) michael@0: { michael@0: ucs4 = (uint32_t(c) << 24) & 0x03000000L; michael@0: state = 4; michael@0: minUcs4 = 0x00200000; michael@0: } michael@0: else if ( UTF8traits::is6byte(c) ) michael@0: { michael@0: ucs4 = (uint32_t(c) << 30) & 0x40000000L; michael@0: state = 5; michael@0: minUcs4 = 0x04000000; michael@0: } michael@0: else michael@0: { michael@0: return false; michael@0: } michael@0: michael@0: return true; michael@0: } michael@0: michael@0: static bool AddByte(char c, int32_t state, uint32_t& ucs4) michael@0: { michael@0: if ( UTF8traits::isInSeq(c) ) michael@0: { michael@0: int32_t shift = state * 6; michael@0: ucs4 |= (uint32_t(c) & 0x3F) << shift; michael@0: return true; michael@0: } michael@0: michael@0: return false; michael@0: } michael@0: }; michael@0: michael@0: michael@0: /** michael@0: * Extract the next UCS-4 character from the buffer and return it. The michael@0: * pointer passed in is advanced to the start of the next character in the michael@0: * buffer. If non-null, the err parameter is filled in if an error occurs. michael@0: */ michael@0: michael@0: michael@0: class UTF16CharEnumerator michael@0: { michael@0: public: michael@0: static uint32_t NextChar(const char16_t **buffer, const char16_t *end, michael@0: bool *err = nullptr) michael@0: { michael@0: NS_ASSERTION(buffer && *buffer, "null buffer!"); michael@0: michael@0: const char16_t *p = *buffer; michael@0: michael@0: if (p >= end) michael@0: { michael@0: NS_ERROR("No input to work with"); michael@0: if (err) michael@0: *err = true; michael@0: michael@0: return 0; michael@0: } michael@0: michael@0: char16_t c = *p++; michael@0: michael@0: if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF michael@0: { michael@0: if (err) michael@0: *err = false; michael@0: *buffer = p; michael@0: return c; michael@0: } michael@0: else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF michael@0: { michael@0: if (p == end) michael@0: { michael@0: // Found a high surrogate the end of the buffer. Flag this michael@0: // as an error and return the Unicode replacement michael@0: // character 0xFFFD. michael@0: michael@0: NS_WARNING("Unexpected end of buffer after high surrogate"); michael@0: michael@0: if (err) michael@0: *err = true; michael@0: *buffer = p; michael@0: return 0xFFFD; michael@0: } michael@0: michael@0: // D800- DBFF - High Surrogate michael@0: char16_t h = c; michael@0: michael@0: c = *p++; michael@0: michael@0: if (NS_IS_LOW_SURROGATE(c)) michael@0: { michael@0: // DC00- DFFF - Low Surrogate michael@0: // N = (H - D800) *400 + 10000 + (L - DC00) michael@0: uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); michael@0: if (err) michael@0: *err = false; michael@0: *buffer = p; michael@0: return ucs4; michael@0: } michael@0: else michael@0: { michael@0: // Found a high surrogate followed by something other than michael@0: // a low surrogate. Flag this as an error and return the michael@0: // Unicode replacement character 0xFFFD. Note that the michael@0: // pointer to the next character points to the second 16-bit michael@0: // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10, michael@0: // only the first code unit of an illegal sequence must be michael@0: // treated as an illegally terminated code unit sequence michael@0: // (also Chapter 3 D91, "isolated [not paired and ill-formed] michael@0: // UTF-16 code units in the range D800..DFFF are ill-formed"). michael@0: NS_WARNING("got a High Surrogate but no low surrogate"); michael@0: michael@0: if (err) michael@0: *err = true; michael@0: *buffer = p - 1; michael@0: return 0xFFFD; michael@0: } michael@0: } michael@0: else // U+DC00 - U+DFFF michael@0: { michael@0: // DC00- DFFF - Low Surrogate michael@0: michael@0: // Found a low surrogate w/o a preceding high surrogate. Flag michael@0: // this as an error and return the Unicode replacement michael@0: // character 0xFFFD. michael@0: michael@0: NS_WARNING("got a low Surrogate but no high surrogate"); michael@0: if (err) michael@0: *err = true; michael@0: *buffer = p; michael@0: return 0xFFFD; michael@0: } michael@0: michael@0: if (err) michael@0: *err = true; michael@0: return 0; michael@0: } michael@0: }; michael@0: michael@0: michael@0: /** michael@0: * A character sink (see |copy_string| in nsAlgorithm.h) for converting michael@0: * UTF-8 to UTF-16 michael@0: */ michael@0: class ConvertUTF8toUTF16 michael@0: { michael@0: public: michael@0: typedef char value_type; michael@0: typedef char16_t buffer_type; michael@0: michael@0: ConvertUTF8toUTF16( buffer_type* aBuffer ) michael@0: : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {} michael@0: michael@0: size_t Length() const { return mBuffer - mStart; } michael@0: michael@0: bool ErrorEncountered() const { return mErrorEncountered; } michael@0: michael@0: void write( const value_type* start, uint32_t N ) michael@0: { michael@0: if ( mErrorEncountered ) michael@0: return; michael@0: michael@0: // algorithm assumes utf8 units won't michael@0: // be spread across fragments michael@0: const value_type* p = start; michael@0: const value_type* end = start + N; michael@0: buffer_type* out = mBuffer; michael@0: for ( ; p != end /* && *p */; ) michael@0: { michael@0: bool err; michael@0: uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err); michael@0: michael@0: if ( err ) michael@0: { michael@0: mErrorEncountered = true; michael@0: mBuffer = out; michael@0: return; michael@0: } michael@0: michael@0: if ( ucs4 >= PLANE1_BASE ) michael@0: { michael@0: *out++ = (buffer_type)H_SURROGATE(ucs4); michael@0: *out++ = (buffer_type)L_SURROGATE(ucs4); michael@0: } michael@0: else michael@0: { michael@0: *out++ = ucs4; michael@0: } michael@0: } michael@0: mBuffer = out; michael@0: } michael@0: michael@0: void write_terminator() michael@0: { michael@0: *mBuffer = buffer_type(0); michael@0: } michael@0: michael@0: private: michael@0: buffer_type* const mStart; michael@0: buffer_type* mBuffer; michael@0: bool mErrorEncountered; michael@0: }; michael@0: michael@0: /** michael@0: * A character sink (see |copy_string| in nsAlgorithm.h) for computing michael@0: * the length of the UTF-16 string equivalent to a UTF-8 string. michael@0: */ michael@0: class CalculateUTF8Length michael@0: { michael@0: public: michael@0: typedef char value_type; michael@0: michael@0: CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { } michael@0: michael@0: size_t Length() const { return mLength; } michael@0: michael@0: void write( const value_type* start, uint32_t N ) michael@0: { michael@0: // ignore any further requests michael@0: if ( mErrorEncountered ) michael@0: return; michael@0: michael@0: // algorithm assumes utf8 units won't michael@0: // be spread across fragments michael@0: const value_type* p = start; michael@0: const value_type* end = start + N; michael@0: for ( ; p < end /* && *p */; ++mLength ) michael@0: { michael@0: if ( UTF8traits::isASCII(*p) ) michael@0: p += 1; michael@0: else if ( UTF8traits::is2byte(*p) ) michael@0: p += 2; michael@0: else if ( UTF8traits::is3byte(*p) ) michael@0: p += 3; michael@0: else if ( UTF8traits::is4byte(*p) ) { michael@0: // Because a UTF-8 sequence of 4 bytes represents a codepoint michael@0: // greater than 0xFFFF, it will become a surrogate pair in the michael@0: // UTF-16 string, so add 1 more to mLength. michael@0: // This doesn't happen with is5byte and is6byte because they michael@0: // are illegal UTF-8 sequences (greater than 0x10FFFF) so get michael@0: // converted to a single replacement character. michael@0: michael@0: // However, there is one case when a 4 byte UTF-8 sequence will michael@0: // only generate 2 UTF-16 bytes. If we have a properly encoded michael@0: // sequence, but with an invalid value (too small or too big), michael@0: // that will result in a replacement character being written michael@0: // This replacement character is encoded as just 1 single michael@0: // UTF-16 character, which is 2 bytes. michael@0: michael@0: // The below code therefore only adds 1 to mLength if the UTF8 michael@0: // data will produce a decoded character which is greater than michael@0: // or equal to 0x010000 and less than 0x0110000. michael@0: michael@0: // A 4byte UTF8 character is encoded as michael@0: // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx michael@0: // Bit 1-3 on the first byte, and bit 5-6 on the second byte, michael@0: // map to bit 17-21 in the final result. If these bits are michael@0: // between 0x01 and 0x11, that means that the final result is michael@0: // between 0x010000 and 0x110000. The below code reads these michael@0: // bits out and assigns them to c, but shifted up 4 bits to michael@0: // avoid having to shift twice. michael@0: michael@0: // It doesn't matter what to do in the case where p + 4 > end michael@0: // since no UTF16 characters will be written in that case by michael@0: // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if michael@0: // any of the surrogate bits are wrong since no UTF16 michael@0: // characters will be written in that case either. michael@0: michael@0: if (p + 4 <= end) { michael@0: uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 | michael@0: ((uint32_t)(p[1] & 0x30)); michael@0: if (c >= 0x010 && c < 0x110) michael@0: ++mLength; michael@0: } michael@0: michael@0: p += 4; michael@0: } michael@0: else if ( UTF8traits::is5byte(*p) ) michael@0: p += 5; michael@0: else if ( UTF8traits::is6byte(*p) ) michael@0: p += 6; michael@0: else // error michael@0: { michael@0: ++mLength; // to account for the decrement below michael@0: break; michael@0: } michael@0: } michael@0: if ( p != end ) michael@0: { michael@0: NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); michael@0: --mLength; // The last multi-byte char wasn't complete, discard it. michael@0: mErrorEncountered = true; michael@0: } michael@0: } michael@0: michael@0: private: michael@0: size_t mLength; michael@0: bool mErrorEncountered; michael@0: }; michael@0: michael@0: /** michael@0: * A character sink (see |copy_string| in nsAlgorithm.h) for michael@0: * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD michael@0: * (0xEFBFBD in UTF-8). michael@0: */ michael@0: class ConvertUTF16toUTF8 michael@0: { michael@0: public: michael@0: typedef char16_t value_type; michael@0: typedef char buffer_type; michael@0: michael@0: // The error handling here is more lenient than that in michael@0: // |ConvertUTF8toUTF16|, but it's that way for backwards michael@0: // compatibility. michael@0: michael@0: ConvertUTF16toUTF8( buffer_type* aBuffer ) michael@0: : mStart(aBuffer), mBuffer(aBuffer) {} michael@0: michael@0: size_t Size() const { return mBuffer - mStart; } michael@0: michael@0: void write( const value_type* start, uint32_t N ) michael@0: { michael@0: buffer_type *out = mBuffer; // gcc isn't smart enough to do this! michael@0: michael@0: for (const value_type *p = start, *end = start + N; p < end; ++p ) michael@0: { michael@0: value_type c = *p; michael@0: if (! (c & 0xFF80)) // U+0000 - U+007F michael@0: { michael@0: *out++ = (char)c; michael@0: } michael@0: else if (! (c & 0xF800)) // U+0100 - U+07FF michael@0: { michael@0: *out++ = 0xC0 | (char)(c >> 6); michael@0: *out++ = 0x80 | (char)(0x003F & c); michael@0: } michael@0: else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF michael@0: { michael@0: *out++ = 0xE0 | (char)(c >> 12); michael@0: *out++ = 0x80 | (char)(0x003F & (c >> 6)); michael@0: *out++ = 0x80 | (char)(0x003F & c ); michael@0: } michael@0: else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF michael@0: { michael@0: // D800- DBFF - High Surrogate michael@0: value_type h = c; michael@0: michael@0: ++p; michael@0: if (p == end) michael@0: { michael@0: // Treat broken characters as the Unicode michael@0: // replacement character 0xFFFD (0xEFBFBD in michael@0: // UTF-8) michael@0: *out++ = '\xEF'; michael@0: *out++ = '\xBF'; michael@0: *out++ = '\xBD'; michael@0: michael@0: NS_WARNING("String ending in half a surrogate pair!"); michael@0: michael@0: break; michael@0: } michael@0: c = *p; michael@0: michael@0: if (NS_IS_LOW_SURROGATE(c)) michael@0: { michael@0: // DC00- DFFF - Low Surrogate michael@0: // N = (H - D800) *400 + 10000 + ( L - DC00 ) michael@0: uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); michael@0: michael@0: // 0001 0000-001F FFFF michael@0: *out++ = 0xF0 | (char)(ucs4 >> 18); michael@0: *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); michael@0: *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); michael@0: *out++ = 0x80 | (char)(0x003F & ucs4); michael@0: } michael@0: else michael@0: { michael@0: // Treat broken characters as the Unicode michael@0: // replacement character 0xFFFD (0xEFBFBD in michael@0: // UTF-8) michael@0: *out++ = '\xEF'; michael@0: *out++ = '\xBF'; michael@0: *out++ = '\xBD'; michael@0: michael@0: // The pointer to the next character points to the second michael@0: // 16-bit value, not beyond it, as per Unicode 5.0.0 michael@0: // Chapter 3 C10, only the first code unit of an illegal michael@0: // sequence must be treated as an illegally terminated michael@0: // code unit sequence (also Chapter 3 D91, "isolated [not michael@0: // paired and ill-formed] UTF-16 code units in the range michael@0: // D800..DFFF are ill-formed"). michael@0: p--; michael@0: michael@0: NS_WARNING("got a High Surrogate but no low surrogate"); michael@0: } michael@0: } michael@0: else // U+DC00 - U+DFFF michael@0: { michael@0: // Treat broken characters as the Unicode replacement michael@0: // character 0xFFFD (0xEFBFBD in UTF-8) michael@0: *out++ = '\xEF'; michael@0: *out++ = '\xBF'; michael@0: *out++ = '\xBD'; michael@0: michael@0: // DC00- DFFF - Low Surrogate michael@0: NS_WARNING("got a low Surrogate but no high surrogate"); michael@0: } michael@0: } michael@0: michael@0: mBuffer = out; michael@0: } michael@0: michael@0: void write_terminator() michael@0: { michael@0: *mBuffer = buffer_type(0); michael@0: } michael@0: michael@0: private: michael@0: buffer_type* const mStart; michael@0: buffer_type* mBuffer; michael@0: }; michael@0: michael@0: /** michael@0: * A character sink (see |copy_string| in nsAlgorithm.h) for computing michael@0: * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid michael@0: * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8). michael@0: */ michael@0: class CalculateUTF8Size michael@0: { michael@0: public: michael@0: typedef char16_t value_type; michael@0: michael@0: CalculateUTF8Size() michael@0: : mSize(0) { } michael@0: michael@0: size_t Size() const { return mSize; } michael@0: michael@0: void write( const value_type* start, uint32_t N ) michael@0: { michael@0: // Assume UCS2 surrogate pairs won't be spread across fragments. michael@0: for (const value_type *p = start, *end = start + N; p < end; ++p ) michael@0: { michael@0: value_type c = *p; michael@0: if (! (c & 0xFF80)) // U+0000 - U+007F michael@0: mSize += 1; michael@0: else if (! (c & 0xF800)) // U+0100 - U+07FF michael@0: mSize += 2; michael@0: else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF michael@0: mSize += 3; michael@0: else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF michael@0: { michael@0: ++p; michael@0: if (p == end) michael@0: { michael@0: // Treat broken characters as the Unicode michael@0: // replacement character 0xFFFD (0xEFBFBD in michael@0: // UTF-8) michael@0: mSize += 3; michael@0: michael@0: NS_WARNING("String ending in half a surrogate pair!"); michael@0: michael@0: break; michael@0: } michael@0: c = *p; michael@0: michael@0: if (0xDC00 == (0xFC00 & c)) michael@0: mSize += 4; michael@0: else michael@0: { michael@0: // Treat broken characters as the Unicode michael@0: // replacement character 0xFFFD (0xEFBFBD in michael@0: // UTF-8) michael@0: mSize += 3; michael@0: michael@0: // The next code unit is the second 16-bit value, not michael@0: // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10, michael@0: // only the first code unit of an illegal sequence must michael@0: // be treated as an illegally terminated code unit michael@0: // sequence (also Chapter 3 D91, "isolated [not paired and michael@0: // ill-formed] UTF-16 code units in the range D800..DFFF michael@0: // are ill-formed"). michael@0: p--; michael@0: michael@0: NS_WARNING("got a high Surrogate but no low surrogate"); michael@0: } michael@0: } michael@0: else // U+DC00 - U+DFFF michael@0: { michael@0: // Treat broken characters as the Unicode replacement michael@0: // character 0xFFFD (0xEFBFBD in UTF-8) michael@0: mSize += 3; michael@0: michael@0: NS_WARNING("got a low Surrogate but no high surrogate"); michael@0: } michael@0: } michael@0: } michael@0: michael@0: private: michael@0: size_t mSize; michael@0: }; michael@0: michael@0: #ifdef MOZILLA_INTERNAL_API michael@0: /** michael@0: * A character sink that performs a |reinterpret_cast|-style conversion michael@0: * from char to char16_t. michael@0: */ michael@0: class LossyConvertEncoding8to16 michael@0: { michael@0: public: michael@0: typedef char value_type; michael@0: typedef char input_type; michael@0: typedef char16_t output_type; michael@0: michael@0: public: michael@0: LossyConvertEncoding8to16( char16_t* aDestination ) : michael@0: mDestination(aDestination) { } michael@0: michael@0: void michael@0: write( const char* aSource, uint32_t aSourceLength ) michael@0: { michael@0: #ifdef MOZILLA_MAY_SUPPORT_SSE2 michael@0: if (mozilla::supports_sse2()) michael@0: { michael@0: write_sse2(aSource, aSourceLength); michael@0: return; michael@0: } michael@0: #endif michael@0: const char* done_writing = aSource + aSourceLength; michael@0: while ( aSource < done_writing ) michael@0: *mDestination++ = (char16_t)(unsigned char)(*aSource++); michael@0: } michael@0: michael@0: void michael@0: write_sse2( const char* aSource, uint32_t aSourceLength ); michael@0: michael@0: void michael@0: write_terminator() michael@0: { michael@0: *mDestination = (char16_t)(0); michael@0: } michael@0: michael@0: private: michael@0: char16_t* mDestination; michael@0: }; michael@0: michael@0: /** michael@0: * A character sink that performs a |reinterpret_cast|-style conversion michael@0: * from char16_t to char. michael@0: */ michael@0: class LossyConvertEncoding16to8 michael@0: { michael@0: public: michael@0: typedef char16_t value_type; michael@0: typedef char16_t input_type; michael@0: typedef char output_type; michael@0: michael@0: LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { } michael@0: michael@0: void michael@0: write( const char16_t* aSource, uint32_t aSourceLength) michael@0: { michael@0: #ifdef MOZILLA_MAY_SUPPORT_SSE2 michael@0: if (mozilla::supports_sse2()) michael@0: { michael@0: write_sse2(aSource, aSourceLength); michael@0: return; michael@0: } michael@0: #endif michael@0: const char16_t* done_writing = aSource + aSourceLength; michael@0: while ( aSource < done_writing ) michael@0: *mDestination++ = (char)(*aSource++); michael@0: } michael@0: michael@0: #ifdef MOZILLA_MAY_SUPPORT_SSE2 michael@0: void michael@0: write_sse2( const char16_t* aSource, uint32_t aSourceLength ); michael@0: #endif michael@0: michael@0: void michael@0: write_terminator() michael@0: { michael@0: *mDestination = '\0'; michael@0: } michael@0: michael@0: private: michael@0: char *mDestination; michael@0: }; michael@0: #endif // MOZILLA_INTERNAL_API michael@0: michael@0: #endif /* !defined(nsUTF8Utils_h_) */