1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/xpcom/string/public/nsUTF8Utils.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,721 @@ 1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 +#ifndef nsUTF8Utils_h_ 1.9 +#define nsUTF8Utils_h_ 1.10 + 1.11 +// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this 1.12 +// file will provide signatures for the Mozilla abstract string types. It will 1.13 +// use XPCOM assertion/debugging macros, etc. 1.14 + 1.15 +#include "nscore.h" 1.16 +#include "mozilla/SSE.h" 1.17 + 1.18 +#include "nsCharTraits.h" 1.19 + 1.20 +class UTF8traits 1.21 + { 1.22 + public: 1.23 + static bool isASCII(char c) { return (c & 0x80) == 0x00; } 1.24 + static bool isInSeq(char c) { return (c & 0xC0) == 0x80; } 1.25 + static bool is2byte(char c) { return (c & 0xE0) == 0xC0; } 1.26 + static bool is3byte(char c) { return (c & 0xF0) == 0xE0; } 1.27 + static bool is4byte(char c) { return (c & 0xF8) == 0xF0; } 1.28 + static bool is5byte(char c) { return (c & 0xFC) == 0xF8; } 1.29 + static bool is6byte(char c) { return (c & 0xFE) == 0xFC; } 1.30 + }; 1.31 + 1.32 +/** 1.33 + * Extract the next UCS-4 character from the buffer and return it. The 1.34 + * pointer passed in is advanced to the start of the next character in the 1.35 + * buffer. If non-null, the parameters err and overlong are filled in to 1.36 + * indicate that the character was represented by an overlong sequence, or 1.37 + * that an error occurred. 
1.38 + */ 1.39 + 1.40 +class UTF8CharEnumerator 1.41 +{ 1.42 +public: 1.43 + static uint32_t NextChar(const char **buffer, const char *end, 1.44 + bool *err) 1.45 + { 1.46 + NS_ASSERTION(buffer && *buffer, "null buffer!"); 1.47 + 1.48 + const char *p = *buffer; 1.49 + *err = false; 1.50 + 1.51 + if (p >= end) 1.52 + { 1.53 + *err = true; 1.54 + 1.55 + return 0; 1.56 + } 1.57 + 1.58 + char c = *p++; 1.59 + 1.60 + if ( UTF8traits::isASCII(c) ) 1.61 + { 1.62 + *buffer = p; 1.63 + return c; 1.64 + } 1.65 + 1.66 + uint32_t ucs4; 1.67 + uint32_t minUcs4; 1.68 + int32_t state = 0; 1.69 + 1.70 + if (!CalcState(c, ucs4, minUcs4, state)) { 1.71 + NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); 1.72 + *err = true; 1.73 + 1.74 + return 0; 1.75 + } 1.76 + 1.77 + while ( state-- ) 1.78 + { 1.79 + if (p == end) 1.80 + { 1.81 + *err = true; 1.82 + 1.83 + return 0; 1.84 + } 1.85 + 1.86 + c = *p++; 1.87 + 1.88 + if (!AddByte(c, state, ucs4)) 1.89 + { 1.90 + *err = true; 1.91 + 1.92 + return 0; 1.93 + } 1.94 + } 1.95 + 1.96 + if ( ucs4 < minUcs4 ) 1.97 + { 1.98 + // Overlong sequence 1.99 + ucs4 = UCS2_REPLACEMENT_CHAR; 1.100 + } 1.101 + else if ( ucs4 >= 0xD800 && 1.102 + (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) 1.103 + { 1.104 + // Surrogates and code points outside the Unicode range. 
1.105 + ucs4 = UCS2_REPLACEMENT_CHAR; 1.106 + } 1.107 + 1.108 + *buffer = p; 1.109 + return ucs4; 1.110 + } 1.111 + 1.112 +private: 1.113 + static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4, 1.114 + int32_t& state) 1.115 + { 1.116 + if ( UTF8traits::is2byte(c) ) 1.117 + { 1.118 + ucs4 = (uint32_t(c) << 6) & 0x000007C0L; 1.119 + state = 1; 1.120 + minUcs4 = 0x00000080; 1.121 + } 1.122 + else if ( UTF8traits::is3byte(c) ) 1.123 + { 1.124 + ucs4 = (uint32_t(c) << 12) & 0x0000F000L; 1.125 + state = 2; 1.126 + minUcs4 = 0x00000800; 1.127 + } 1.128 + else if ( UTF8traits::is4byte(c) ) 1.129 + { 1.130 + ucs4 = (uint32_t(c) << 18) & 0x001F0000L; 1.131 + state = 3; 1.132 + minUcs4 = 0x00010000; 1.133 + } 1.134 + else if ( UTF8traits::is5byte(c) ) 1.135 + { 1.136 + ucs4 = (uint32_t(c) << 24) & 0x03000000L; 1.137 + state = 4; 1.138 + minUcs4 = 0x00200000; 1.139 + } 1.140 + else if ( UTF8traits::is6byte(c) ) 1.141 + { 1.142 + ucs4 = (uint32_t(c) << 30) & 0x40000000L; 1.143 + state = 5; 1.144 + minUcs4 = 0x04000000; 1.145 + } 1.146 + else 1.147 + { 1.148 + return false; 1.149 + } 1.150 + 1.151 + return true; 1.152 + } 1.153 + 1.154 + static bool AddByte(char c, int32_t state, uint32_t& ucs4) 1.155 + { 1.156 + if ( UTF8traits::isInSeq(c) ) 1.157 + { 1.158 + int32_t shift = state * 6; 1.159 + ucs4 |= (uint32_t(c) & 0x3F) << shift; 1.160 + return true; 1.161 + } 1.162 + 1.163 + return false; 1.164 + } 1.165 +}; 1.166 + 1.167 + 1.168 +/** 1.169 + * Extract the next UCS-4 character from the buffer and return it. The 1.170 + * pointer passed in is advanced to the start of the next character in the 1.171 + * buffer. If non-null, the err parameter is filled in if an error occurs. 
1.172 + */ 1.173 + 1.174 + 1.175 +class UTF16CharEnumerator 1.176 +{ 1.177 +public: 1.178 + static uint32_t NextChar(const char16_t **buffer, const char16_t *end, 1.179 + bool *err = nullptr) 1.180 + { 1.181 + NS_ASSERTION(buffer && *buffer, "null buffer!"); 1.182 + 1.183 + const char16_t *p = *buffer; 1.184 + 1.185 + if (p >= end) 1.186 + { 1.187 + NS_ERROR("No input to work with"); 1.188 + if (err) 1.189 + *err = true; 1.190 + 1.191 + return 0; 1.192 + } 1.193 + 1.194 + char16_t c = *p++; 1.195 + 1.196 + if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF 1.197 + { 1.198 + if (err) 1.199 + *err = false; 1.200 + *buffer = p; 1.201 + return c; 1.202 + } 1.203 + else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF 1.204 + { 1.205 + if (p == end) 1.206 + { 1.207 + // Found a high surrogate the end of the buffer. Flag this 1.208 + // as an error and return the Unicode replacement 1.209 + // character 0xFFFD. 1.210 + 1.211 + NS_WARNING("Unexpected end of buffer after high surrogate"); 1.212 + 1.213 + if (err) 1.214 + *err = true; 1.215 + *buffer = p; 1.216 + return 0xFFFD; 1.217 + } 1.218 + 1.219 + // D800- DBFF - High Surrogate 1.220 + char16_t h = c; 1.221 + 1.222 + c = *p++; 1.223 + 1.224 + if (NS_IS_LOW_SURROGATE(c)) 1.225 + { 1.226 + // DC00- DFFF - Low Surrogate 1.227 + // N = (H - D800) *400 + 10000 + (L - DC00) 1.228 + uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); 1.229 + if (err) 1.230 + *err = false; 1.231 + *buffer = p; 1.232 + return ucs4; 1.233 + } 1.234 + else 1.235 + { 1.236 + // Found a high surrogate followed by something other than 1.237 + // a low surrogate. Flag this as an error and return the 1.238 + // Unicode replacement character 0xFFFD. 
Note that the 1.239 + // pointer to the next character points to the second 16-bit 1.240 + // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10, 1.241 + // only the first code unit of an illegal sequence must be 1.242 + // treated as an illegally terminated code unit sequence 1.243 + // (also Chapter 3 D91, "isolated [not paired and ill-formed] 1.244 + // UTF-16 code units in the range D800..DFFF are ill-formed"). 1.245 + NS_WARNING("got a High Surrogate but no low surrogate"); 1.246 + 1.247 + if (err) 1.248 + *err = true; 1.249 + *buffer = p - 1; 1.250 + return 0xFFFD; 1.251 + } 1.252 + } 1.253 + else // U+DC00 - U+DFFF 1.254 + { 1.255 + // DC00- DFFF - Low Surrogate 1.256 + 1.257 + // Found a low surrogate w/o a preceding high surrogate. Flag 1.258 + // this as an error and return the Unicode replacement 1.259 + // character 0xFFFD. 1.260 + 1.261 + NS_WARNING("got a low Surrogate but no high surrogate"); 1.262 + if (err) 1.263 + *err = true; 1.264 + *buffer = p; 1.265 + return 0xFFFD; 1.266 + } 1.267 + 1.268 + if (err) 1.269 + *err = true; 1.270 + return 0; 1.271 + } 1.272 +}; 1.273 + 1.274 + 1.275 +/** 1.276 + * A character sink (see |copy_string| in nsAlgorithm.h) for converting 1.277 + * UTF-8 to UTF-16 1.278 + */ 1.279 +class ConvertUTF8toUTF16 1.280 + { 1.281 + public: 1.282 + typedef char value_type; 1.283 + typedef char16_t buffer_type; 1.284 + 1.285 + ConvertUTF8toUTF16( buffer_type* aBuffer ) 1.286 + : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {} 1.287 + 1.288 + size_t Length() const { return mBuffer - mStart; } 1.289 + 1.290 + bool ErrorEncountered() const { return mErrorEncountered; } 1.291 + 1.292 + void write( const value_type* start, uint32_t N ) 1.293 + { 1.294 + if ( mErrorEncountered ) 1.295 + return; 1.296 + 1.297 + // algorithm assumes utf8 units won't 1.298 + // be spread across fragments 1.299 + const value_type* p = start; 1.300 + const value_type* end = start + N; 1.301 + buffer_type* out = mBuffer; 1.302 + for ( ; 
p != end /* && *p */; ) 1.303 + { 1.304 + bool err; 1.305 + uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err); 1.306 + 1.307 + if ( err ) 1.308 + { 1.309 + mErrorEncountered = true; 1.310 + mBuffer = out; 1.311 + return; 1.312 + } 1.313 + 1.314 + if ( ucs4 >= PLANE1_BASE ) 1.315 + { 1.316 + *out++ = (buffer_type)H_SURROGATE(ucs4); 1.317 + *out++ = (buffer_type)L_SURROGATE(ucs4); 1.318 + } 1.319 + else 1.320 + { 1.321 + *out++ = ucs4; 1.322 + } 1.323 + } 1.324 + mBuffer = out; 1.325 + } 1.326 + 1.327 + void write_terminator() 1.328 + { 1.329 + *mBuffer = buffer_type(0); 1.330 + } 1.331 + 1.332 + private: 1.333 + buffer_type* const mStart; 1.334 + buffer_type* mBuffer; 1.335 + bool mErrorEncountered; 1.336 + }; 1.337 + 1.338 +/** 1.339 + * A character sink (see |copy_string| in nsAlgorithm.h) for computing 1.340 + * the length of the UTF-16 string equivalent to a UTF-8 string. 1.341 + */ 1.342 +class CalculateUTF8Length 1.343 + { 1.344 + public: 1.345 + typedef char value_type; 1.346 + 1.347 + CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { } 1.348 + 1.349 + size_t Length() const { return mLength; } 1.350 + 1.351 + void write( const value_type* start, uint32_t N ) 1.352 + { 1.353 + // ignore any further requests 1.354 + if ( mErrorEncountered ) 1.355 + return; 1.356 + 1.357 + // algorithm assumes utf8 units won't 1.358 + // be spread across fragments 1.359 + const value_type* p = start; 1.360 + const value_type* end = start + N; 1.361 + for ( ; p < end /* && *p */; ++mLength ) 1.362 + { 1.363 + if ( UTF8traits::isASCII(*p) ) 1.364 + p += 1; 1.365 + else if ( UTF8traits::is2byte(*p) ) 1.366 + p += 2; 1.367 + else if ( UTF8traits::is3byte(*p) ) 1.368 + p += 3; 1.369 + else if ( UTF8traits::is4byte(*p) ) { 1.370 + // Because a UTF-8 sequence of 4 bytes represents a codepoint 1.371 + // greater than 0xFFFF, it will become a surrogate pair in the 1.372 + // UTF-16 string, so add 1 more to mLength. 
1.373 + // This doesn't happen with is5byte and is6byte because they 1.374 + // are illegal UTF-8 sequences (greater than 0x10FFFF) so get 1.375 + // converted to a single replacement character. 1.376 + 1.377 + // However, there is one case when a 4 byte UTF-8 sequence will 1.378 + // only generate 2 UTF-16 bytes. If we have a properly encoded 1.379 + // sequence, but with an invalid value (too small or too big), 1.380 + // that will result in a replacement character being written 1.381 + // This replacement character is encoded as just 1 single 1.382 + // UTF-16 character, which is 2 bytes. 1.383 + 1.384 + // The below code therefore only adds 1 to mLength if the UTF8 1.385 + // data will produce a decoded character which is greater than 1.386 + // or equal to 0x010000 and less than 0x0110000. 1.387 + 1.388 + // A 4byte UTF8 character is encoded as 1.389 + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 1.390 + // Bit 1-3 on the first byte, and bit 5-6 on the second byte, 1.391 + // map to bit 17-21 in the final result. If these bits are 1.392 + // between 0x01 and 0x11, that means that the final result is 1.393 + // between 0x010000 and 0x110000. The below code reads these 1.394 + // bits out and assigns them to c, but shifted up 4 bits to 1.395 + // avoid having to shift twice. 1.396 + 1.397 + // It doesn't matter what to do in the case where p + 4 > end 1.398 + // since no UTF16 characters will be written in that case by 1.399 + // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if 1.400 + // any of the surrogate bits are wrong since no UTF16 1.401 + // characters will be written in that case either. 
1.402 + 1.403 + if (p + 4 <= end) { 1.404 + uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 | 1.405 + ((uint32_t)(p[1] & 0x30)); 1.406 + if (c >= 0x010 && c < 0x110) 1.407 + ++mLength; 1.408 + } 1.409 + 1.410 + p += 4; 1.411 + } 1.412 + else if ( UTF8traits::is5byte(*p) ) 1.413 + p += 5; 1.414 + else if ( UTF8traits::is6byte(*p) ) 1.415 + p += 6; 1.416 + else // error 1.417 + { 1.418 + ++mLength; // to account for the decrement below 1.419 + break; 1.420 + } 1.421 + } 1.422 + if ( p != end ) 1.423 + { 1.424 + NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); 1.425 + --mLength; // The last multi-byte char wasn't complete, discard it. 1.426 + mErrorEncountered = true; 1.427 + } 1.428 + } 1.429 + 1.430 + private: 1.431 + size_t mLength; 1.432 + bool mErrorEncountered; 1.433 + }; 1.434 + 1.435 +/** 1.436 + * A character sink (see |copy_string| in nsAlgorithm.h) for 1.437 + * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD 1.438 + * (0xEFBFBD in UTF-8). 1.439 + */ 1.440 +class ConvertUTF16toUTF8 1.441 + { 1.442 + public: 1.443 + typedef char16_t value_type; 1.444 + typedef char buffer_type; 1.445 + 1.446 + // The error handling here is more lenient than that in 1.447 + // |ConvertUTF8toUTF16|, but it's that way for backwards 1.448 + // compatibility. 1.449 + 1.450 + ConvertUTF16toUTF8( buffer_type* aBuffer ) 1.451 + : mStart(aBuffer), mBuffer(aBuffer) {} 1.452 + 1.453 + size_t Size() const { return mBuffer - mStart; } 1.454 + 1.455 + void write( const value_type* start, uint32_t N ) 1.456 + { 1.457 + buffer_type *out = mBuffer; // gcc isn't smart enough to do this! 1.458 + 1.459 + for (const value_type *p = start, *end = start + N; p < end; ++p ) 1.460 + { 1.461 + value_type c = *p; 1.462 + if (! (c & 0xFF80)) // U+0000 - U+007F 1.463 + { 1.464 + *out++ = (char)c; 1.465 + } 1.466 + else if (! 
(c & 0xF800)) // U+0100 - U+07FF 1.467 + { 1.468 + *out++ = 0xC0 | (char)(c >> 6); 1.469 + *out++ = 0x80 | (char)(0x003F & c); 1.470 + } 1.471 + else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF 1.472 + { 1.473 + *out++ = 0xE0 | (char)(c >> 12); 1.474 + *out++ = 0x80 | (char)(0x003F & (c >> 6)); 1.475 + *out++ = 0x80 | (char)(0x003F & c ); 1.476 + } 1.477 + else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF 1.478 + { 1.479 + // D800- DBFF - High Surrogate 1.480 + value_type h = c; 1.481 + 1.482 + ++p; 1.483 + if (p == end) 1.484 + { 1.485 + // Treat broken characters as the Unicode 1.486 + // replacement character 0xFFFD (0xEFBFBD in 1.487 + // UTF-8) 1.488 + *out++ = '\xEF'; 1.489 + *out++ = '\xBF'; 1.490 + *out++ = '\xBD'; 1.491 + 1.492 + NS_WARNING("String ending in half a surrogate pair!"); 1.493 + 1.494 + break; 1.495 + } 1.496 + c = *p; 1.497 + 1.498 + if (NS_IS_LOW_SURROGATE(c)) 1.499 + { 1.500 + // DC00- DFFF - Low Surrogate 1.501 + // N = (H - D800) *400 + 10000 + ( L - DC00 ) 1.502 + uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); 1.503 + 1.504 + // 0001 0000-001F FFFF 1.505 + *out++ = 0xF0 | (char)(ucs4 >> 18); 1.506 + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); 1.507 + *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); 1.508 + *out++ = 0x80 | (char)(0x003F & ucs4); 1.509 + } 1.510 + else 1.511 + { 1.512 + // Treat broken characters as the Unicode 1.513 + // replacement character 0xFFFD (0xEFBFBD in 1.514 + // UTF-8) 1.515 + *out++ = '\xEF'; 1.516 + *out++ = '\xBF'; 1.517 + *out++ = '\xBD'; 1.518 + 1.519 + // The pointer to the next character points to the second 1.520 + // 16-bit value, not beyond it, as per Unicode 5.0.0 1.521 + // Chapter 3 C10, only the first code unit of an illegal 1.522 + // sequence must be treated as an illegally terminated 1.523 + // code unit sequence (also Chapter 3 D91, "isolated [not 1.524 + // paired and ill-formed] UTF-16 code units in the range 1.525 + // D800..DFFF are ill-formed"). 
1.526 + p--; 1.527 + 1.528 + NS_WARNING("got a High Surrogate but no low surrogate"); 1.529 + } 1.530 + } 1.531 + else // U+DC00 - U+DFFF 1.532 + { 1.533 + // Treat broken characters as the Unicode replacement 1.534 + // character 0xFFFD (0xEFBFBD in UTF-8) 1.535 + *out++ = '\xEF'; 1.536 + *out++ = '\xBF'; 1.537 + *out++ = '\xBD'; 1.538 + 1.539 + // DC00- DFFF - Low Surrogate 1.540 + NS_WARNING("got a low Surrogate but no high surrogate"); 1.541 + } 1.542 + } 1.543 + 1.544 + mBuffer = out; 1.545 + } 1.546 + 1.547 + void write_terminator() 1.548 + { 1.549 + *mBuffer = buffer_type(0); 1.550 + } 1.551 + 1.552 + private: 1.553 + buffer_type* const mStart; 1.554 + buffer_type* mBuffer; 1.555 + }; 1.556 + 1.557 +/** 1.558 + * A character sink (see |copy_string| in nsAlgorithm.h) for computing 1.559 + * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid 1.560 + * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8). 1.561 + */ 1.562 +class CalculateUTF8Size 1.563 + { 1.564 + public: 1.565 + typedef char16_t value_type; 1.566 + 1.567 + CalculateUTF8Size() 1.568 + : mSize(0) { } 1.569 + 1.570 + size_t Size() const { return mSize; } 1.571 + 1.572 + void write( const value_type* start, uint32_t N ) 1.573 + { 1.574 + // Assume UCS2 surrogate pairs won't be spread across fragments. 1.575 + for (const value_type *p = start, *end = start + N; p < end; ++p ) 1.576 + { 1.577 + value_type c = *p; 1.578 + if (! (c & 0xFF80)) // U+0000 - U+007F 1.579 + mSize += 1; 1.580 + else if (! 
(c & 0xF800)) // U+0100 - U+07FF 1.581 + mSize += 2; 1.582 + else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF 1.583 + mSize += 3; 1.584 + else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF 1.585 + { 1.586 + ++p; 1.587 + if (p == end) 1.588 + { 1.589 + // Treat broken characters as the Unicode 1.590 + // replacement character 0xFFFD (0xEFBFBD in 1.591 + // UTF-8) 1.592 + mSize += 3; 1.593 + 1.594 + NS_WARNING("String ending in half a surrogate pair!"); 1.595 + 1.596 + break; 1.597 + } 1.598 + c = *p; 1.599 + 1.600 + if (0xDC00 == (0xFC00 & c)) 1.601 + mSize += 4; 1.602 + else 1.603 + { 1.604 + // Treat broken characters as the Unicode 1.605 + // replacement character 0xFFFD (0xEFBFBD in 1.606 + // UTF-8) 1.607 + mSize += 3; 1.608 + 1.609 + // The next code unit is the second 16-bit value, not 1.610 + // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10, 1.611 + // only the first code unit of an illegal sequence must 1.612 + // be treated as an illegally terminated code unit 1.613 + // sequence (also Chapter 3 D91, "isolated [not paired and 1.614 + // ill-formed] UTF-16 code units in the range D800..DFFF 1.615 + // are ill-formed"). 1.616 + p--; 1.617 + 1.618 + NS_WARNING("got a high Surrogate but no low surrogate"); 1.619 + } 1.620 + } 1.621 + else // U+DC00 - U+DFFF 1.622 + { 1.623 + // Treat broken characters as the Unicode replacement 1.624 + // character 0xFFFD (0xEFBFBD in UTF-8) 1.625 + mSize += 3; 1.626 + 1.627 + NS_WARNING("got a low Surrogate but no high surrogate"); 1.628 + } 1.629 + } 1.630 + } 1.631 + 1.632 + private: 1.633 + size_t mSize; 1.634 + }; 1.635 + 1.636 +#ifdef MOZILLA_INTERNAL_API 1.637 +/** 1.638 + * A character sink that performs a |reinterpret_cast|-style conversion 1.639 + * from char to char16_t. 
/* LossyConvertEncoding8to16: widens 8-bit code units to char16_t. */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

public:
  // |aDestination| is where the first converted unit will be stored; the
  // class does no bounds checking on it.
  LossyConvertEncoding8to16( char16_t* aDestination )
    : mDestination(aDestination)
  {
  }

  /**
   * Appends |aSourceLength| units from |aSource| to the destination,
   * zero-extending each byte.  Hands the work to write_sse2() when the
   * build may use SSE2 and the CPU reports support for it.
   */
  void
  write( const char* aSource, uint32_t aSourceLength )
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (uint32_t i = 0; i < aSourceLength; ++i)
    {
      // Go through unsigned char so bytes >= 0x80 are not sign-extended.
      const unsigned char unit = static_cast<unsigned char>(aSource[i]);
      *mDestination++ = static_cast<char16_t>(unit);
    }
  }

  // Declared here, defined outside this header.
  void
  write_sse2( const char* aSource, uint32_t aSourceLength );

  // Stores a terminating 0 at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = static_cast<char16_t>(0);
  }

private:
  char16_t* mDestination;
};

/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char16_t to char: no transcoding is attempted.  In the scalar path
 * each code unit is simply cast to char.
 */
1.683 + */ 1.684 +class LossyConvertEncoding16to8 1.685 + { 1.686 + public: 1.687 + typedef char16_t value_type; 1.688 + typedef char16_t input_type; 1.689 + typedef char output_type; 1.690 + 1.691 + LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { } 1.692 + 1.693 + void 1.694 + write( const char16_t* aSource, uint32_t aSourceLength) 1.695 + { 1.696 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.697 + if (mozilla::supports_sse2()) 1.698 + { 1.699 + write_sse2(aSource, aSourceLength); 1.700 + return; 1.701 + } 1.702 +#endif 1.703 + const char16_t* done_writing = aSource + aSourceLength; 1.704 + while ( aSource < done_writing ) 1.705 + *mDestination++ = (char)(*aSource++); 1.706 + } 1.707 + 1.708 +#ifdef MOZILLA_MAY_SUPPORT_SSE2 1.709 + void 1.710 + write_sse2( const char16_t* aSource, uint32_t aSourceLength ); 1.711 +#endif 1.712 + 1.713 + void 1.714 + write_terminator() 1.715 + { 1.716 + *mDestination = '\0'; 1.717 + } 1.718 + 1.719 + private: 1.720 + char *mDestination; 1.721 + }; 1.722 +#endif // MOZILLA_INTERNAL_API 1.723 + 1.724 +#endif /* !defined(nsUTF8Utils_h_) */