xpcom/string/public/nsUTF8Utils.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/xpcom/string/public/nsUTF8Utils.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,721 @@
     1.4 +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
     1.5 +/* This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +#ifndef nsUTF8Utils_h_
     1.9 +#define nsUTF8Utils_h_
    1.10 +
    1.11 +// This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
    1.12 +// file will provide signatures for the Mozilla abstract string types. It will
    1.13 +// use XPCOM assertion/debugging macros, etc.
    1.14 +
    1.15 +#include "nscore.h"
    1.16 +#include "mozilla/SSE.h"
    1.17 +
    1.18 +#include "nsCharTraits.h"
    1.19 +
    1.20 +class UTF8traits
    1.21 +  {
    1.22 +    public:
    1.23 +      static bool isASCII(char c) { return (c & 0x80) == 0x00; }
    1.24 +      static bool isInSeq(char c) { return (c & 0xC0) == 0x80; }
    1.25 +      static bool is2byte(char c) { return (c & 0xE0) == 0xC0; }
    1.26 +      static bool is3byte(char c) { return (c & 0xF0) == 0xE0; }
    1.27 +      static bool is4byte(char c) { return (c & 0xF8) == 0xF0; }
    1.28 +      static bool is5byte(char c) { return (c & 0xFC) == 0xF8; }
    1.29 +      static bool is6byte(char c) { return (c & 0xFE) == 0xFC; }
    1.30 +  };
    1.31 +
/**
 * Extract the next UCS-4 character from the buffer and return it.  On
 * success the pointer passed in is advanced to the start of the next
 * character in the buffer.  The err parameter must be non-null; it is set
 * to true if an error occurred (in which case 0 is returned and the pointer
 * is left unchanged) and to false otherwise.  Overlong sequences, surrogates
 * and code points outside the Unicode range are returned as the replacement
 * character and are not flagged as errors.
 */
    1.39 +
    1.40 +class UTF8CharEnumerator
    1.41 +{
    1.42 +public:
    1.43 +  static uint32_t NextChar(const char **buffer, const char *end,
    1.44 +                           bool *err)
    1.45 +  {
    1.46 +    NS_ASSERTION(buffer && *buffer, "null buffer!");
    1.47 +
    1.48 +    const char *p = *buffer;
    1.49 +    *err = false;
    1.50 +
    1.51 +    if (p >= end)
    1.52 +      {
    1.53 +        *err = true;
    1.54 +
    1.55 +        return 0;
    1.56 +      }
    1.57 +
    1.58 +    char c = *p++;
    1.59 +
    1.60 +    if ( UTF8traits::isASCII(c) )
    1.61 +      {
    1.62 +        *buffer = p;
    1.63 +        return c;
    1.64 +      }
    1.65 +
    1.66 +    uint32_t ucs4;
    1.67 +    uint32_t minUcs4;
    1.68 +    int32_t state = 0;
    1.69 +
    1.70 +    if (!CalcState(c, ucs4, minUcs4, state)) {
    1.71 +        NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
    1.72 +        *err = true;
    1.73 +
    1.74 +        return 0;
    1.75 +    }
    1.76 +
    1.77 +    while ( state-- )
    1.78 +      {
    1.79 +        if (p == end)
    1.80 +          {
    1.81 +            *err = true;
    1.82 +
    1.83 +            return 0;
    1.84 +          }
    1.85 +
    1.86 +        c = *p++;
    1.87 +
    1.88 +        if (!AddByte(c, state, ucs4))
    1.89 +          {
    1.90 +            *err = true;
    1.91 +
    1.92 +            return 0;
    1.93 +          }
    1.94 +      }
    1.95 +
    1.96 +      if ( ucs4 < minUcs4 )
    1.97 +        {
    1.98 +          // Overlong sequence
    1.99 +          ucs4 = UCS2_REPLACEMENT_CHAR;
   1.100 +        }
   1.101 +      else if ( ucs4 >= 0xD800 &&
   1.102 +                (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
   1.103 +        {
   1.104 +          // Surrogates and code points outside the Unicode range.
   1.105 +          ucs4 = UCS2_REPLACEMENT_CHAR;
   1.106 +        }
   1.107 +
   1.108 +    *buffer = p;
   1.109 +    return ucs4;
   1.110 +  }
   1.111 +
   1.112 +private:
   1.113 +  static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
   1.114 +                          int32_t& state)
   1.115 +  {
   1.116 +    if ( UTF8traits::is2byte(c) )
   1.117 +      {
   1.118 +        ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
   1.119 +        state = 1;
   1.120 +        minUcs4 = 0x00000080;
   1.121 +      }
   1.122 +    else if ( UTF8traits::is3byte(c) )
   1.123 +      {
   1.124 +        ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
   1.125 +        state = 2;
   1.126 +        minUcs4 = 0x00000800;
   1.127 +      }
   1.128 +    else if ( UTF8traits::is4byte(c) )
   1.129 +      {
   1.130 +        ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
   1.131 +        state = 3;
   1.132 +        minUcs4 = 0x00010000;
   1.133 +      }
   1.134 +    else if ( UTF8traits::is5byte(c) )
   1.135 +      {
   1.136 +        ucs4 = (uint32_t(c) << 24) & 0x03000000L;
   1.137 +        state = 4;
   1.138 +        minUcs4 = 0x00200000;
   1.139 +      }
   1.140 +    else if ( UTF8traits::is6byte(c) )
   1.141 +      {
   1.142 +        ucs4 = (uint32_t(c) << 30) & 0x40000000L;
   1.143 +        state = 5;
   1.144 +        minUcs4 = 0x04000000;
   1.145 +      }
   1.146 +    else
   1.147 +      {
   1.148 +        return false;
   1.149 +      }
   1.150 +
   1.151 +    return true;
   1.152 +  }
   1.153 +
   1.154 +  static bool AddByte(char c, int32_t state, uint32_t& ucs4)
   1.155 +  {
   1.156 +    if ( UTF8traits::isInSeq(c) )
   1.157 +      {
   1.158 +        int32_t shift = state * 6;
   1.159 +        ucs4 |= (uint32_t(c) & 0x3F) << shift;
   1.160 +        return true;
   1.161 +      }
   1.162 +
   1.163 +    return false;
   1.164 +  }
   1.165 +};
   1.166 +
   1.167 +
/**
 * Extract the next UCS-4 character from the buffer and return it.  The
 * pointer passed in is advanced to the start of the next character in the
 * buffer.  If non-null, the err parameter is set to true if an error occurs
 * and to false otherwise.
 */
   1.173 +
   1.174 +
   1.175 +class UTF16CharEnumerator
   1.176 +{
   1.177 +public:
   1.178 +  static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
   1.179 +                           bool *err = nullptr)
   1.180 +  {
   1.181 +    NS_ASSERTION(buffer && *buffer, "null buffer!");
   1.182 +
   1.183 +    const char16_t *p = *buffer;
   1.184 +
   1.185 +    if (p >= end)
   1.186 +      {
   1.187 +        NS_ERROR("No input to work with");
   1.188 +        if (err)
   1.189 +          *err = true;
   1.190 +
   1.191 +        return 0;
   1.192 +      }
   1.193 +
   1.194 +    char16_t c = *p++;
   1.195 +
   1.196 +    if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
   1.197 +      {
   1.198 +        if (err)
   1.199 +          *err = false;
   1.200 +        *buffer = p;
   1.201 +        return c;
   1.202 +      }
   1.203 +    else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
   1.204 +      {
   1.205 +        if (p == end)
   1.206 +          {
   1.207 +            // Found a high surrogate the end of the buffer. Flag this
   1.208 +            // as an error and return the Unicode replacement
   1.209 +            // character 0xFFFD.
   1.210 +
   1.211 +            NS_WARNING("Unexpected end of buffer after high surrogate");
   1.212 +
   1.213 +            if (err)
   1.214 +              *err = true;
   1.215 +            *buffer = p;
   1.216 +            return 0xFFFD;
   1.217 +          }
   1.218 +
   1.219 +        // D800- DBFF - High Surrogate
   1.220 +        char16_t h = c;
   1.221 +
   1.222 +        c = *p++;
   1.223 +
   1.224 +        if (NS_IS_LOW_SURROGATE(c))
   1.225 +          {
   1.226 +            // DC00- DFFF - Low Surrogate
   1.227 +            // N = (H - D800) *400 + 10000 + (L - DC00)
   1.228 +            uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
   1.229 +            if (err)
   1.230 +              *err = false;
   1.231 +            *buffer = p;
   1.232 +            return ucs4;
   1.233 +          }
   1.234 +        else
   1.235 +          {
   1.236 +            // Found a high surrogate followed by something other than
   1.237 +            // a low surrogate. Flag this as an error and return the
   1.238 +            // Unicode replacement character 0xFFFD.  Note that the
   1.239 +            // pointer to the next character points to the second 16-bit
   1.240 +            // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
   1.241 +            // only the first code unit of an illegal sequence must be
   1.242 +            // treated as an illegally terminated code unit sequence
   1.243 +            // (also Chapter 3 D91, "isolated [not paired and ill-formed]
   1.244 +            // UTF-16 code units in the range D800..DFFF are ill-formed").
   1.245 +            NS_WARNING("got a High Surrogate but no low surrogate");
   1.246 +
   1.247 +            if (err)
   1.248 +              *err = true;
   1.249 +            *buffer = p - 1;
   1.250 +            return 0xFFFD;
   1.251 +          }
   1.252 +      }
   1.253 +    else // U+DC00 - U+DFFF
   1.254 +      {
   1.255 +        // DC00- DFFF - Low Surrogate
   1.256 +
   1.257 +        // Found a low surrogate w/o a preceding high surrogate. Flag
   1.258 +        // this as an error and return the Unicode replacement
   1.259 +        // character 0xFFFD.
   1.260 +
   1.261 +        NS_WARNING("got a low Surrogate but no high surrogate");
   1.262 +        if (err)
   1.263 +          *err = true;
   1.264 +        *buffer = p;
   1.265 +        return 0xFFFD;
   1.266 +      }
   1.267 +
   1.268 +    if (err)
   1.269 +      *err = true;
   1.270 +    return 0;
   1.271 +  }
   1.272 +};
   1.273 +
   1.274 +
   1.275 +/**
   1.276 + * A character sink (see |copy_string| in nsAlgorithm.h) for converting
   1.277 + * UTF-8 to UTF-16
   1.278 + */
   1.279 +class ConvertUTF8toUTF16
   1.280 +  {
   1.281 +    public:
   1.282 +      typedef char      value_type;
   1.283 +      typedef char16_t buffer_type;
   1.284 +
   1.285 +    ConvertUTF8toUTF16( buffer_type* aBuffer )
   1.286 +        : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
   1.287 +
   1.288 +    size_t Length() const { return mBuffer - mStart; }
   1.289 +
   1.290 +    bool ErrorEncountered() const { return mErrorEncountered; }
   1.291 +
   1.292 +    void write( const value_type* start, uint32_t N )
   1.293 +      {
   1.294 +        if ( mErrorEncountered )
   1.295 +          return;
   1.296 +
   1.297 +        // algorithm assumes utf8 units won't
   1.298 +        // be spread across fragments
   1.299 +        const value_type* p = start;
   1.300 +        const value_type* end = start + N;
   1.301 +        buffer_type* out = mBuffer;
   1.302 +        for ( ; p != end /* && *p */; )
   1.303 +          {
   1.304 +            bool err;
   1.305 +            uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
   1.306 +
   1.307 +            if ( err )
   1.308 +              {
   1.309 +                mErrorEncountered = true;
   1.310 +                mBuffer = out;
   1.311 +                return;
   1.312 +              }
   1.313 +
   1.314 +            if ( ucs4 >= PLANE1_BASE )
   1.315 +              {
   1.316 +                *out++ = (buffer_type)H_SURROGATE(ucs4);
   1.317 +                *out++ = (buffer_type)L_SURROGATE(ucs4);
   1.318 +              }
   1.319 +            else
   1.320 +              {
   1.321 +                *out++ = ucs4;
   1.322 +              }
   1.323 +          }
   1.324 +        mBuffer = out;
   1.325 +      }
   1.326 +
   1.327 +    void write_terminator()
   1.328 +      {
   1.329 +        *mBuffer = buffer_type(0);
   1.330 +      }
   1.331 +
   1.332 +    private:
   1.333 +      buffer_type* const mStart;
   1.334 +      buffer_type* mBuffer;
   1.335 +      bool mErrorEncountered;
   1.336 +  };
   1.337 +
   1.338 +/**
   1.339 + * A character sink (see |copy_string| in nsAlgorithm.h) for computing
   1.340 + * the length of the UTF-16 string equivalent to a UTF-8 string.
   1.341 + */
   1.342 +class CalculateUTF8Length
   1.343 +  {
   1.344 +    public:
   1.345 +      typedef char value_type;
   1.346 +
   1.347 +    CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
   1.348 +
   1.349 +    size_t Length() const { return mLength; }
   1.350 +
   1.351 +    void write( const value_type* start, uint32_t N )
   1.352 +      {
   1.353 +          // ignore any further requests
   1.354 +        if ( mErrorEncountered )
   1.355 +            return;
   1.356 +
   1.357 +        // algorithm assumes utf8 units won't
   1.358 +        // be spread across fragments
   1.359 +        const value_type* p = start;
   1.360 +        const value_type* end = start + N;
   1.361 +        for ( ; p < end /* && *p */; ++mLength )
   1.362 +          {
   1.363 +            if ( UTF8traits::isASCII(*p) )
   1.364 +                p += 1;
   1.365 +            else if ( UTF8traits::is2byte(*p) )
   1.366 +                p += 2;
   1.367 +            else if ( UTF8traits::is3byte(*p) )
   1.368 +                p += 3;
   1.369 +            else if ( UTF8traits::is4byte(*p) ) {
   1.370 +                // Because a UTF-8 sequence of 4 bytes represents a codepoint
   1.371 +                // greater than 0xFFFF, it will become a surrogate pair in the
   1.372 +                // UTF-16 string, so add 1 more to mLength.
   1.373 +                // This doesn't happen with is5byte and is6byte because they
   1.374 +                // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
   1.375 +                // converted to a single replacement character.
   1.376 +
   1.377 +                // However, there is one case when a 4 byte UTF-8 sequence will
   1.378 +                // only generate 2 UTF-16 bytes. If we have a properly encoded
   1.379 +                // sequence, but with an invalid value (too small or too big),
   1.380 +                // that will result in a replacement character being written
   1.381 +                // This replacement character is encoded as just 1 single
   1.382 +                // UTF-16 character, which is 2 bytes.
   1.383 +
   1.384 +                // The below code therefore only adds 1 to mLength if the UTF8
   1.385 +                // data will produce a decoded character which is greater than
   1.386 +                // or equal to 0x010000 and less than 0x0110000.
   1.387 +
   1.388 +                // A 4byte UTF8 character is encoded as
   1.389 +                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   1.390 +                // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
   1.391 +                // map to bit 17-21 in the final result. If these bits are
   1.392 +                // between 0x01 and 0x11, that means that the final result is
   1.393 +                // between 0x010000 and 0x110000. The below code reads these
   1.394 +                // bits out and assigns them to c, but shifted up 4 bits to
   1.395 +                // avoid having to shift twice.
   1.396 +
   1.397 +                // It doesn't matter what to do in the case where p + 4 > end
   1.398 +                // since no UTF16 characters will be written in that case by
   1.399 +                // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
   1.400 +                // any of the surrogate bits are wrong since no UTF16
   1.401 +                // characters will be written in that case either.
   1.402 +
   1.403 +                if (p + 4 <= end) {
   1.404 +                  uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
   1.405 +                               ((uint32_t)(p[1] & 0x30));
   1.406 +                  if (c >= 0x010 && c < 0x110)
   1.407 +                    ++mLength;
   1.408 +                }
   1.409 +
   1.410 +                p += 4;
   1.411 +            }
   1.412 +            else if ( UTF8traits::is5byte(*p) )
   1.413 +                p += 5;
   1.414 +            else if ( UTF8traits::is6byte(*p) )
   1.415 +                p += 6;
   1.416 +            else // error
   1.417 +              {
   1.418 +                ++mLength; // to account for the decrement below
   1.419 +                break;
   1.420 +              }
   1.421 +          }
   1.422 +        if ( p != end )
   1.423 +          {
   1.424 +            NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
   1.425 +            --mLength; // The last multi-byte char wasn't complete, discard it.
   1.426 +            mErrorEncountered = true;
   1.427 +          }
   1.428 +      }
   1.429 +
   1.430 +    private:
   1.431 +      size_t mLength;
   1.432 +      bool mErrorEncountered;
   1.433 +  };
   1.434 +
   1.435 +/**
   1.436 + * A character sink (see |copy_string| in nsAlgorithm.h) for
   1.437 + * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
   1.438 + * (0xEFBFBD in UTF-8).
   1.439 + */
   1.440 +class ConvertUTF16toUTF8
   1.441 +  {
   1.442 +    public:
   1.443 +      typedef char16_t value_type;
   1.444 +      typedef char      buffer_type;
   1.445 +
   1.446 +    // The error handling here is more lenient than that in
   1.447 +    // |ConvertUTF8toUTF16|, but it's that way for backwards
   1.448 +    // compatibility.
   1.449 +
   1.450 +    ConvertUTF16toUTF8( buffer_type* aBuffer )
   1.451 +        : mStart(aBuffer), mBuffer(aBuffer) {}
   1.452 +
   1.453 +    size_t Size() const { return mBuffer - mStart; }
   1.454 +
   1.455 +    void write( const value_type* start, uint32_t N )
   1.456 +      {
   1.457 +        buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
   1.458 +
   1.459 +        for (const value_type *p = start, *end = start + N; p < end; ++p )
   1.460 +          {
   1.461 +            value_type c = *p;
   1.462 +            if (! (c & 0xFF80)) // U+0000 - U+007F
   1.463 +              {
   1.464 +                *out++ = (char)c;
   1.465 +              }
   1.466 +            else if (! (c & 0xF800)) // U+0100 - U+07FF
   1.467 +              {
   1.468 +                *out++ = 0xC0 | (char)(c >> 6);
   1.469 +                *out++ = 0x80 | (char)(0x003F & c);
   1.470 +              }
   1.471 +            else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
   1.472 +              {
   1.473 +                *out++ = 0xE0 | (char)(c >> 12);
   1.474 +                *out++ = 0x80 | (char)(0x003F & (c >> 6));
   1.475 +                *out++ = 0x80 | (char)(0x003F & c );
   1.476 +              }
   1.477 +            else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
   1.478 +              {
   1.479 +                // D800- DBFF - High Surrogate
   1.480 +                value_type h = c;
   1.481 +
   1.482 +                ++p;
   1.483 +                if (p == end)
   1.484 +                  {
   1.485 +                    // Treat broken characters as the Unicode
   1.486 +                    // replacement character 0xFFFD (0xEFBFBD in
   1.487 +                    // UTF-8)
   1.488 +                    *out++ = '\xEF';
   1.489 +                    *out++ = '\xBF';
   1.490 +                    *out++ = '\xBD';
   1.491 +
   1.492 +                    NS_WARNING("String ending in half a surrogate pair!");
   1.493 +
   1.494 +                    break;
   1.495 +                  }
   1.496 +                c = *p;
   1.497 +
   1.498 +                if (NS_IS_LOW_SURROGATE(c))
   1.499 +                  {
   1.500 +                    // DC00- DFFF - Low Surrogate
   1.501 +                    // N = (H - D800) *400 + 10000 + ( L - DC00 )
   1.502 +                    uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
   1.503 +
   1.504 +                    // 0001 0000-001F FFFF
   1.505 +                    *out++ = 0xF0 | (char)(ucs4 >> 18);
   1.506 +                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
   1.507 +                    *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
   1.508 +                    *out++ = 0x80 | (char)(0x003F & ucs4);
   1.509 +                  }
   1.510 +                else
   1.511 +                  {
   1.512 +                    // Treat broken characters as the Unicode
   1.513 +                    // replacement character 0xFFFD (0xEFBFBD in
   1.514 +                    // UTF-8)
   1.515 +                    *out++ = '\xEF';
   1.516 +                    *out++ = '\xBF';
   1.517 +                    *out++ = '\xBD';
   1.518 +
   1.519 +                    // The pointer to the next character points to the second
   1.520 +                    // 16-bit value, not beyond it, as per Unicode 5.0.0
   1.521 +                    // Chapter 3 C10, only the first code unit of an illegal
   1.522 +                    // sequence must be treated as an illegally terminated
   1.523 +                    // code unit sequence (also Chapter 3 D91, "isolated [not
   1.524 +                    // paired and ill-formed] UTF-16 code units in the range
   1.525 +                    // D800..DFFF are ill-formed").
   1.526 +                    p--;
   1.527 +
   1.528 +                    NS_WARNING("got a High Surrogate but no low surrogate");
   1.529 +                  }
   1.530 +              }
   1.531 +            else // U+DC00 - U+DFFF
   1.532 +              {
   1.533 +                // Treat broken characters as the Unicode replacement
   1.534 +                // character 0xFFFD (0xEFBFBD in UTF-8)
   1.535 +                *out++ = '\xEF';
   1.536 +                *out++ = '\xBF';
   1.537 +                *out++ = '\xBD';
   1.538 +
   1.539 +                // DC00- DFFF - Low Surrogate
   1.540 +                NS_WARNING("got a low Surrogate but no high surrogate");
   1.541 +              }
   1.542 +          }
   1.543 +
   1.544 +        mBuffer = out;
   1.545 +      }
   1.546 +
   1.547 +    void write_terminator()
   1.548 +      {
   1.549 +        *mBuffer = buffer_type(0);
   1.550 +      }
   1.551 +
   1.552 +    private:
   1.553 +      buffer_type* const mStart;
   1.554 +      buffer_type* mBuffer;
   1.555 +  };
   1.556 +
/**
 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
 * the number of bytes a UTF-16 string would occupy in UTF-8. Treats
 * invalid UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
 */
   1.562 +class CalculateUTF8Size
   1.563 +  {
   1.564 +    public:
   1.565 +      typedef char16_t value_type;
   1.566 +
   1.567 +    CalculateUTF8Size()
   1.568 +      : mSize(0) { }
   1.569 +
   1.570 +    size_t Size() const { return mSize; }
   1.571 +
   1.572 +    void write( const value_type* start, uint32_t N )
   1.573 +      {
   1.574 +        // Assume UCS2 surrogate pairs won't be spread across fragments.
   1.575 +        for (const value_type *p = start, *end = start + N; p < end; ++p )
   1.576 +          {
   1.577 +            value_type c = *p;
   1.578 +            if (! (c & 0xFF80)) // U+0000 - U+007F
   1.579 +              mSize += 1;
   1.580 +            else if (! (c & 0xF800)) // U+0100 - U+07FF
   1.581 +              mSize += 2;
   1.582 +            else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
   1.583 +              mSize += 3;
   1.584 +            else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
   1.585 +              {
   1.586 +                ++p;
   1.587 +                if (p == end)
   1.588 +                  {
   1.589 +                    // Treat broken characters as the Unicode
   1.590 +                    // replacement character 0xFFFD (0xEFBFBD in
   1.591 +                    // UTF-8)
   1.592 +                    mSize += 3;
   1.593 +
   1.594 +                    NS_WARNING("String ending in half a surrogate pair!");
   1.595 +
   1.596 +                    break;
   1.597 +                  }
   1.598 +                c = *p;
   1.599 +
   1.600 +                if (0xDC00 == (0xFC00 & c))
   1.601 +                  mSize += 4;
   1.602 +                else
   1.603 +                  {
   1.604 +                    // Treat broken characters as the Unicode
   1.605 +                    // replacement character 0xFFFD (0xEFBFBD in
   1.606 +                    // UTF-8)
   1.607 +                    mSize += 3;
   1.608 +
   1.609 +                    // The next code unit is the second 16-bit value, not
   1.610 +                    // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
   1.611 +                    // only the first code unit of an illegal sequence must
   1.612 +                    // be treated as an illegally terminated code unit
   1.613 +                    // sequence (also Chapter 3 D91, "isolated [not paired and
   1.614 +                    // ill-formed] UTF-16 code units in the range D800..DFFF
   1.615 +                    // are ill-formed").
   1.616 +                    p--;
   1.617 +
   1.618 +                    NS_WARNING("got a high Surrogate but no low surrogate");
   1.619 +                  }
   1.620 +              }
   1.621 +            else // U+DC00 - U+DFFF
   1.622 +              {
   1.623 +                // Treat broken characters as the Unicode replacement
   1.624 +                // character 0xFFFD (0xEFBFBD in UTF-8)
   1.625 +                mSize += 3;
   1.626 +
   1.627 +                NS_WARNING("got a low Surrogate but no high surrogate");
   1.628 +              }
   1.629 +          }
   1.630 +      }
   1.631 +
   1.632 +    private:
   1.633 +      size_t mSize;
   1.634 +  };
   1.635 +
   1.636 +#ifdef MOZILLA_INTERNAL_API
   1.637 +/**
   1.638 + * A character sink that performs a |reinterpret_cast|-style conversion
   1.639 + * from char to char16_t.
   1.640 + */
   1.641 +class LossyConvertEncoding8to16
   1.642 +  {
   1.643 +    public:
   1.644 +      typedef char      value_type;
   1.645 +      typedef char      input_type;
   1.646 +      typedef char16_t output_type;
   1.647 +
   1.648 +    public:
   1.649 +      LossyConvertEncoding8to16( char16_t* aDestination ) :
   1.650 +        mDestination(aDestination) { }
   1.651 +
   1.652 +      void
   1.653 +      write( const char* aSource, uint32_t aSourceLength )
   1.654 +        {
   1.655 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
   1.656 +          if (mozilla::supports_sse2())
   1.657 +            {
   1.658 +              write_sse2(aSource, aSourceLength);
   1.659 +              return;
   1.660 +            }
   1.661 +#endif
   1.662 +          const char* done_writing = aSource + aSourceLength;
   1.663 +          while ( aSource < done_writing )
   1.664 +            *mDestination++ = (char16_t)(unsigned char)(*aSource++);
   1.665 +        }
   1.666 +
   1.667 +      void
   1.668 +      write_sse2( const char* aSource, uint32_t aSourceLength );
   1.669 +
   1.670 +      void
   1.671 +      write_terminator()
   1.672 +        {
   1.673 +          *mDestination = (char16_t)(0);
   1.674 +        }
   1.675 +
   1.676 +    private:
   1.677 +      char16_t* mDestination;
   1.678 +  };
   1.679 +
   1.680 +/**
   1.681 + * A character sink that performs a |reinterpret_cast|-style conversion
   1.682 + * from char16_t to char.
   1.683 + */
   1.684 +class LossyConvertEncoding16to8
   1.685 +  {
   1.686 +    public:
   1.687 +      typedef char16_t value_type;
   1.688 +      typedef char16_t input_type;
   1.689 +      typedef char      output_type;
   1.690 +
   1.691 +      LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }
   1.692 +
   1.693 +      void
   1.694 +      write( const char16_t* aSource, uint32_t aSourceLength)
   1.695 +        {
   1.696 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
   1.697 +          if (mozilla::supports_sse2())
   1.698 +            {
   1.699 +              write_sse2(aSource, aSourceLength);
   1.700 +              return;
   1.701 +            }
   1.702 +#endif
   1.703 +            const char16_t* done_writing = aSource + aSourceLength;
   1.704 +            while ( aSource < done_writing )
   1.705 +              *mDestination++ = (char)(*aSource++);
   1.706 +        }
   1.707 +
   1.708 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
   1.709 +      void
   1.710 +      write_sse2( const char16_t* aSource, uint32_t aSourceLength );
   1.711 +#endif
   1.712 +
   1.713 +      void
   1.714 +      write_terminator()
   1.715 +        {
   1.716 +          *mDestination = '\0';
   1.717 +        }
   1.718 +
   1.719 +    private:
   1.720 +      char *mDestination;
   1.721 +  };
   1.722 +#endif // MOZILLA_INTERNAL_API
   1.723 +
   1.724 +#endif /* !defined(nsUTF8Utils_h_) */

mercurial