xpcom/string/public/nsUTF8Utils.h

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
michael@0 2 /* This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5 #ifndef nsUTF8Utils_h_
michael@0 6 #define nsUTF8Utils_h_
michael@0 7
michael@0 8 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
michael@0 9 // file will provide signatures for the Mozilla abstract string types. It will
michael@0 10 // use XPCOM assertion/debugging macros, etc.
michael@0 11
michael@0 12 #include "nscore.h"
michael@0 13 #include "mozilla/SSE.h"
michael@0 14
michael@0 15 #include "nsCharTraits.h"
michael@0 16
/**
 * Bit-pattern predicates that classify a single byte of a UTF-8 stream.
 *
 * Every test masks the high bits of |c| and compares them with the marker of
 * one byte class.  Only the masked bits take part in the comparison, so the
 * answer does not depend on whether plain |char| is signed or unsigned.
 */
class UTF8traits
{
public:
  // 0xxxxxxx: a complete one-byte (ASCII) sequence.
  static bool isASCII(char c) { return (c & 0x80) == 0x00; }
  // 10xxxxxx: a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char c) { return (c & 0xC0) == 0x80; }
  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char c) { return (c & 0xE0) == 0xC0; }
  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char c) { return (c & 0xF0) == 0xE0; }
  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char c) { return (c & 0xF8) == 0xF0; }
  // 111110xx: lead byte of a five-byte sequence.
  static bool is5byte(char c) { return (c & 0xFC) == 0xF8; }
  // 1111110x: lead byte of a six-byte sequence.
  static bool is6byte(char c) { return (c & 0xFE) == 0xFC; }
};
michael@0 28
michael@0 29 /**
michael@0 30 * Extract the next UCS-4 character from the buffer and return it. The
michael@0 31 * pointer passed in is advanced to the start of the next character in the
michael@0 32 * buffer. If non-null, the parameters err and overlong are filled in to
michael@0 33 * indicate that the character was represented by an overlong sequence, or
michael@0 34 * that an error occurred.
michael@0 35 */
michael@0 36
michael@0 37 class UTF8CharEnumerator
michael@0 38 {
michael@0 39 public:
michael@0 40 static uint32_t NextChar(const char **buffer, const char *end,
michael@0 41 bool *err)
michael@0 42 {
michael@0 43 NS_ASSERTION(buffer && *buffer, "null buffer!");
michael@0 44
michael@0 45 const char *p = *buffer;
michael@0 46 *err = false;
michael@0 47
michael@0 48 if (p >= end)
michael@0 49 {
michael@0 50 *err = true;
michael@0 51
michael@0 52 return 0;
michael@0 53 }
michael@0 54
michael@0 55 char c = *p++;
michael@0 56
michael@0 57 if ( UTF8traits::isASCII(c) )
michael@0 58 {
michael@0 59 *buffer = p;
michael@0 60 return c;
michael@0 61 }
michael@0 62
michael@0 63 uint32_t ucs4;
michael@0 64 uint32_t minUcs4;
michael@0 65 int32_t state = 0;
michael@0 66
michael@0 67 if (!CalcState(c, ucs4, minUcs4, state)) {
michael@0 68 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
michael@0 69 *err = true;
michael@0 70
michael@0 71 return 0;
michael@0 72 }
michael@0 73
michael@0 74 while ( state-- )
michael@0 75 {
michael@0 76 if (p == end)
michael@0 77 {
michael@0 78 *err = true;
michael@0 79
michael@0 80 return 0;
michael@0 81 }
michael@0 82
michael@0 83 c = *p++;
michael@0 84
michael@0 85 if (!AddByte(c, state, ucs4))
michael@0 86 {
michael@0 87 *err = true;
michael@0 88
michael@0 89 return 0;
michael@0 90 }
michael@0 91 }
michael@0 92
michael@0 93 if ( ucs4 < minUcs4 )
michael@0 94 {
michael@0 95 // Overlong sequence
michael@0 96 ucs4 = UCS2_REPLACEMENT_CHAR;
michael@0 97 }
michael@0 98 else if ( ucs4 >= 0xD800 &&
michael@0 99 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
michael@0 100 {
michael@0 101 // Surrogates and code points outside the Unicode range.
michael@0 102 ucs4 = UCS2_REPLACEMENT_CHAR;
michael@0 103 }
michael@0 104
michael@0 105 *buffer = p;
michael@0 106 return ucs4;
michael@0 107 }
michael@0 108
michael@0 109 private:
michael@0 110 static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
michael@0 111 int32_t& state)
michael@0 112 {
michael@0 113 if ( UTF8traits::is2byte(c) )
michael@0 114 {
michael@0 115 ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
michael@0 116 state = 1;
michael@0 117 minUcs4 = 0x00000080;
michael@0 118 }
michael@0 119 else if ( UTF8traits::is3byte(c) )
michael@0 120 {
michael@0 121 ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
michael@0 122 state = 2;
michael@0 123 minUcs4 = 0x00000800;
michael@0 124 }
michael@0 125 else if ( UTF8traits::is4byte(c) )
michael@0 126 {
michael@0 127 ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
michael@0 128 state = 3;
michael@0 129 minUcs4 = 0x00010000;
michael@0 130 }
michael@0 131 else if ( UTF8traits::is5byte(c) )
michael@0 132 {
michael@0 133 ucs4 = (uint32_t(c) << 24) & 0x03000000L;
michael@0 134 state = 4;
michael@0 135 minUcs4 = 0x00200000;
michael@0 136 }
michael@0 137 else if ( UTF8traits::is6byte(c) )
michael@0 138 {
michael@0 139 ucs4 = (uint32_t(c) << 30) & 0x40000000L;
michael@0 140 state = 5;
michael@0 141 minUcs4 = 0x04000000;
michael@0 142 }
michael@0 143 else
michael@0 144 {
michael@0 145 return false;
michael@0 146 }
michael@0 147
michael@0 148 return true;
michael@0 149 }
michael@0 150
michael@0 151 static bool AddByte(char c, int32_t state, uint32_t& ucs4)
michael@0 152 {
michael@0 153 if ( UTF8traits::isInSeq(c) )
michael@0 154 {
michael@0 155 int32_t shift = state * 6;
michael@0 156 ucs4 |= (uint32_t(c) & 0x3F) << shift;
michael@0 157 return true;
michael@0 158 }
michael@0 159
michael@0 160 return false;
michael@0 161 }
michael@0 162 };
michael@0 163
michael@0 164
michael@0 165 /**
michael@0 166 * Extract the next UCS-4 character from the buffer and return it. The
michael@0 167 * pointer passed in is advanced to the start of the next character in the
michael@0 168 * buffer. If non-null, the err parameter is filled in if an error occurs.
michael@0 169 */
michael@0 170
michael@0 171
michael@0 172 class UTF16CharEnumerator
michael@0 173 {
michael@0 174 public:
michael@0 175 static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
michael@0 176 bool *err = nullptr)
michael@0 177 {
michael@0 178 NS_ASSERTION(buffer && *buffer, "null buffer!");
michael@0 179
michael@0 180 const char16_t *p = *buffer;
michael@0 181
michael@0 182 if (p >= end)
michael@0 183 {
michael@0 184 NS_ERROR("No input to work with");
michael@0 185 if (err)
michael@0 186 *err = true;
michael@0 187
michael@0 188 return 0;
michael@0 189 }
michael@0 190
michael@0 191 char16_t c = *p++;
michael@0 192
michael@0 193 if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
michael@0 194 {
michael@0 195 if (err)
michael@0 196 *err = false;
michael@0 197 *buffer = p;
michael@0 198 return c;
michael@0 199 }
michael@0 200 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
michael@0 201 {
michael@0 202 if (p == end)
michael@0 203 {
michael@0 204 // Found a high surrogate the end of the buffer. Flag this
michael@0 205 // as an error and return the Unicode replacement
michael@0 206 // character 0xFFFD.
michael@0 207
michael@0 208 NS_WARNING("Unexpected end of buffer after high surrogate");
michael@0 209
michael@0 210 if (err)
michael@0 211 *err = true;
michael@0 212 *buffer = p;
michael@0 213 return 0xFFFD;
michael@0 214 }
michael@0 215
michael@0 216 // D800- DBFF - High Surrogate
michael@0 217 char16_t h = c;
michael@0 218
michael@0 219 c = *p++;
michael@0 220
michael@0 221 if (NS_IS_LOW_SURROGATE(c))
michael@0 222 {
michael@0 223 // DC00- DFFF - Low Surrogate
michael@0 224 // N = (H - D800) *400 + 10000 + (L - DC00)
michael@0 225 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
michael@0 226 if (err)
michael@0 227 *err = false;
michael@0 228 *buffer = p;
michael@0 229 return ucs4;
michael@0 230 }
michael@0 231 else
michael@0 232 {
michael@0 233 // Found a high surrogate followed by something other than
michael@0 234 // a low surrogate. Flag this as an error and return the
michael@0 235 // Unicode replacement character 0xFFFD. Note that the
michael@0 236 // pointer to the next character points to the second 16-bit
michael@0 237 // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
michael@0 238 // only the first code unit of an illegal sequence must be
michael@0 239 // treated as an illegally terminated code unit sequence
michael@0 240 // (also Chapter 3 D91, "isolated [not paired and ill-formed]
michael@0 241 // UTF-16 code units in the range D800..DFFF are ill-formed").
michael@0 242 NS_WARNING("got a High Surrogate but no low surrogate");
michael@0 243
michael@0 244 if (err)
michael@0 245 *err = true;
michael@0 246 *buffer = p - 1;
michael@0 247 return 0xFFFD;
michael@0 248 }
michael@0 249 }
michael@0 250 else // U+DC00 - U+DFFF
michael@0 251 {
michael@0 252 // DC00- DFFF - Low Surrogate
michael@0 253
michael@0 254 // Found a low surrogate w/o a preceding high surrogate. Flag
michael@0 255 // this as an error and return the Unicode replacement
michael@0 256 // character 0xFFFD.
michael@0 257
michael@0 258 NS_WARNING("got a low Surrogate but no high surrogate");
michael@0 259 if (err)
michael@0 260 *err = true;
michael@0 261 *buffer = p;
michael@0 262 return 0xFFFD;
michael@0 263 }
michael@0 264
michael@0 265 if (err)
michael@0 266 *err = true;
michael@0 267 return 0;
michael@0 268 }
michael@0 269 };
michael@0 270
michael@0 271
michael@0 272 /**
michael@0 273 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
michael@0 274 * UTF-8 to UTF-16
michael@0 275 */
michael@0 276 class ConvertUTF8toUTF16
michael@0 277 {
michael@0 278 public:
michael@0 279 typedef char value_type;
michael@0 280 typedef char16_t buffer_type;
michael@0 281
michael@0 282 ConvertUTF8toUTF16( buffer_type* aBuffer )
michael@0 283 : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
michael@0 284
michael@0 285 size_t Length() const { return mBuffer - mStart; }
michael@0 286
michael@0 287 bool ErrorEncountered() const { return mErrorEncountered; }
michael@0 288
michael@0 289 void write( const value_type* start, uint32_t N )
michael@0 290 {
michael@0 291 if ( mErrorEncountered )
michael@0 292 return;
michael@0 293
michael@0 294 // algorithm assumes utf8 units won't
michael@0 295 // be spread across fragments
michael@0 296 const value_type* p = start;
michael@0 297 const value_type* end = start + N;
michael@0 298 buffer_type* out = mBuffer;
michael@0 299 for ( ; p != end /* && *p */; )
michael@0 300 {
michael@0 301 bool err;
michael@0 302 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
michael@0 303
michael@0 304 if ( err )
michael@0 305 {
michael@0 306 mErrorEncountered = true;
michael@0 307 mBuffer = out;
michael@0 308 return;
michael@0 309 }
michael@0 310
michael@0 311 if ( ucs4 >= PLANE1_BASE )
michael@0 312 {
michael@0 313 *out++ = (buffer_type)H_SURROGATE(ucs4);
michael@0 314 *out++ = (buffer_type)L_SURROGATE(ucs4);
michael@0 315 }
michael@0 316 else
michael@0 317 {
michael@0 318 *out++ = ucs4;
michael@0 319 }
michael@0 320 }
michael@0 321 mBuffer = out;
michael@0 322 }
michael@0 323
michael@0 324 void write_terminator()
michael@0 325 {
michael@0 326 *mBuffer = buffer_type(0);
michael@0 327 }
michael@0 328
michael@0 329 private:
michael@0 330 buffer_type* const mStart;
michael@0 331 buffer_type* mBuffer;
michael@0 332 bool mErrorEncountered;
michael@0 333 };
michael@0 334
michael@0 335 /**
michael@0 336 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
michael@0 337 * the length of the UTF-16 string equivalent to a UTF-8 string.
michael@0 338 */
michael@0 339 class CalculateUTF8Length
michael@0 340 {
michael@0 341 public:
michael@0 342 typedef char value_type;
michael@0 343
michael@0 344 CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
michael@0 345
michael@0 346 size_t Length() const { return mLength; }
michael@0 347
michael@0 348 void write( const value_type* start, uint32_t N )
michael@0 349 {
michael@0 350 // ignore any further requests
michael@0 351 if ( mErrorEncountered )
michael@0 352 return;
michael@0 353
michael@0 354 // algorithm assumes utf8 units won't
michael@0 355 // be spread across fragments
michael@0 356 const value_type* p = start;
michael@0 357 const value_type* end = start + N;
michael@0 358 for ( ; p < end /* && *p */; ++mLength )
michael@0 359 {
michael@0 360 if ( UTF8traits::isASCII(*p) )
michael@0 361 p += 1;
michael@0 362 else if ( UTF8traits::is2byte(*p) )
michael@0 363 p += 2;
michael@0 364 else if ( UTF8traits::is3byte(*p) )
michael@0 365 p += 3;
michael@0 366 else if ( UTF8traits::is4byte(*p) ) {
michael@0 367 // Because a UTF-8 sequence of 4 bytes represents a codepoint
michael@0 368 // greater than 0xFFFF, it will become a surrogate pair in the
michael@0 369 // UTF-16 string, so add 1 more to mLength.
michael@0 370 // This doesn't happen with is5byte and is6byte because they
michael@0 371 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
michael@0 372 // converted to a single replacement character.
michael@0 373
michael@0 374 // However, there is one case when a 4 byte UTF-8 sequence will
michael@0 375 // only generate 2 UTF-16 bytes. If we have a properly encoded
michael@0 376 // sequence, but with an invalid value (too small or too big),
michael@0 377 // that will result in a replacement character being written
michael@0 378 // This replacement character is encoded as just 1 single
michael@0 379 // UTF-16 character, which is 2 bytes.
michael@0 380
michael@0 381 // The below code therefore only adds 1 to mLength if the UTF8
michael@0 382 // data will produce a decoded character which is greater than
michael@0 383 // or equal to 0x010000 and less than 0x0110000.
michael@0 384
michael@0 385 // A 4byte UTF8 character is encoded as
michael@0 386 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
michael@0 387 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
michael@0 388 // map to bit 17-21 in the final result. If these bits are
michael@0 389 // between 0x01 and 0x11, that means that the final result is
michael@0 390 // between 0x010000 and 0x110000. The below code reads these
michael@0 391 // bits out and assigns them to c, but shifted up 4 bits to
michael@0 392 // avoid having to shift twice.
michael@0 393
michael@0 394 // It doesn't matter what to do in the case where p + 4 > end
michael@0 395 // since no UTF16 characters will be written in that case by
michael@0 396 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
michael@0 397 // any of the surrogate bits are wrong since no UTF16
michael@0 398 // characters will be written in that case either.
michael@0 399
michael@0 400 if (p + 4 <= end) {
michael@0 401 uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
michael@0 402 ((uint32_t)(p[1] & 0x30));
michael@0 403 if (c >= 0x010 && c < 0x110)
michael@0 404 ++mLength;
michael@0 405 }
michael@0 406
michael@0 407 p += 4;
michael@0 408 }
michael@0 409 else if ( UTF8traits::is5byte(*p) )
michael@0 410 p += 5;
michael@0 411 else if ( UTF8traits::is6byte(*p) )
michael@0 412 p += 6;
michael@0 413 else // error
michael@0 414 {
michael@0 415 ++mLength; // to account for the decrement below
michael@0 416 break;
michael@0 417 }
michael@0 418 }
michael@0 419 if ( p != end )
michael@0 420 {
michael@0 421 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
michael@0 422 --mLength; // The last multi-byte char wasn't complete, discard it.
michael@0 423 mErrorEncountered = true;
michael@0 424 }
michael@0 425 }
michael@0 426
michael@0 427 private:
michael@0 428 size_t mLength;
michael@0 429 bool mErrorEncountered;
michael@0 430 };
michael@0 431
michael@0 432 /**
michael@0 433 * A character sink (see |copy_string| in nsAlgorithm.h) for
michael@0 434 * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
michael@0 435 * (0xEFBFBD in UTF-8).
michael@0 436 */
michael@0 437 class ConvertUTF16toUTF8
michael@0 438 {
michael@0 439 public:
michael@0 440 typedef char16_t value_type;
michael@0 441 typedef char buffer_type;
michael@0 442
michael@0 443 // The error handling here is more lenient than that in
michael@0 444 // |ConvertUTF8toUTF16|, but it's that way for backwards
michael@0 445 // compatibility.
michael@0 446
michael@0 447 ConvertUTF16toUTF8( buffer_type* aBuffer )
michael@0 448 : mStart(aBuffer), mBuffer(aBuffer) {}
michael@0 449
michael@0 450 size_t Size() const { return mBuffer - mStart; }
michael@0 451
michael@0 452 void write( const value_type* start, uint32_t N )
michael@0 453 {
michael@0 454 buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
michael@0 455
michael@0 456 for (const value_type *p = start, *end = start + N; p < end; ++p )
michael@0 457 {
michael@0 458 value_type c = *p;
michael@0 459 if (! (c & 0xFF80)) // U+0000 - U+007F
michael@0 460 {
michael@0 461 *out++ = (char)c;
michael@0 462 }
michael@0 463 else if (! (c & 0xF800)) // U+0100 - U+07FF
michael@0 464 {
michael@0 465 *out++ = 0xC0 | (char)(c >> 6);
michael@0 466 *out++ = 0x80 | (char)(0x003F & c);
michael@0 467 }
michael@0 468 else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
michael@0 469 {
michael@0 470 *out++ = 0xE0 | (char)(c >> 12);
michael@0 471 *out++ = 0x80 | (char)(0x003F & (c >> 6));
michael@0 472 *out++ = 0x80 | (char)(0x003F & c );
michael@0 473 }
michael@0 474 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
michael@0 475 {
michael@0 476 // D800- DBFF - High Surrogate
michael@0 477 value_type h = c;
michael@0 478
michael@0 479 ++p;
michael@0 480 if (p == end)
michael@0 481 {
michael@0 482 // Treat broken characters as the Unicode
michael@0 483 // replacement character 0xFFFD (0xEFBFBD in
michael@0 484 // UTF-8)
michael@0 485 *out++ = '\xEF';
michael@0 486 *out++ = '\xBF';
michael@0 487 *out++ = '\xBD';
michael@0 488
michael@0 489 NS_WARNING("String ending in half a surrogate pair!");
michael@0 490
michael@0 491 break;
michael@0 492 }
michael@0 493 c = *p;
michael@0 494
michael@0 495 if (NS_IS_LOW_SURROGATE(c))
michael@0 496 {
michael@0 497 // DC00- DFFF - Low Surrogate
michael@0 498 // N = (H - D800) *400 + 10000 + ( L - DC00 )
michael@0 499 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
michael@0 500
michael@0 501 // 0001 0000-001F FFFF
michael@0 502 *out++ = 0xF0 | (char)(ucs4 >> 18);
michael@0 503 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
michael@0 504 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
michael@0 505 *out++ = 0x80 | (char)(0x003F & ucs4);
michael@0 506 }
michael@0 507 else
michael@0 508 {
michael@0 509 // Treat broken characters as the Unicode
michael@0 510 // replacement character 0xFFFD (0xEFBFBD in
michael@0 511 // UTF-8)
michael@0 512 *out++ = '\xEF';
michael@0 513 *out++ = '\xBF';
michael@0 514 *out++ = '\xBD';
michael@0 515
michael@0 516 // The pointer to the next character points to the second
michael@0 517 // 16-bit value, not beyond it, as per Unicode 5.0.0
michael@0 518 // Chapter 3 C10, only the first code unit of an illegal
michael@0 519 // sequence must be treated as an illegally terminated
michael@0 520 // code unit sequence (also Chapter 3 D91, "isolated [not
michael@0 521 // paired and ill-formed] UTF-16 code units in the range
michael@0 522 // D800..DFFF are ill-formed").
michael@0 523 p--;
michael@0 524
michael@0 525 NS_WARNING("got a High Surrogate but no low surrogate");
michael@0 526 }
michael@0 527 }
michael@0 528 else // U+DC00 - U+DFFF
michael@0 529 {
michael@0 530 // Treat broken characters as the Unicode replacement
michael@0 531 // character 0xFFFD (0xEFBFBD in UTF-8)
michael@0 532 *out++ = '\xEF';
michael@0 533 *out++ = '\xBF';
michael@0 534 *out++ = '\xBD';
michael@0 535
michael@0 536 // DC00- DFFF - Low Surrogate
michael@0 537 NS_WARNING("got a low Surrogate but no high surrogate");
michael@0 538 }
michael@0 539 }
michael@0 540
michael@0 541 mBuffer = out;
michael@0 542 }
michael@0 543
michael@0 544 void write_terminator()
michael@0 545 {
michael@0 546 *mBuffer = buffer_type(0);
michael@0 547 }
michael@0 548
michael@0 549 private:
michael@0 550 buffer_type* const mStart;
michael@0 551 buffer_type* mBuffer;
michael@0 552 };
michael@0 553
michael@0 554 /**
michael@0 555 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
michael@0 556 * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
michael@0 557 * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
michael@0 558 */
michael@0 559 class CalculateUTF8Size
michael@0 560 {
michael@0 561 public:
michael@0 562 typedef char16_t value_type;
michael@0 563
michael@0 564 CalculateUTF8Size()
michael@0 565 : mSize(0) { }
michael@0 566
michael@0 567 size_t Size() const { return mSize; }
michael@0 568
michael@0 569 void write( const value_type* start, uint32_t N )
michael@0 570 {
michael@0 571 // Assume UCS2 surrogate pairs won't be spread across fragments.
michael@0 572 for (const value_type *p = start, *end = start + N; p < end; ++p )
michael@0 573 {
michael@0 574 value_type c = *p;
michael@0 575 if (! (c & 0xFF80)) // U+0000 - U+007F
michael@0 576 mSize += 1;
michael@0 577 else if (! (c & 0xF800)) // U+0100 - U+07FF
michael@0 578 mSize += 2;
michael@0 579 else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
michael@0 580 mSize += 3;
michael@0 581 else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
michael@0 582 {
michael@0 583 ++p;
michael@0 584 if (p == end)
michael@0 585 {
michael@0 586 // Treat broken characters as the Unicode
michael@0 587 // replacement character 0xFFFD (0xEFBFBD in
michael@0 588 // UTF-8)
michael@0 589 mSize += 3;
michael@0 590
michael@0 591 NS_WARNING("String ending in half a surrogate pair!");
michael@0 592
michael@0 593 break;
michael@0 594 }
michael@0 595 c = *p;
michael@0 596
michael@0 597 if (0xDC00 == (0xFC00 & c))
michael@0 598 mSize += 4;
michael@0 599 else
michael@0 600 {
michael@0 601 // Treat broken characters as the Unicode
michael@0 602 // replacement character 0xFFFD (0xEFBFBD in
michael@0 603 // UTF-8)
michael@0 604 mSize += 3;
michael@0 605
michael@0 606 // The next code unit is the second 16-bit value, not
michael@0 607 // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
michael@0 608 // only the first code unit of an illegal sequence must
michael@0 609 // be treated as an illegally terminated code unit
michael@0 610 // sequence (also Chapter 3 D91, "isolated [not paired and
michael@0 611 // ill-formed] UTF-16 code units in the range D800..DFFF
michael@0 612 // are ill-formed").
michael@0 613 p--;
michael@0 614
michael@0 615 NS_WARNING("got a high Surrogate but no low surrogate");
michael@0 616 }
michael@0 617 }
michael@0 618 else // U+DC00 - U+DFFF
michael@0 619 {
michael@0 620 // Treat broken characters as the Unicode replacement
michael@0 621 // character 0xFFFD (0xEFBFBD in UTF-8)
michael@0 622 mSize += 3;
michael@0 623
michael@0 624 NS_WARNING("got a low Surrogate but no high surrogate");
michael@0 625 }
michael@0 626 }
michael@0 627 }
michael@0 628
michael@0 629 private:
michael@0 630 size_t mSize;
michael@0 631 };
michael@0 632
michael@0 633 #ifdef MOZILLA_INTERNAL_API
/**
 * A character sink that converts char to char16_t by widening: every input
 * byte is read as an unsigned value and stored as one char16_t.  No
 * character-set decoding takes place.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  LossyConvertEncoding8to16( char16_t* aDestination ) :
    mDestination(aDestination) { }

  /**
   * Append |aSourceLength| widened bytes from |aSource| to the destination.
   * The destination is not bounds-checked.  When built with
   * MOZILLA_MAY_SUPPORT_SSE2 and the CPU reports SSE2, the work is handed to
   * write_sse2() (defined outside this header; presumably equivalent).
   */
  void
  write( const char* aSource, uint32_t aSourceLength )
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
      {
        write_sse2(aSource, aSourceLength);
        return;
      }
#endif
    const char* done_writing = aSource + aSourceLength;
    while ( aSource < done_writing )
      // Go through unsigned char so bytes >= 0x80 are not sign-extended.
      *mDestination++ = (char16_t)(unsigned char)(*aSource++);
  }

  void
  write_sse2( const char* aSource, uint32_t aSourceLength );

  // Store a terminating 0 at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;   // next position to be written
};
michael@0 676
/**
 * A character sink that converts char16_t to char by narrowing: every input
 * code unit is cast to char, so anything that does not fit in a char loses
 * its upper bits.  No character-set encoding takes place.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

  /**
   * Append |aSourceLength| narrowed code units from |aSource| to the
   * destination.  The destination is not bounds-checked.  When built with
   * MOZILLA_MAY_SUPPORT_SSE2 and the CPU reports SSE2, the work is handed to
   * write_sse2() (defined outside this header; presumably equivalent).
   */
  void
  write( const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
      {
        write_sse2(aSource, aSourceLength);
        return;
      }
#endif
    const char16_t* done_writing = aSource + aSourceLength;
    while ( aSource < done_writing )
      *mDestination++ = (char)(*aSource++);
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char16_t* aSource, uint32_t aSourceLength );
#endif

  // Store a terminating '\0' at the current position without advancing.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char *mDestination;   // next position to be written
};
michael@0 719 #endif // MOZILLA_INTERNAL_API
michael@0 720
michael@0 721 #endif /* !defined(nsUTF8Utils_h_) */

mercurial