Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
michael@0 | 2 | /* This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | #ifndef nsUTF8Utils_h_ |
michael@0 | 6 | #define nsUTF8Utils_h_ |
michael@0 | 7 | |
michael@0 | 8 | // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this |
michael@0 | 9 | // file will provide signatures for the Mozilla abstract string types. It will |
michael@0 | 10 | // use XPCOM assertion/debugging macros, etc. |
michael@0 | 11 | |
michael@0 | 12 | #include "nscore.h" |
michael@0 | 13 | #include "mozilla/SSE.h" |
michael@0 | 14 | |
michael@0 | 15 | #include "nsCharTraits.h" |
michael@0 | 16 | |
/**
 * Predicates that classify one byte of a UTF-8 stream by its high-order
 * bit pattern. Besides the 1- to 4-byte forms, the 5- and 6-byte lead
 * patterns are recognised as well.
 */
class UTF8traits
{
public:
  // 0xxxxxxx - a single-byte (ASCII) character.
  static bool isASCII(char c) { return HasPrefix(c, 0x80, 0x00); }
  // 10xxxxxx - a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char c) { return HasPrefix(c, 0xC0, 0x80); }
  // 110xxxxx - lead byte of a 2-byte sequence.
  static bool is2byte(char c) { return HasPrefix(c, 0xE0, 0xC0); }
  // 1110xxxx - lead byte of a 3-byte sequence.
  static bool is3byte(char c) { return HasPrefix(c, 0xF0, 0xE0); }
  // 11110xxx - lead byte of a 4-byte sequence.
  static bool is4byte(char c) { return HasPrefix(c, 0xF8, 0xF0); }
  // 111110xx - lead byte of a 5-byte sequence.
  static bool is5byte(char c) { return HasPrefix(c, 0xFC, 0xF8); }
  // 1111110x - lead byte of a 6-byte sequence.
  static bool is6byte(char c) { return HasPrefix(c, 0xFE, 0xFC); }

private:
  // True when the bits of aByte selected by aMask equal aPattern.
  static bool HasPrefix(char aByte, int aMask, int aPattern)
  {
    return (aByte & aMask) == aPattern;
  }
};
michael@0 | 28 | |
michael@0 | 29 | /** |
michael@0 | 30 | * Extract the next UCS-4 character from the buffer and return it. The |
michael@0 | 31 | * pointer passed in is advanced to the start of the next character in the |
michael@0 | 32 | * buffer. If non-null, the parameters err and overlong are filled in to |
michael@0 | 33 | * indicate that the character was represented by an overlong sequence, or |
michael@0 | 34 | * that an error occurred. |
michael@0 | 35 | */ |
michael@0 | 36 | |
michael@0 | 37 | class UTF8CharEnumerator |
michael@0 | 38 | { |
michael@0 | 39 | public: |
michael@0 | 40 | static uint32_t NextChar(const char **buffer, const char *end, |
michael@0 | 41 | bool *err) |
michael@0 | 42 | { |
michael@0 | 43 | NS_ASSERTION(buffer && *buffer, "null buffer!"); |
michael@0 | 44 | |
michael@0 | 45 | const char *p = *buffer; |
michael@0 | 46 | *err = false; |
michael@0 | 47 | |
michael@0 | 48 | if (p >= end) |
michael@0 | 49 | { |
michael@0 | 50 | *err = true; |
michael@0 | 51 | |
michael@0 | 52 | return 0; |
michael@0 | 53 | } |
michael@0 | 54 | |
michael@0 | 55 | char c = *p++; |
michael@0 | 56 | |
michael@0 | 57 | if ( UTF8traits::isASCII(c) ) |
michael@0 | 58 | { |
michael@0 | 59 | *buffer = p; |
michael@0 | 60 | return c; |
michael@0 | 61 | } |
michael@0 | 62 | |
michael@0 | 63 | uint32_t ucs4; |
michael@0 | 64 | uint32_t minUcs4; |
michael@0 | 65 | int32_t state = 0; |
michael@0 | 66 | |
michael@0 | 67 | if (!CalcState(c, ucs4, minUcs4, state)) { |
michael@0 | 68 | NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); |
michael@0 | 69 | *err = true; |
michael@0 | 70 | |
michael@0 | 71 | return 0; |
michael@0 | 72 | } |
michael@0 | 73 | |
michael@0 | 74 | while ( state-- ) |
michael@0 | 75 | { |
michael@0 | 76 | if (p == end) |
michael@0 | 77 | { |
michael@0 | 78 | *err = true; |
michael@0 | 79 | |
michael@0 | 80 | return 0; |
michael@0 | 81 | } |
michael@0 | 82 | |
michael@0 | 83 | c = *p++; |
michael@0 | 84 | |
michael@0 | 85 | if (!AddByte(c, state, ucs4)) |
michael@0 | 86 | { |
michael@0 | 87 | *err = true; |
michael@0 | 88 | |
michael@0 | 89 | return 0; |
michael@0 | 90 | } |
michael@0 | 91 | } |
michael@0 | 92 | |
michael@0 | 93 | if ( ucs4 < minUcs4 ) |
michael@0 | 94 | { |
michael@0 | 95 | // Overlong sequence |
michael@0 | 96 | ucs4 = UCS2_REPLACEMENT_CHAR; |
michael@0 | 97 | } |
michael@0 | 98 | else if ( ucs4 >= 0xD800 && |
michael@0 | 99 | (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) |
michael@0 | 100 | { |
michael@0 | 101 | // Surrogates and code points outside the Unicode range. |
michael@0 | 102 | ucs4 = UCS2_REPLACEMENT_CHAR; |
michael@0 | 103 | } |
michael@0 | 104 | |
michael@0 | 105 | *buffer = p; |
michael@0 | 106 | return ucs4; |
michael@0 | 107 | } |
michael@0 | 108 | |
michael@0 | 109 | private: |
michael@0 | 110 | static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4, |
michael@0 | 111 | int32_t& state) |
michael@0 | 112 | { |
michael@0 | 113 | if ( UTF8traits::is2byte(c) ) |
michael@0 | 114 | { |
michael@0 | 115 | ucs4 = (uint32_t(c) << 6) & 0x000007C0L; |
michael@0 | 116 | state = 1; |
michael@0 | 117 | minUcs4 = 0x00000080; |
michael@0 | 118 | } |
michael@0 | 119 | else if ( UTF8traits::is3byte(c) ) |
michael@0 | 120 | { |
michael@0 | 121 | ucs4 = (uint32_t(c) << 12) & 0x0000F000L; |
michael@0 | 122 | state = 2; |
michael@0 | 123 | minUcs4 = 0x00000800; |
michael@0 | 124 | } |
michael@0 | 125 | else if ( UTF8traits::is4byte(c) ) |
michael@0 | 126 | { |
michael@0 | 127 | ucs4 = (uint32_t(c) << 18) & 0x001F0000L; |
michael@0 | 128 | state = 3; |
michael@0 | 129 | minUcs4 = 0x00010000; |
michael@0 | 130 | } |
michael@0 | 131 | else if ( UTF8traits::is5byte(c) ) |
michael@0 | 132 | { |
michael@0 | 133 | ucs4 = (uint32_t(c) << 24) & 0x03000000L; |
michael@0 | 134 | state = 4; |
michael@0 | 135 | minUcs4 = 0x00200000; |
michael@0 | 136 | } |
michael@0 | 137 | else if ( UTF8traits::is6byte(c) ) |
michael@0 | 138 | { |
michael@0 | 139 | ucs4 = (uint32_t(c) << 30) & 0x40000000L; |
michael@0 | 140 | state = 5; |
michael@0 | 141 | minUcs4 = 0x04000000; |
michael@0 | 142 | } |
michael@0 | 143 | else |
michael@0 | 144 | { |
michael@0 | 145 | return false; |
michael@0 | 146 | } |
michael@0 | 147 | |
michael@0 | 148 | return true; |
michael@0 | 149 | } |
michael@0 | 150 | |
michael@0 | 151 | static bool AddByte(char c, int32_t state, uint32_t& ucs4) |
michael@0 | 152 | { |
michael@0 | 153 | if ( UTF8traits::isInSeq(c) ) |
michael@0 | 154 | { |
michael@0 | 155 | int32_t shift = state * 6; |
michael@0 | 156 | ucs4 |= (uint32_t(c) & 0x3F) << shift; |
michael@0 | 157 | return true; |
michael@0 | 158 | } |
michael@0 | 159 | |
michael@0 | 160 | return false; |
michael@0 | 161 | } |
michael@0 | 162 | }; |
michael@0 | 163 | |
michael@0 | 164 | |
michael@0 | 165 | /** |
michael@0 | 166 | * Extract the next UCS-4 character from the buffer and return it. The |
michael@0 | 167 | * pointer passed in is advanced to the start of the next character in the |
michael@0 | 168 | * buffer. If non-null, the err parameter is filled in if an error occurs. |
michael@0 | 169 | */ |
michael@0 | 170 | |
michael@0 | 171 | |
michael@0 | 172 | class UTF16CharEnumerator |
michael@0 | 173 | { |
michael@0 | 174 | public: |
michael@0 | 175 | static uint32_t NextChar(const char16_t **buffer, const char16_t *end, |
michael@0 | 176 | bool *err = nullptr) |
michael@0 | 177 | { |
michael@0 | 178 | NS_ASSERTION(buffer && *buffer, "null buffer!"); |
michael@0 | 179 | |
michael@0 | 180 | const char16_t *p = *buffer; |
michael@0 | 181 | |
michael@0 | 182 | if (p >= end) |
michael@0 | 183 | { |
michael@0 | 184 | NS_ERROR("No input to work with"); |
michael@0 | 185 | if (err) |
michael@0 | 186 | *err = true; |
michael@0 | 187 | |
michael@0 | 188 | return 0; |
michael@0 | 189 | } |
michael@0 | 190 | |
michael@0 | 191 | char16_t c = *p++; |
michael@0 | 192 | |
michael@0 | 193 | if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF |
michael@0 | 194 | { |
michael@0 | 195 | if (err) |
michael@0 | 196 | *err = false; |
michael@0 | 197 | *buffer = p; |
michael@0 | 198 | return c; |
michael@0 | 199 | } |
michael@0 | 200 | else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF |
michael@0 | 201 | { |
michael@0 | 202 | if (p == end) |
michael@0 | 203 | { |
michael@0 | 204 | // Found a high surrogate the end of the buffer. Flag this |
michael@0 | 205 | // as an error and return the Unicode replacement |
michael@0 | 206 | // character 0xFFFD. |
michael@0 | 207 | |
michael@0 | 208 | NS_WARNING("Unexpected end of buffer after high surrogate"); |
michael@0 | 209 | |
michael@0 | 210 | if (err) |
michael@0 | 211 | *err = true; |
michael@0 | 212 | *buffer = p; |
michael@0 | 213 | return 0xFFFD; |
michael@0 | 214 | } |
michael@0 | 215 | |
michael@0 | 216 | // D800- DBFF - High Surrogate |
michael@0 | 217 | char16_t h = c; |
michael@0 | 218 | |
michael@0 | 219 | c = *p++; |
michael@0 | 220 | |
michael@0 | 221 | if (NS_IS_LOW_SURROGATE(c)) |
michael@0 | 222 | { |
michael@0 | 223 | // DC00- DFFF - Low Surrogate |
michael@0 | 224 | // N = (H - D800) *400 + 10000 + (L - DC00) |
michael@0 | 225 | uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); |
michael@0 | 226 | if (err) |
michael@0 | 227 | *err = false; |
michael@0 | 228 | *buffer = p; |
michael@0 | 229 | return ucs4; |
michael@0 | 230 | } |
michael@0 | 231 | else |
michael@0 | 232 | { |
michael@0 | 233 | // Found a high surrogate followed by something other than |
michael@0 | 234 | // a low surrogate. Flag this as an error and return the |
michael@0 | 235 | // Unicode replacement character 0xFFFD. Note that the |
michael@0 | 236 | // pointer to the next character points to the second 16-bit |
michael@0 | 237 | // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10, |
michael@0 | 238 | // only the first code unit of an illegal sequence must be |
michael@0 | 239 | // treated as an illegally terminated code unit sequence |
michael@0 | 240 | // (also Chapter 3 D91, "isolated [not paired and ill-formed] |
michael@0 | 241 | // UTF-16 code units in the range D800..DFFF are ill-formed"). |
michael@0 | 242 | NS_WARNING("got a High Surrogate but no low surrogate"); |
michael@0 | 243 | |
michael@0 | 244 | if (err) |
michael@0 | 245 | *err = true; |
michael@0 | 246 | *buffer = p - 1; |
michael@0 | 247 | return 0xFFFD; |
michael@0 | 248 | } |
michael@0 | 249 | } |
michael@0 | 250 | else // U+DC00 - U+DFFF |
michael@0 | 251 | { |
michael@0 | 252 | // DC00- DFFF - Low Surrogate |
michael@0 | 253 | |
michael@0 | 254 | // Found a low surrogate w/o a preceding high surrogate. Flag |
michael@0 | 255 | // this as an error and return the Unicode replacement |
michael@0 | 256 | // character 0xFFFD. |
michael@0 | 257 | |
michael@0 | 258 | NS_WARNING("got a low Surrogate but no high surrogate"); |
michael@0 | 259 | if (err) |
michael@0 | 260 | *err = true; |
michael@0 | 261 | *buffer = p; |
michael@0 | 262 | return 0xFFFD; |
michael@0 | 263 | } |
michael@0 | 264 | |
michael@0 | 265 | if (err) |
michael@0 | 266 | *err = true; |
michael@0 | 267 | return 0; |
michael@0 | 268 | } |
michael@0 | 269 | }; |
michael@0 | 270 | |
michael@0 | 271 | |
michael@0 | 272 | /** |
michael@0 | 273 | * A character sink (see |copy_string| in nsAlgorithm.h) for converting |
michael@0 | 274 | * UTF-8 to UTF-16 |
michael@0 | 275 | */ |
michael@0 | 276 | class ConvertUTF8toUTF16 |
michael@0 | 277 | { |
michael@0 | 278 | public: |
michael@0 | 279 | typedef char value_type; |
michael@0 | 280 | typedef char16_t buffer_type; |
michael@0 | 281 | |
michael@0 | 282 | ConvertUTF8toUTF16( buffer_type* aBuffer ) |
michael@0 | 283 | : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {} |
michael@0 | 284 | |
michael@0 | 285 | size_t Length() const { return mBuffer - mStart; } |
michael@0 | 286 | |
michael@0 | 287 | bool ErrorEncountered() const { return mErrorEncountered; } |
michael@0 | 288 | |
michael@0 | 289 | void write( const value_type* start, uint32_t N ) |
michael@0 | 290 | { |
michael@0 | 291 | if ( mErrorEncountered ) |
michael@0 | 292 | return; |
michael@0 | 293 | |
michael@0 | 294 | // algorithm assumes utf8 units won't |
michael@0 | 295 | // be spread across fragments |
michael@0 | 296 | const value_type* p = start; |
michael@0 | 297 | const value_type* end = start + N; |
michael@0 | 298 | buffer_type* out = mBuffer; |
michael@0 | 299 | for ( ; p != end /* && *p */; ) |
michael@0 | 300 | { |
michael@0 | 301 | bool err; |
michael@0 | 302 | uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err); |
michael@0 | 303 | |
michael@0 | 304 | if ( err ) |
michael@0 | 305 | { |
michael@0 | 306 | mErrorEncountered = true; |
michael@0 | 307 | mBuffer = out; |
michael@0 | 308 | return; |
michael@0 | 309 | } |
michael@0 | 310 | |
michael@0 | 311 | if ( ucs4 >= PLANE1_BASE ) |
michael@0 | 312 | { |
michael@0 | 313 | *out++ = (buffer_type)H_SURROGATE(ucs4); |
michael@0 | 314 | *out++ = (buffer_type)L_SURROGATE(ucs4); |
michael@0 | 315 | } |
michael@0 | 316 | else |
michael@0 | 317 | { |
michael@0 | 318 | *out++ = ucs4; |
michael@0 | 319 | } |
michael@0 | 320 | } |
michael@0 | 321 | mBuffer = out; |
michael@0 | 322 | } |
michael@0 | 323 | |
michael@0 | 324 | void write_terminator() |
michael@0 | 325 | { |
michael@0 | 326 | *mBuffer = buffer_type(0); |
michael@0 | 327 | } |
michael@0 | 328 | |
michael@0 | 329 | private: |
michael@0 | 330 | buffer_type* const mStart; |
michael@0 | 331 | buffer_type* mBuffer; |
michael@0 | 332 | bool mErrorEncountered; |
michael@0 | 333 | }; |
michael@0 | 334 | |
michael@0 | 335 | /** |
michael@0 | 336 | * A character sink (see |copy_string| in nsAlgorithm.h) for computing |
michael@0 | 337 | * the length of the UTF-16 string equivalent to a UTF-8 string. |
michael@0 | 338 | */ |
michael@0 | 339 | class CalculateUTF8Length |
michael@0 | 340 | { |
michael@0 | 341 | public: |
michael@0 | 342 | typedef char value_type; |
michael@0 | 343 | |
michael@0 | 344 | CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { } |
michael@0 | 345 | |
michael@0 | 346 | size_t Length() const { return mLength; } |
michael@0 | 347 | |
michael@0 | 348 | void write( const value_type* start, uint32_t N ) |
michael@0 | 349 | { |
michael@0 | 350 | // ignore any further requests |
michael@0 | 351 | if ( mErrorEncountered ) |
michael@0 | 352 | return; |
michael@0 | 353 | |
michael@0 | 354 | // algorithm assumes utf8 units won't |
michael@0 | 355 | // be spread across fragments |
michael@0 | 356 | const value_type* p = start; |
michael@0 | 357 | const value_type* end = start + N; |
michael@0 | 358 | for ( ; p < end /* && *p */; ++mLength ) |
michael@0 | 359 | { |
michael@0 | 360 | if ( UTF8traits::isASCII(*p) ) |
michael@0 | 361 | p += 1; |
michael@0 | 362 | else if ( UTF8traits::is2byte(*p) ) |
michael@0 | 363 | p += 2; |
michael@0 | 364 | else if ( UTF8traits::is3byte(*p) ) |
michael@0 | 365 | p += 3; |
michael@0 | 366 | else if ( UTF8traits::is4byte(*p) ) { |
michael@0 | 367 | // Because a UTF-8 sequence of 4 bytes represents a codepoint |
michael@0 | 368 | // greater than 0xFFFF, it will become a surrogate pair in the |
michael@0 | 369 | // UTF-16 string, so add 1 more to mLength. |
michael@0 | 370 | // This doesn't happen with is5byte and is6byte because they |
michael@0 | 371 | // are illegal UTF-8 sequences (greater than 0x10FFFF) so get |
michael@0 | 372 | // converted to a single replacement character. |
michael@0 | 373 | |
michael@0 | 374 | // However, there is one case when a 4 byte UTF-8 sequence will |
michael@0 | 375 | // only generate 2 UTF-16 bytes. If we have a properly encoded |
michael@0 | 376 | // sequence, but with an invalid value (too small or too big), |
michael@0 | 377 | // that will result in a replacement character being written |
michael@0 | 378 | // This replacement character is encoded as just 1 single |
michael@0 | 379 | // UTF-16 character, which is 2 bytes. |
michael@0 | 380 | |
michael@0 | 381 | // The below code therefore only adds 1 to mLength if the UTF8 |
michael@0 | 382 | // data will produce a decoded character which is greater than |
michael@0 | 383 | // or equal to 0x010000 and less than 0x0110000. |
michael@0 | 384 | |
michael@0 | 385 | // A 4byte UTF8 character is encoded as |
michael@0 | 386 | // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
michael@0 | 387 | // Bit 1-3 on the first byte, and bit 5-6 on the second byte, |
michael@0 | 388 | // map to bit 17-21 in the final result. If these bits are |
michael@0 | 389 | // between 0x01 and 0x11, that means that the final result is |
michael@0 | 390 | // between 0x010000 and 0x110000. The below code reads these |
michael@0 | 391 | // bits out and assigns them to c, but shifted up 4 bits to |
michael@0 | 392 | // avoid having to shift twice. |
michael@0 | 393 | |
michael@0 | 394 | // It doesn't matter what to do in the case where p + 4 > end |
michael@0 | 395 | // since no UTF16 characters will be written in that case by |
michael@0 | 396 | // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if |
michael@0 | 397 | // any of the surrogate bits are wrong since no UTF16 |
michael@0 | 398 | // characters will be written in that case either. |
michael@0 | 399 | |
michael@0 | 400 | if (p + 4 <= end) { |
michael@0 | 401 | uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 | |
michael@0 | 402 | ((uint32_t)(p[1] & 0x30)); |
michael@0 | 403 | if (c >= 0x010 && c < 0x110) |
michael@0 | 404 | ++mLength; |
michael@0 | 405 | } |
michael@0 | 406 | |
michael@0 | 407 | p += 4; |
michael@0 | 408 | } |
michael@0 | 409 | else if ( UTF8traits::is5byte(*p) ) |
michael@0 | 410 | p += 5; |
michael@0 | 411 | else if ( UTF8traits::is6byte(*p) ) |
michael@0 | 412 | p += 6; |
michael@0 | 413 | else // error |
michael@0 | 414 | { |
michael@0 | 415 | ++mLength; // to account for the decrement below |
michael@0 | 416 | break; |
michael@0 | 417 | } |
michael@0 | 418 | } |
michael@0 | 419 | if ( p != end ) |
michael@0 | 420 | { |
michael@0 | 421 | NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); |
michael@0 | 422 | --mLength; // The last multi-byte char wasn't complete, discard it. |
michael@0 | 423 | mErrorEncountered = true; |
michael@0 | 424 | } |
michael@0 | 425 | } |
michael@0 | 426 | |
michael@0 | 427 | private: |
michael@0 | 428 | size_t mLength; |
michael@0 | 429 | bool mErrorEncountered; |
michael@0 | 430 | }; |
michael@0 | 431 | |
michael@0 | 432 | /** |
michael@0 | 433 | * A character sink (see |copy_string| in nsAlgorithm.h) for |
michael@0 | 434 | * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD |
michael@0 | 435 | * (0xEFBFBD in UTF-8). |
michael@0 | 436 | */ |
michael@0 | 437 | class ConvertUTF16toUTF8 |
michael@0 | 438 | { |
michael@0 | 439 | public: |
michael@0 | 440 | typedef char16_t value_type; |
michael@0 | 441 | typedef char buffer_type; |
michael@0 | 442 | |
michael@0 | 443 | // The error handling here is more lenient than that in |
michael@0 | 444 | // |ConvertUTF8toUTF16|, but it's that way for backwards |
michael@0 | 445 | // compatibility. |
michael@0 | 446 | |
michael@0 | 447 | ConvertUTF16toUTF8( buffer_type* aBuffer ) |
michael@0 | 448 | : mStart(aBuffer), mBuffer(aBuffer) {} |
michael@0 | 449 | |
michael@0 | 450 | size_t Size() const { return mBuffer - mStart; } |
michael@0 | 451 | |
michael@0 | 452 | void write( const value_type* start, uint32_t N ) |
michael@0 | 453 | { |
michael@0 | 454 | buffer_type *out = mBuffer; // gcc isn't smart enough to do this! |
michael@0 | 455 | |
michael@0 | 456 | for (const value_type *p = start, *end = start + N; p < end; ++p ) |
michael@0 | 457 | { |
michael@0 | 458 | value_type c = *p; |
michael@0 | 459 | if (! (c & 0xFF80)) // U+0000 - U+007F |
michael@0 | 460 | { |
michael@0 | 461 | *out++ = (char)c; |
michael@0 | 462 | } |
michael@0 | 463 | else if (! (c & 0xF800)) // U+0100 - U+07FF |
michael@0 | 464 | { |
michael@0 | 465 | *out++ = 0xC0 | (char)(c >> 6); |
michael@0 | 466 | *out++ = 0x80 | (char)(0x003F & c); |
michael@0 | 467 | } |
michael@0 | 468 | else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF |
michael@0 | 469 | { |
michael@0 | 470 | *out++ = 0xE0 | (char)(c >> 12); |
michael@0 | 471 | *out++ = 0x80 | (char)(0x003F & (c >> 6)); |
michael@0 | 472 | *out++ = 0x80 | (char)(0x003F & c ); |
michael@0 | 473 | } |
michael@0 | 474 | else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF |
michael@0 | 475 | { |
michael@0 | 476 | // D800- DBFF - High Surrogate |
michael@0 | 477 | value_type h = c; |
michael@0 | 478 | |
michael@0 | 479 | ++p; |
michael@0 | 480 | if (p == end) |
michael@0 | 481 | { |
michael@0 | 482 | // Treat broken characters as the Unicode |
michael@0 | 483 | // replacement character 0xFFFD (0xEFBFBD in |
michael@0 | 484 | // UTF-8) |
michael@0 | 485 | *out++ = '\xEF'; |
michael@0 | 486 | *out++ = '\xBF'; |
michael@0 | 487 | *out++ = '\xBD'; |
michael@0 | 488 | |
michael@0 | 489 | NS_WARNING("String ending in half a surrogate pair!"); |
michael@0 | 490 | |
michael@0 | 491 | break; |
michael@0 | 492 | } |
michael@0 | 493 | c = *p; |
michael@0 | 494 | |
michael@0 | 495 | if (NS_IS_LOW_SURROGATE(c)) |
michael@0 | 496 | { |
michael@0 | 497 | // DC00- DFFF - Low Surrogate |
michael@0 | 498 | // N = (H - D800) *400 + 10000 + ( L - DC00 ) |
michael@0 | 499 | uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); |
michael@0 | 500 | |
michael@0 | 501 | // 0001 0000-001F FFFF |
michael@0 | 502 | *out++ = 0xF0 | (char)(ucs4 >> 18); |
michael@0 | 503 | *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); |
michael@0 | 504 | *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); |
michael@0 | 505 | *out++ = 0x80 | (char)(0x003F & ucs4); |
michael@0 | 506 | } |
michael@0 | 507 | else |
michael@0 | 508 | { |
michael@0 | 509 | // Treat broken characters as the Unicode |
michael@0 | 510 | // replacement character 0xFFFD (0xEFBFBD in |
michael@0 | 511 | // UTF-8) |
michael@0 | 512 | *out++ = '\xEF'; |
michael@0 | 513 | *out++ = '\xBF'; |
michael@0 | 514 | *out++ = '\xBD'; |
michael@0 | 515 | |
michael@0 | 516 | // The pointer to the next character points to the second |
michael@0 | 517 | // 16-bit value, not beyond it, as per Unicode 5.0.0 |
michael@0 | 518 | // Chapter 3 C10, only the first code unit of an illegal |
michael@0 | 519 | // sequence must be treated as an illegally terminated |
michael@0 | 520 | // code unit sequence (also Chapter 3 D91, "isolated [not |
michael@0 | 521 | // paired and ill-formed] UTF-16 code units in the range |
michael@0 | 522 | // D800..DFFF are ill-formed"). |
michael@0 | 523 | p--; |
michael@0 | 524 | |
michael@0 | 525 | NS_WARNING("got a High Surrogate but no low surrogate"); |
michael@0 | 526 | } |
michael@0 | 527 | } |
michael@0 | 528 | else // U+DC00 - U+DFFF |
michael@0 | 529 | { |
michael@0 | 530 | // Treat broken characters as the Unicode replacement |
michael@0 | 531 | // character 0xFFFD (0xEFBFBD in UTF-8) |
michael@0 | 532 | *out++ = '\xEF'; |
michael@0 | 533 | *out++ = '\xBF'; |
michael@0 | 534 | *out++ = '\xBD'; |
michael@0 | 535 | |
michael@0 | 536 | // DC00- DFFF - Low Surrogate |
michael@0 | 537 | NS_WARNING("got a low Surrogate but no high surrogate"); |
michael@0 | 538 | } |
michael@0 | 539 | } |
michael@0 | 540 | |
michael@0 | 541 | mBuffer = out; |
michael@0 | 542 | } |
michael@0 | 543 | |
michael@0 | 544 | void write_terminator() |
michael@0 | 545 | { |
michael@0 | 546 | *mBuffer = buffer_type(0); |
michael@0 | 547 | } |
michael@0 | 548 | |
michael@0 | 549 | private: |
michael@0 | 550 | buffer_type* const mStart; |
michael@0 | 551 | buffer_type* mBuffer; |
michael@0 | 552 | }; |
michael@0 | 553 | |
michael@0 | 554 | /** |
michael@0 | 555 | * A character sink (see |copy_string| in nsAlgorithm.h) for computing |
michael@0 | 556 | * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid |
michael@0 | 557 | * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8). |
michael@0 | 558 | */ |
michael@0 | 559 | class CalculateUTF8Size |
michael@0 | 560 | { |
michael@0 | 561 | public: |
michael@0 | 562 | typedef char16_t value_type; |
michael@0 | 563 | |
michael@0 | 564 | CalculateUTF8Size() |
michael@0 | 565 | : mSize(0) { } |
michael@0 | 566 | |
michael@0 | 567 | size_t Size() const { return mSize; } |
michael@0 | 568 | |
michael@0 | 569 | void write( const value_type* start, uint32_t N ) |
michael@0 | 570 | { |
michael@0 | 571 | // Assume UCS2 surrogate pairs won't be spread across fragments. |
michael@0 | 572 | for (const value_type *p = start, *end = start + N; p < end; ++p ) |
michael@0 | 573 | { |
michael@0 | 574 | value_type c = *p; |
michael@0 | 575 | if (! (c & 0xFF80)) // U+0000 - U+007F |
michael@0 | 576 | mSize += 1; |
michael@0 | 577 | else if (! (c & 0xF800)) // U+0100 - U+07FF |
michael@0 | 578 | mSize += 2; |
michael@0 | 579 | else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF |
michael@0 | 580 | mSize += 3; |
michael@0 | 581 | else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF |
michael@0 | 582 | { |
michael@0 | 583 | ++p; |
michael@0 | 584 | if (p == end) |
michael@0 | 585 | { |
michael@0 | 586 | // Treat broken characters as the Unicode |
michael@0 | 587 | // replacement character 0xFFFD (0xEFBFBD in |
michael@0 | 588 | // UTF-8) |
michael@0 | 589 | mSize += 3; |
michael@0 | 590 | |
michael@0 | 591 | NS_WARNING("String ending in half a surrogate pair!"); |
michael@0 | 592 | |
michael@0 | 593 | break; |
michael@0 | 594 | } |
michael@0 | 595 | c = *p; |
michael@0 | 596 | |
michael@0 | 597 | if (0xDC00 == (0xFC00 & c)) |
michael@0 | 598 | mSize += 4; |
michael@0 | 599 | else |
michael@0 | 600 | { |
michael@0 | 601 | // Treat broken characters as the Unicode |
michael@0 | 602 | // replacement character 0xFFFD (0xEFBFBD in |
michael@0 | 603 | // UTF-8) |
michael@0 | 604 | mSize += 3; |
michael@0 | 605 | |
michael@0 | 606 | // The next code unit is the second 16-bit value, not |
michael@0 | 607 | // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10, |
michael@0 | 608 | // only the first code unit of an illegal sequence must |
michael@0 | 609 | // be treated as an illegally terminated code unit |
michael@0 | 610 | // sequence (also Chapter 3 D91, "isolated [not paired and |
michael@0 | 611 | // ill-formed] UTF-16 code units in the range D800..DFFF |
michael@0 | 612 | // are ill-formed"). |
michael@0 | 613 | p--; |
michael@0 | 614 | |
michael@0 | 615 | NS_WARNING("got a high Surrogate but no low surrogate"); |
michael@0 | 616 | } |
michael@0 | 617 | } |
michael@0 | 618 | else // U+DC00 - U+DFFF |
michael@0 | 619 | { |
michael@0 | 620 | // Treat broken characters as the Unicode replacement |
michael@0 | 621 | // character 0xFFFD (0xEFBFBD in UTF-8) |
michael@0 | 622 | mSize += 3; |
michael@0 | 623 | |
michael@0 | 624 | NS_WARNING("got a low Surrogate but no high surrogate"); |
michael@0 | 625 | } |
michael@0 | 626 | } |
michael@0 | 627 | } |
michael@0 | 628 | |
michael@0 | 629 | private: |
michael@0 | 630 | size_t mSize; |
michael@0 | 631 | }; |
michael@0 | 632 | |
michael@0 | 633 | #ifdef MOZILLA_INTERNAL_API |
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char to char16_t: every input byte is widened, as an unsigned
 * value, to one 16-bit code unit. This class performs no bounds checking
 * on the destination.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  LossyConvertEncoding8to16( char16_t* aDestination ) :
    mDestination(aDestination) { }

  /**
   * Appends |aSourceLength| bytes from |aSource| to the destination, one
   * code unit per byte. When the build may support SSE2 and the CPU reports
   * it at run time, the work is handed to write_sse2().
   */
  void
  write( const char* aSource, uint32_t aSourceLength )
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    const char* done_writing = aSource + aSourceLength;
    while ( aSource < done_writing )
      // Go through unsigned char so bytes >= 0x80 are not sign-extended.
      *mDestination++ = (char16_t)(unsigned char)(*aSource++);
  }

  // Declared only where write() can call it, matching the declaration in
  // LossyConvertEncoding16to8.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char* aSource, uint32_t aSourceLength );
#endif

  // Stores a terminating zero code unit at the current write position
  // without advancing it.
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;
};
michael@0 | 676 | |
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char16_t to char: every 16-bit code unit is narrowed to one char
 * with a plain cast. This class performs no bounds checking on the
 * destination.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

  /**
   * Appends |aSourceLength| code units from |aSource| to the destination,
   * one char per unit. When the build may support SSE2 and the CPU reports
   * it at run time, the work is handed to write_sse2().
   */
  void
  write( const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for (uint32_t index = 0; index < aSourceLength; ++index)
      *mDestination++ = (char)(aSource[index]);
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char16_t* aSource, uint32_t aSourceLength );
#endif

  // Stores a terminating NUL at the current write position without
  // advancing it.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char *mDestination;
};
michael@0 | 719 | #endif // MOZILLA_INTERNAL_API |
michael@0 | 720 | |
michael@0 | 721 | #endif /* !defined(nsUTF8Utils_h_) */ |