Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable the double-key logic depending on private browsing
mode or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
One reservation remains: how to convince FindCookie users to test the
condition themselves and pass nullptr when the double-key logic is disabled.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
5 #ifndef nsUTF8Utils_h_
6 #define nsUTF8Utils_h_
8 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this
9 // file will provide signatures for the Mozilla abstract string types. It will
10 // use XPCOM assertion/debugging macros, etc.
12 #include "nscore.h"
13 #include "mozilla/SSE.h"
15 #include "nsCharTraits.h"
/**
 * Predicates that classify one byte of a UTF-8 stream by the bit pattern
 * of its high-order bits.
 */
class UTF8traits
{
public:
  // 0xxxxxxx - a complete single-byte (ASCII) character.
  static bool isASCII(char c) { return HasPrefix(c, 0x80, 0x00); }
  // 10xxxxxx - a continuation byte inside a multi-byte sequence.
  static bool isInSeq(char c) { return HasPrefix(c, 0xC0, 0x80); }
  // 110xxxxx - lead byte of a 2-byte sequence.
  static bool is2byte(char c) { return HasPrefix(c, 0xE0, 0xC0); }
  // 1110xxxx - lead byte of a 3-byte sequence.
  static bool is3byte(char c) { return HasPrefix(c, 0xF0, 0xE0); }
  // 11110xxx - lead byte of a 4-byte sequence.
  static bool is4byte(char c) { return HasPrefix(c, 0xF8, 0xF0); }
  // 111110xx - lead byte of a 5-byte sequence.
  static bool is5byte(char c) { return HasPrefix(c, 0xFC, 0xF8); }
  // 1111110x - lead byte of a 6-byte sequence.
  static bool is6byte(char c) { return HasPrefix(c, 0xFE, 0xFC); }

private:
  // True when the bits of |c| selected by |mask| equal |pattern|.
  static bool HasPrefix(char c, int mask, int pattern)
  {
    return (c & mask) == pattern;
  }
};
29 /**
30 * Extract the next UCS-4 character from the buffer and return it. The
31 * pointer passed in is advanced to the start of the next character in the
32 * buffer. If non-null, the parameters err and overlong are filled in to
33 * indicate that the character was represented by an overlong sequence, or
34 * that an error occurred.
35 */
37 class UTF8CharEnumerator
38 {
39 public:
40 static uint32_t NextChar(const char **buffer, const char *end,
41 bool *err)
42 {
43 NS_ASSERTION(buffer && *buffer, "null buffer!");
45 const char *p = *buffer;
46 *err = false;
48 if (p >= end)
49 {
50 *err = true;
52 return 0;
53 }
55 char c = *p++;
57 if ( UTF8traits::isASCII(c) )
58 {
59 *buffer = p;
60 return c;
61 }
63 uint32_t ucs4;
64 uint32_t minUcs4;
65 int32_t state = 0;
67 if (!CalcState(c, ucs4, minUcs4, state)) {
68 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
69 *err = true;
71 return 0;
72 }
74 while ( state-- )
75 {
76 if (p == end)
77 {
78 *err = true;
80 return 0;
81 }
83 c = *p++;
85 if (!AddByte(c, state, ucs4))
86 {
87 *err = true;
89 return 0;
90 }
91 }
93 if ( ucs4 < minUcs4 )
94 {
95 // Overlong sequence
96 ucs4 = UCS2_REPLACEMENT_CHAR;
97 }
98 else if ( ucs4 >= 0xD800 &&
99 (ucs4 <= 0xDFFF || ucs4 >= UCS_END))
100 {
101 // Surrogates and code points outside the Unicode range.
102 ucs4 = UCS2_REPLACEMENT_CHAR;
103 }
105 *buffer = p;
106 return ucs4;
107 }
109 private:
110 static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4,
111 int32_t& state)
112 {
113 if ( UTF8traits::is2byte(c) )
114 {
115 ucs4 = (uint32_t(c) << 6) & 0x000007C0L;
116 state = 1;
117 minUcs4 = 0x00000080;
118 }
119 else if ( UTF8traits::is3byte(c) )
120 {
121 ucs4 = (uint32_t(c) << 12) & 0x0000F000L;
122 state = 2;
123 minUcs4 = 0x00000800;
124 }
125 else if ( UTF8traits::is4byte(c) )
126 {
127 ucs4 = (uint32_t(c) << 18) & 0x001F0000L;
128 state = 3;
129 minUcs4 = 0x00010000;
130 }
131 else if ( UTF8traits::is5byte(c) )
132 {
133 ucs4 = (uint32_t(c) << 24) & 0x03000000L;
134 state = 4;
135 minUcs4 = 0x00200000;
136 }
137 else if ( UTF8traits::is6byte(c) )
138 {
139 ucs4 = (uint32_t(c) << 30) & 0x40000000L;
140 state = 5;
141 minUcs4 = 0x04000000;
142 }
143 else
144 {
145 return false;
146 }
148 return true;
149 }
151 static bool AddByte(char c, int32_t state, uint32_t& ucs4)
152 {
153 if ( UTF8traits::isInSeq(c) )
154 {
155 int32_t shift = state * 6;
156 ucs4 |= (uint32_t(c) & 0x3F) << shift;
157 return true;
158 }
160 return false;
161 }
162 };
165 /**
166 * Extract the next UCS-4 character from the buffer and return it. The
167 * pointer passed in is advanced to the start of the next character in the
168 * buffer. If non-null, the err parameter is filled in if an error occurs.
169 */
172 class UTF16CharEnumerator
173 {
174 public:
175 static uint32_t NextChar(const char16_t **buffer, const char16_t *end,
176 bool *err = nullptr)
177 {
178 NS_ASSERTION(buffer && *buffer, "null buffer!");
180 const char16_t *p = *buffer;
182 if (p >= end)
183 {
184 NS_ERROR("No input to work with");
185 if (err)
186 *err = true;
188 return 0;
189 }
191 char16_t c = *p++;
193 if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF
194 {
195 if (err)
196 *err = false;
197 *buffer = p;
198 return c;
199 }
200 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
201 {
202 if (p == end)
203 {
204 // Found a high surrogate the end of the buffer. Flag this
205 // as an error and return the Unicode replacement
206 // character 0xFFFD.
208 NS_WARNING("Unexpected end of buffer after high surrogate");
210 if (err)
211 *err = true;
212 *buffer = p;
213 return 0xFFFD;
214 }
216 // D800- DBFF - High Surrogate
217 char16_t h = c;
219 c = *p++;
221 if (NS_IS_LOW_SURROGATE(c))
222 {
223 // DC00- DFFF - Low Surrogate
224 // N = (H - D800) *400 + 10000 + (L - DC00)
225 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
226 if (err)
227 *err = false;
228 *buffer = p;
229 return ucs4;
230 }
231 else
232 {
233 // Found a high surrogate followed by something other than
234 // a low surrogate. Flag this as an error and return the
235 // Unicode replacement character 0xFFFD. Note that the
236 // pointer to the next character points to the second 16-bit
237 // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10,
238 // only the first code unit of an illegal sequence must be
239 // treated as an illegally terminated code unit sequence
240 // (also Chapter 3 D91, "isolated [not paired and ill-formed]
241 // UTF-16 code units in the range D800..DFFF are ill-formed").
242 NS_WARNING("got a High Surrogate but no low surrogate");
244 if (err)
245 *err = true;
246 *buffer = p - 1;
247 return 0xFFFD;
248 }
249 }
250 else // U+DC00 - U+DFFF
251 {
252 // DC00- DFFF - Low Surrogate
254 // Found a low surrogate w/o a preceding high surrogate. Flag
255 // this as an error and return the Unicode replacement
256 // character 0xFFFD.
258 NS_WARNING("got a low Surrogate but no high surrogate");
259 if (err)
260 *err = true;
261 *buffer = p;
262 return 0xFFFD;
263 }
265 if (err)
266 *err = true;
267 return 0;
268 }
269 };
272 /**
273 * A character sink (see |copy_string| in nsAlgorithm.h) for converting
274 * UTF-8 to UTF-16
275 */
276 class ConvertUTF8toUTF16
277 {
278 public:
279 typedef char value_type;
280 typedef char16_t buffer_type;
282 ConvertUTF8toUTF16( buffer_type* aBuffer )
283 : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {}
285 size_t Length() const { return mBuffer - mStart; }
287 bool ErrorEncountered() const { return mErrorEncountered; }
289 void write( const value_type* start, uint32_t N )
290 {
291 if ( mErrorEncountered )
292 return;
294 // algorithm assumes utf8 units won't
295 // be spread across fragments
296 const value_type* p = start;
297 const value_type* end = start + N;
298 buffer_type* out = mBuffer;
299 for ( ; p != end /* && *p */; )
300 {
301 bool err;
302 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err);
304 if ( err )
305 {
306 mErrorEncountered = true;
307 mBuffer = out;
308 return;
309 }
311 if ( ucs4 >= PLANE1_BASE )
312 {
313 *out++ = (buffer_type)H_SURROGATE(ucs4);
314 *out++ = (buffer_type)L_SURROGATE(ucs4);
315 }
316 else
317 {
318 *out++ = ucs4;
319 }
320 }
321 mBuffer = out;
322 }
324 void write_terminator()
325 {
326 *mBuffer = buffer_type(0);
327 }
329 private:
330 buffer_type* const mStart;
331 buffer_type* mBuffer;
332 bool mErrorEncountered;
333 };
335 /**
336 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
337 * the length of the UTF-16 string equivalent to a UTF-8 string.
338 */
339 class CalculateUTF8Length
340 {
341 public:
342 typedef char value_type;
344 CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { }
346 size_t Length() const { return mLength; }
348 void write( const value_type* start, uint32_t N )
349 {
350 // ignore any further requests
351 if ( mErrorEncountered )
352 return;
354 // algorithm assumes utf8 units won't
355 // be spread across fragments
356 const value_type* p = start;
357 const value_type* end = start + N;
358 for ( ; p < end /* && *p */; ++mLength )
359 {
360 if ( UTF8traits::isASCII(*p) )
361 p += 1;
362 else if ( UTF8traits::is2byte(*p) )
363 p += 2;
364 else if ( UTF8traits::is3byte(*p) )
365 p += 3;
366 else if ( UTF8traits::is4byte(*p) ) {
367 // Because a UTF-8 sequence of 4 bytes represents a codepoint
368 // greater than 0xFFFF, it will become a surrogate pair in the
369 // UTF-16 string, so add 1 more to mLength.
370 // This doesn't happen with is5byte and is6byte because they
371 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get
372 // converted to a single replacement character.
374 // However, there is one case when a 4 byte UTF-8 sequence will
375 // only generate 2 UTF-16 bytes. If we have a properly encoded
376 // sequence, but with an invalid value (too small or too big),
377 // that will result in a replacement character being written
378 // This replacement character is encoded as just 1 single
379 // UTF-16 character, which is 2 bytes.
381 // The below code therefore only adds 1 to mLength if the UTF8
382 // data will produce a decoded character which is greater than
383 // or equal to 0x010000 and less than 0x0110000.
385 // A 4byte UTF8 character is encoded as
386 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
387 // Bit 1-3 on the first byte, and bit 5-6 on the second byte,
388 // map to bit 17-21 in the final result. If these bits are
389 // between 0x01 and 0x11, that means that the final result is
390 // between 0x010000 and 0x110000. The below code reads these
391 // bits out and assigns them to c, but shifted up 4 bits to
392 // avoid having to shift twice.
394 // It doesn't matter what to do in the case where p + 4 > end
395 // since no UTF16 characters will be written in that case by
396 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if
397 // any of the surrogate bits are wrong since no UTF16
398 // characters will be written in that case either.
400 if (p + 4 <= end) {
401 uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 |
402 ((uint32_t)(p[1] & 0x30));
403 if (c >= 0x010 && c < 0x110)
404 ++mLength;
405 }
407 p += 4;
408 }
409 else if ( UTF8traits::is5byte(*p) )
410 p += 5;
411 else if ( UTF8traits::is6byte(*p) )
412 p += 6;
413 else // error
414 {
415 ++mLength; // to account for the decrement below
416 break;
417 }
418 }
419 if ( p != end )
420 {
421 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
422 --mLength; // The last multi-byte char wasn't complete, discard it.
423 mErrorEncountered = true;
424 }
425 }
427 private:
428 size_t mLength;
429 bool mErrorEncountered;
430 };
432 /**
433 * A character sink (see |copy_string| in nsAlgorithm.h) for
434 * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD
435 * (0xEFBFBD in UTF-8).
436 */
437 class ConvertUTF16toUTF8
438 {
439 public:
440 typedef char16_t value_type;
441 typedef char buffer_type;
443 // The error handling here is more lenient than that in
444 // |ConvertUTF8toUTF16|, but it's that way for backwards
445 // compatibility.
447 ConvertUTF16toUTF8( buffer_type* aBuffer )
448 : mStart(aBuffer), mBuffer(aBuffer) {}
450 size_t Size() const { return mBuffer - mStart; }
452 void write( const value_type* start, uint32_t N )
453 {
454 buffer_type *out = mBuffer; // gcc isn't smart enough to do this!
456 for (const value_type *p = start, *end = start + N; p < end; ++p )
457 {
458 value_type c = *p;
459 if (! (c & 0xFF80)) // U+0000 - U+007F
460 {
461 *out++ = (char)c;
462 }
463 else if (! (c & 0xF800)) // U+0100 - U+07FF
464 {
465 *out++ = 0xC0 | (char)(c >> 6);
466 *out++ = 0x80 | (char)(0x003F & c);
467 }
468 else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
469 {
470 *out++ = 0xE0 | (char)(c >> 12);
471 *out++ = 0x80 | (char)(0x003F & (c >> 6));
472 *out++ = 0x80 | (char)(0x003F & c );
473 }
474 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF
475 {
476 // D800- DBFF - High Surrogate
477 value_type h = c;
479 ++p;
480 if (p == end)
481 {
482 // Treat broken characters as the Unicode
483 // replacement character 0xFFFD (0xEFBFBD in
484 // UTF-8)
485 *out++ = '\xEF';
486 *out++ = '\xBF';
487 *out++ = '\xBD';
489 NS_WARNING("String ending in half a surrogate pair!");
491 break;
492 }
493 c = *p;
495 if (NS_IS_LOW_SURROGATE(c))
496 {
497 // DC00- DFFF - Low Surrogate
498 // N = (H - D800) *400 + 10000 + ( L - DC00 )
499 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c);
501 // 0001 0000-001F FFFF
502 *out++ = 0xF0 | (char)(ucs4 >> 18);
503 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12));
504 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6));
505 *out++ = 0x80 | (char)(0x003F & ucs4);
506 }
507 else
508 {
509 // Treat broken characters as the Unicode
510 // replacement character 0xFFFD (0xEFBFBD in
511 // UTF-8)
512 *out++ = '\xEF';
513 *out++ = '\xBF';
514 *out++ = '\xBD';
516 // The pointer to the next character points to the second
517 // 16-bit value, not beyond it, as per Unicode 5.0.0
518 // Chapter 3 C10, only the first code unit of an illegal
519 // sequence must be treated as an illegally terminated
520 // code unit sequence (also Chapter 3 D91, "isolated [not
521 // paired and ill-formed] UTF-16 code units in the range
522 // D800..DFFF are ill-formed").
523 p--;
525 NS_WARNING("got a High Surrogate but no low surrogate");
526 }
527 }
528 else // U+DC00 - U+DFFF
529 {
530 // Treat broken characters as the Unicode replacement
531 // character 0xFFFD (0xEFBFBD in UTF-8)
532 *out++ = '\xEF';
533 *out++ = '\xBF';
534 *out++ = '\xBD';
536 // DC00- DFFF - Low Surrogate
537 NS_WARNING("got a low Surrogate but no high surrogate");
538 }
539 }
541 mBuffer = out;
542 }
544 void write_terminator()
545 {
546 *mBuffer = buffer_type(0);
547 }
549 private:
550 buffer_type* const mStart;
551 buffer_type* mBuffer;
552 };
554 /**
555 * A character sink (see |copy_string| in nsAlgorithm.h) for computing
556 * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid
557 * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8).
558 */
559 class CalculateUTF8Size
560 {
561 public:
562 typedef char16_t value_type;
564 CalculateUTF8Size()
565 : mSize(0) { }
567 size_t Size() const { return mSize; }
569 void write( const value_type* start, uint32_t N )
570 {
571 // Assume UCS2 surrogate pairs won't be spread across fragments.
572 for (const value_type *p = start, *end = start + N; p < end; ++p )
573 {
574 value_type c = *p;
575 if (! (c & 0xFF80)) // U+0000 - U+007F
576 mSize += 1;
577 else if (! (c & 0xF800)) // U+0100 - U+07FF
578 mSize += 2;
579 else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF
580 mSize += 3;
581 else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF
582 {
583 ++p;
584 if (p == end)
585 {
586 // Treat broken characters as the Unicode
587 // replacement character 0xFFFD (0xEFBFBD in
588 // UTF-8)
589 mSize += 3;
591 NS_WARNING("String ending in half a surrogate pair!");
593 break;
594 }
595 c = *p;
597 if (0xDC00 == (0xFC00 & c))
598 mSize += 4;
599 else
600 {
601 // Treat broken characters as the Unicode
602 // replacement character 0xFFFD (0xEFBFBD in
603 // UTF-8)
604 mSize += 3;
606 // The next code unit is the second 16-bit value, not
607 // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10,
608 // only the first code unit of an illegal sequence must
609 // be treated as an illegally terminated code unit
610 // sequence (also Chapter 3 D91, "isolated [not paired and
611 // ill-formed] UTF-16 code units in the range D800..DFFF
612 // are ill-formed").
613 p--;
615 NS_WARNING("got a high Surrogate but no low surrogate");
616 }
617 }
618 else // U+DC00 - U+DFFF
619 {
620 // Treat broken characters as the Unicode replacement
621 // character 0xFFFD (0xEFBFBD in UTF-8)
622 mSize += 3;
624 NS_WARNING("got a low Surrogate but no high surrogate");
625 }
626 }
627 }
629 private:
630 size_t mSize;
631 };
633 #ifdef MOZILLA_INTERNAL_API
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char to char16_t: every input byte is widened, as an unsigned
 * value, to one char16_t.
 */
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  LossyConvertEncoding8to16( char16_t* aDestination ) :
    mDestination(aDestination) { }

  // Append aSourceLength widened bytes from aSource to the destination.
  void
  write( const char* aSource, uint32_t aSourceLength )
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for ( uint32_t i = 0; i < aSourceLength; ++i )
    {
      // Go through unsigned char so that bytes >= 0x80 are zero-extended
      // rather than sign-extended.
      *mDestination++ =
        static_cast<char16_t>(static_cast<unsigned char>(aSource[i]));
    }
  }

  // Declared only when write() can actually call it, matching
  // LossyConvertEncoding16to8.
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char* aSource, uint32_t aSourceLength );
#endif

  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;   // next position to write
};
/**
 * A character sink that performs a |reinterpret_cast|-style conversion
 * from char16_t to char: every input code unit is narrowed to one char
 * with a plain cast.
 */
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

  // Append aSourceLength narrowed code units from aSource to the
  // destination.
  void
  write( const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for ( uint32_t i = 0; i < aSourceLength; ++i )
    {
      *mDestination = (char)(aSource[i]);
      ++mDestination;
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char16_t* aSource, uint32_t aSourceLength );
#endif

  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char *mDestination;   // next position to write
};
719 #endif // MOZILLA_INTERNAL_API
721 #endif /* !defined(nsUTF8Utils_h_) */