js/src/vm/CharacterEncoding.cpp

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
michael@0 2 * vim: set ts=8 sts=4 et sw=4 tw=99:
michael@0 3 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 4 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 6
michael@0 7 #include "js/CharacterEncoding.h"
michael@0 8
michael@0 9 #include "jscntxt.h"
michael@0 10 #include "jsprf.h"
michael@0 11
michael@0 12 using namespace JS;
michael@0 13
michael@0 14 Latin1CharsZ
michael@0 15 JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
michael@0 16 {
michael@0 17 JS_ASSERT(cx);
michael@0 18 size_t len = tbchars.length();
michael@0 19 unsigned char *latin1 = cx->pod_malloc<unsigned char>(len + 1);
michael@0 20 if (!latin1)
michael@0 21 return Latin1CharsZ();
michael@0 22 for (size_t i = 0; i < len; ++i)
michael@0 23 latin1[i] = static_cast<unsigned char>(tbchars[i]);
michael@0 24 latin1[len] = '\0';
michael@0 25 return Latin1CharsZ(latin1, len);
michael@0 26 }
michael@0 27
michael@0 28 static size_t
michael@0 29 GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars)
michael@0 30 {
michael@0 31 size_t nbytes;
michael@0 32 const jschar *end;
michael@0 33 unsigned c, c2;
michael@0 34
michael@0 35 nbytes = nchars;
michael@0 36 for (end = chars + nchars; chars != end; chars++) {
michael@0 37 c = *chars;
michael@0 38 if (c < 0x80)
michael@0 39 continue;
michael@0 40 if (0xD800 <= c && c <= 0xDFFF) {
michael@0 41 /* nbytes sets 1 length since this is surrogate pair. */
michael@0 42 if (c >= 0xDC00 || (chars + 1) == end) {
michael@0 43 nbytes += 2; /* Bad Surrogate */
michael@0 44 continue;
michael@0 45 }
michael@0 46 c2 = chars[1];
michael@0 47 if (c2 < 0xDC00 || c2 > 0xDFFF) {
michael@0 48 nbytes += 2; /* Bad Surrogate */
michael@0 49 continue;
michael@0 50 }
michael@0 51 c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
michael@0 52 nbytes--;
michael@0 53 chars++;
michael@0 54 }
michael@0 55 c >>= 11;
michael@0 56 nbytes++;
michael@0 57 while (c) {
michael@0 58 c >>= 5;
michael@0 59 nbytes++;
michael@0 60 }
michael@0 61 }
michael@0 62 return nbytes;
michael@0 63 }
michael@0 64
/*
 * Write the three bytes EF BF BD (the UTF-8 form of U+FFFD) at |*dst|, then
 * advance |*dst| by three and reduce |*dstlenp| by three. If fewer than three
 * bytes remain, nothing is touched and false is returned.
 */
static bool
PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp)
{
    static const unsigned char replacement[] = { 0xEF, 0xBF, 0xBD };
    const size_t needed = sizeof(replacement);

    if (*dstlenp < needed)
        return false;

    char *out = *dst;
    for (size_t k = 0; k < needed; k++)
        out[k] = static_cast<char>(replacement[k]);

    *dst = out + needed;
    *dstlenp -= needed;
    return true;
}
michael@0 75
michael@0 76 /*
michael@0 77 * Write up to |*dstlenp| bytes into |dst|. Writes the number of bytes used
michael@0 78 * into |*dstlenp| on success. Returns false on failure.
michael@0 79 */
michael@0 80 static bool
michael@0 81 DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen,
michael@0 82 char *dst, size_t *dstlenp)
michael@0 83 {
michael@0 84 size_t dstlen = *dstlenp;
michael@0 85 size_t origDstlen = dstlen;
michael@0 86
michael@0 87 while (srclen) {
michael@0 88 uint32_t v;
michael@0 89 jschar c = *src++;
michael@0 90 srclen--;
michael@0 91 if (c >= 0xDC00 && c <= 0xDFFF) {
michael@0 92 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
michael@0 93 goto bufferTooSmall;
michael@0 94 continue;
michael@0 95 } else if (c < 0xD800 || c > 0xDBFF) {
michael@0 96 v = c;
michael@0 97 } else {
michael@0 98 if (srclen < 1) {
michael@0 99 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
michael@0 100 goto bufferTooSmall;
michael@0 101 continue;
michael@0 102 }
michael@0 103 jschar c2 = *src;
michael@0 104 if ((c2 < 0xDC00) || (c2 > 0xDFFF)) {
michael@0 105 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
michael@0 106 goto bufferTooSmall;
michael@0 107 continue;
michael@0 108 }
michael@0 109 src++;
michael@0 110 srclen--;
michael@0 111 v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
michael@0 112 }
michael@0 113 size_t utf8Len;
michael@0 114 if (v < 0x0080) {
michael@0 115 /* no encoding necessary - performance hack */
michael@0 116 if (dstlen == 0)
michael@0 117 goto bufferTooSmall;
michael@0 118 *dst++ = (char) v;
michael@0 119 utf8Len = 1;
michael@0 120 } else {
michael@0 121 uint8_t utf8buf[4];
michael@0 122 utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v);
michael@0 123 if (utf8Len > dstlen)
michael@0 124 goto bufferTooSmall;
michael@0 125 for (size_t i = 0; i < utf8Len; i++)
michael@0 126 *dst++ = (char) utf8buf[i];
michael@0 127 }
michael@0 128 dstlen -= utf8Len;
michael@0 129 }
michael@0 130 *dstlenp = (origDstlen - dstlen);
michael@0 131 return true;
michael@0 132
michael@0 133 bufferTooSmall:
michael@0 134 *dstlenp = (origDstlen - dstlen);
michael@0 135 if (cx->isJSContext())
michael@0 136 JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr,
michael@0 137 JSMSG_BUFFER_TOO_SMALL);
michael@0 138 return false;
michael@0 139 }
michael@0 140
michael@0 141
michael@0 142 UTF8CharsZ
michael@0 143 JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
michael@0 144 {
michael@0 145 JS_ASSERT(cx);
michael@0 146
michael@0 147 /* Get required buffer size. */
michael@0 148 jschar *str = tbchars.start().get();
michael@0 149 size_t len = GetDeflatedUTF8StringLength(str, tbchars.length());
michael@0 150
michael@0 151 /* Allocate buffer. */
michael@0 152 unsigned char *utf8 = cx->pod_malloc<unsigned char>(len + 1);
michael@0 153 if (!utf8)
michael@0 154 return UTF8CharsZ();
michael@0 155
michael@0 156 /* Encode to UTF8. */
michael@0 157 DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len);
michael@0 158 utf8[len] = '\0';
michael@0 159
michael@0 160 return UTF8CharsZ(utf8, len);
michael@0 161 }
michael@0 162
michael@0 163 static const uint32_t INVALID_UTF8 = UINT32_MAX;
michael@0 164
michael@0 165 /*
michael@0 166 * Convert a utf8 character sequence into a UCS-4 character and return that
michael@0 167 * character. It is assumed that the caller already checked that the sequence
michael@0 168 * is valid.
michael@0 169 */
michael@0 170 uint32_t
michael@0 171 JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length)
michael@0 172 {
michael@0 173 JS_ASSERT(1 <= utf8Length && utf8Length <= 4);
michael@0 174
michael@0 175 if (utf8Length == 1) {
michael@0 176 JS_ASSERT(!(*utf8Buffer & 0x80));
michael@0 177 return *utf8Buffer;
michael@0 178 }
michael@0 179
michael@0 180 /* from Unicode 3.1, non-shortest form is illegal */
michael@0 181 static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
michael@0 182
michael@0 183 JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
michael@0 184 (0x100 - (1 << (8 - utf8Length))));
michael@0 185 uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
michael@0 186 uint32_t minucs4Char = minucs4Table[utf8Length - 2];
michael@0 187 while (--utf8Length) {
michael@0 188 JS_ASSERT((*utf8Buffer & 0xC0) == 0x80);
michael@0 189 ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
michael@0 190 }
michael@0 191
michael@0 192 if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
michael@0 193 return INVALID_UTF8;
michael@0 194
michael@0 195 return ucs4Char;
michael@0 196 }
michael@0 197
michael@0 198 static void
michael@0 199 ReportInvalidCharacter(JSContext *cx, uint32_t offset)
michael@0 200 {
michael@0 201 char buffer[10];
michael@0 202 JS_snprintf(buffer, 10, "%d", offset);
michael@0 203 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
michael@0 204 JSMSG_MALFORMED_UTF8_CHAR, buffer);
michael@0 205 }
michael@0 206
michael@0 207 static void
michael@0 208 ReportBufferTooSmall(JSContext *cx, uint32_t dummy)
michael@0 209 {
michael@0 210 JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
michael@0 211 }
michael@0 212
michael@0 213 static void
michael@0 214 ReportTooBigCharacter(JSContext *cx, uint32_t v)
michael@0 215 {
michael@0 216 char buffer[10];
michael@0 217 JS_snprintf(buffer, 10, "0x%x", v + 0x10000);
michael@0 218 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
michael@0 219 JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
michael@0 220 }
michael@0 221
michael@0 222 enum InflateUTF8Action {
michael@0 223 CountAndReportInvalids,
michael@0 224 CountAndIgnoreInvalids,
michael@0 225 Copy
michael@0 226 };
michael@0 227
michael@0 228 static const uint32_t REPLACE_UTF8 = 0xFFFD;
michael@0 229
michael@0 230 // If making changes to this algorithm, make sure to also update
michael@0 231 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
michael@0 232 template <InflateUTF8Action action>
michael@0 233 static bool
michael@0 234 InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp,
michael@0 235 bool *isAsciip)
michael@0 236 {
michael@0 237 *isAsciip = true;
michael@0 238
michael@0 239 // First, count how many jschars need to be in the inflated string.
michael@0 240 // |i| is the index into |src|, and |j| is the the index into |dst|.
michael@0 241 size_t srclen = src.length();
michael@0 242 uint32_t j = 0;
michael@0 243 for (uint32_t i = 0; i < srclen; i++, j++) {
michael@0 244 uint32_t v = uint32_t(src[i]);
michael@0 245 if (!(v & 0x80)) {
michael@0 246 // ASCII code unit. Simple copy.
michael@0 247 if (action == Copy)
michael@0 248 dst[j] = jschar(v);
michael@0 249
michael@0 250 } else {
michael@0 251 // Non-ASCII code unit. Determine its length in bytes (n).
michael@0 252 *isAsciip = false;
michael@0 253 uint32_t n = 1;
michael@0 254 while (v & (0x80 >> n))
michael@0 255 n++;
michael@0 256
michael@0 257 #define INVALID(report, arg, n2) \
michael@0 258 do { \
michael@0 259 if (action == CountAndReportInvalids) { \
michael@0 260 report(cx, arg); \
michael@0 261 return false; \
michael@0 262 } else { \
michael@0 263 if (action == Copy) \
michael@0 264 dst[j] = jschar(REPLACE_UTF8); \
michael@0 265 else \
michael@0 266 JS_ASSERT(action == CountAndIgnoreInvalids); \
michael@0 267 n = n2; \
michael@0 268 goto invalidMultiByteCodeUnit; \
michael@0 269 } \
michael@0 270 } while (0)
michael@0 271
michael@0 272 // Check the leading byte.
michael@0 273 if (n < 2 || n > 4)
michael@0 274 INVALID(ReportInvalidCharacter, i, 1);
michael@0 275
michael@0 276 // Check that |src| is large enough to hold an n-byte code unit.
michael@0 277 if (i + n > srclen)
michael@0 278 INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
michael@0 279
michael@0 280 // Check the second byte. From Unicode Standard v6.2, Table 3-7
michael@0 281 // Well-Formed UTF-8 Byte Sequences.
michael@0 282 if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF
michael@0 283 (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F
michael@0 284 (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF
michael@0 285 (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F
michael@0 286 {
michael@0 287 INVALID(ReportInvalidCharacter, i, 1);
michael@0 288 }
michael@0 289
michael@0 290 // Check the continuation bytes.
michael@0 291 for (uint32_t m = 1; m < n; m++)
michael@0 292 if ((src[i + m] & 0xC0) != 0x80)
michael@0 293 INVALID(ReportInvalidCharacter, i, m);
michael@0 294
michael@0 295 // Determine the code unit's length in jschars and act accordingly.
michael@0 296 v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n);
michael@0 297 if (v < 0x10000) {
michael@0 298 // The n-byte UTF8 code unit will fit in a single jschar.
michael@0 299 if (action == Copy)
michael@0 300 dst[j] = jschar(v);
michael@0 301
michael@0 302 } else {
michael@0 303 v -= 0x10000;
michael@0 304 if (v <= 0xFFFFF) {
michael@0 305 // The n-byte UTF8 code unit will fit in two jschars.
michael@0 306 if (action == Copy)
michael@0 307 dst[j] = jschar((v >> 10) + 0xD800);
michael@0 308 j++;
michael@0 309 if (action == Copy)
michael@0 310 dst[j] = jschar((v & 0x3FF) + 0xDC00);
michael@0 311
michael@0 312 } else {
michael@0 313 // The n-byte UTF8 code unit won't fit in two jschars.
michael@0 314 INVALID(ReportTooBigCharacter, v, 1);
michael@0 315 }
michael@0 316 }
michael@0 317
michael@0 318 invalidMultiByteCodeUnit:
michael@0 319 // Move i to the last byte of the multi-byte code unit; the loop
michael@0 320 // header will do the final i++ to move to the start of the next
michael@0 321 // code unit.
michael@0 322 i += n - 1;
michael@0 323 }
michael@0 324 }
michael@0 325
michael@0 326 *dstlenp = j;
michael@0 327
michael@0 328 return true;
michael@0 329 }
michael@0 330
michael@0 331 typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip);
michael@0 332
michael@0 333 static TwoByteCharsZ
michael@0 334 InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen)
michael@0 335 {
michael@0 336 *outlen = 0;
michael@0 337
michael@0 338 bool isAscii;
michael@0 339 if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii))
michael@0 340 return TwoByteCharsZ();
michael@0 341
michael@0 342 jschar *dst = cx->pod_malloc<jschar>(*outlen + 1); // +1 for NUL
michael@0 343 if (!dst)
michael@0 344 return TwoByteCharsZ();
michael@0 345
michael@0 346 if (isAscii) {
michael@0 347 size_t srclen = src.length();
michael@0 348 JS_ASSERT(*outlen == srclen);
michael@0 349 for (uint32_t i = 0; i < srclen; i++)
michael@0 350 dst[i] = jschar(src[i]);
michael@0 351
michael@0 352 } else {
michael@0 353 JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii));
michael@0 354 }
michael@0 355
michael@0 356 dst[*outlen] = 0; // NUL char
michael@0 357
michael@0 358 return TwoByteCharsZ(dst, *outlen);
michael@0 359 }
michael@0 360
michael@0 361 TwoByteCharsZ
michael@0 362 JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
michael@0 363 {
michael@0 364 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndReportInvalids>,
michael@0 365 outlen);
michael@0 366 }
michael@0 367
michael@0 368 TwoByteCharsZ
michael@0 369 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
michael@0 370 {
michael@0 371 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndIgnoreInvalids>,
michael@0 372 outlen);
michael@0 373 }
michael@0 374

mercurial