1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/js/src/vm/CharacterEncoding.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,374 @@ 1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99: 1.6 + * This Source Code Form is subject to the terms of the Mozilla Public 1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.9 + 1.10 +#include "js/CharacterEncoding.h" 1.11 + 1.12 +#include "jscntxt.h" 1.13 +#include "jsprf.h" 1.14 + 1.15 +using namespace JS; 1.16 + 1.17 +Latin1CharsZ 1.18 +JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) 1.19 +{ 1.20 + JS_ASSERT(cx); 1.21 + size_t len = tbchars.length(); 1.22 + unsigned char *latin1 = cx->pod_malloc<unsigned char>(len + 1); 1.23 + if (!latin1) 1.24 + return Latin1CharsZ(); 1.25 + for (size_t i = 0; i < len; ++i) 1.26 + latin1[i] = static_cast<unsigned char>(tbchars[i]); 1.27 + latin1[len] = '\0'; 1.28 + return Latin1CharsZ(latin1, len); 1.29 +} 1.30 + 1.31 +static size_t 1.32 +GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars) 1.33 +{ 1.34 + size_t nbytes; 1.35 + const jschar *end; 1.36 + unsigned c, c2; 1.37 + 1.38 + nbytes = nchars; 1.39 + for (end = chars + nchars; chars != end; chars++) { 1.40 + c = *chars; 1.41 + if (c < 0x80) 1.42 + continue; 1.43 + if (0xD800 <= c && c <= 0xDFFF) { 1.44 + /* nbytes sets 1 length since this is surrogate pair. */ 1.45 + if (c >= 0xDC00 || (chars + 1) == end) { 1.46 + nbytes += 2; /* Bad Surrogate */ 1.47 + continue; 1.48 + } 1.49 + c2 = chars[1]; 1.50 + if (c2 < 0xDC00 || c2 > 0xDFFF) { 1.51 + nbytes += 2; /* Bad Surrogate */ 1.52 + continue; 1.53 + } 1.54 + c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; 1.55 + nbytes--; 1.56 + chars++; 1.57 + } 1.58 + c >>= 11; 1.59 + nbytes++; 1.60 + while (c) { 1.61 + c >>= 5; 1.62 + nbytes++; 1.63 + } 1.64 + } 1.65 + return nbytes; 1.66 +} 1.67 + 1.68 +static bool 1.69 +PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp) { 1.70 + if (*dstlenp < 3) 1.71 + return false; 1.72 + *(*dst)++ = (char) 0xEF; 1.73 + *(*dst)++ = (char) 0xBF; 1.74 + *(*dst)++ = (char) 0xBD; 1.75 + *dstlenp -= 3; 1.76 + return true; 1.77 +} 1.78 + 1.79 +/* 1.80 + * Write up to |*dstlenp| bytes into |dst|. Writes the number of bytes used 1.81 + * into |*dstlenp| on success. Returns false on failure. 1.82 + */ 1.83 +static bool 1.84 +DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen, 1.85 + char *dst, size_t *dstlenp) 1.86 +{ 1.87 + size_t dstlen = *dstlenp; 1.88 + size_t origDstlen = dstlen; 1.89 + 1.90 + while (srclen) { 1.91 + uint32_t v; 1.92 + jschar c = *src++; 1.93 + srclen--; 1.94 + if (c >= 0xDC00 && c <= 0xDFFF) { 1.95 + if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) 1.96 + goto bufferTooSmall; 1.97 + continue; 1.98 + } else if (c < 0xD800 || c > 0xDBFF) { 1.99 + v = c; 1.100 + } else { 1.101 + if (srclen < 1) { 1.102 + if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) 1.103 + goto bufferTooSmall; 1.104 + continue; 1.105 + } 1.106 + jschar c2 = *src; 1.107 + if ((c2 < 0xDC00) || (c2 > 0xDFFF)) { 1.108 + if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) 1.109 + goto bufferTooSmall; 1.110 + continue; 1.111 + } 1.112 + src++; 1.113 + srclen--; 1.114 + v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; 1.115 + } 1.116 + size_t utf8Len; 1.117 + if (v < 0x0080) { 1.118 + /* no encoding necessary - performance hack */ 1.119 + if (dstlen == 0) 1.120 + goto bufferTooSmall; 1.121 + *dst++ = (char) v; 1.122 + utf8Len = 1; 1.123 + } else { 1.124 + uint8_t utf8buf[4]; 1.125 + utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v); 1.126 + if (utf8Len > dstlen) 1.127 + goto bufferTooSmall; 1.128 + for (size_t i = 0; i < utf8Len; i++) 1.129 + *dst++ = (char) utf8buf[i]; 1.130 + } 1.131 + dstlen -= utf8Len; 1.132 + } 1.133 + *dstlenp = (origDstlen - dstlen); 1.134 + return true; 1.135 + 1.136 +bufferTooSmall: 1.137 + *dstlenp = (origDstlen - dstlen); 1.138 + if (cx->isJSContext()) 1.139 + JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr, 1.140 + JSMSG_BUFFER_TOO_SMALL); 1.141 + return false; 1.142 +} 1.143 + 1.144 + 1.145 +UTF8CharsZ 1.146 +JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) 1.147 +{ 1.148 + JS_ASSERT(cx); 1.149 + 1.150 + /* Get required buffer size. */ 1.151 + jschar *str = tbchars.start().get(); 1.152 + size_t len = GetDeflatedUTF8StringLength(str, tbchars.length()); 1.153 + 1.154 + /* Allocate buffer. */ 1.155 + unsigned char *utf8 = cx->pod_malloc<unsigned char>(len + 1); 1.156 + if (!utf8) 1.157 + return UTF8CharsZ(); 1.158 + 1.159 + /* Encode to UTF8. */ 1.160 + DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len); 1.161 + utf8[len] = '\0'; 1.162 + 1.163 + return UTF8CharsZ(utf8, len); 1.164 +} 1.165 + 1.166 +static const uint32_t INVALID_UTF8 = UINT32_MAX; 1.167 + 1.168 +/* 1.169 + * Convert a utf8 character sequence into a UCS-4 character and return that 1.170 + * character. It is assumed that the caller already checked that the sequence 1.171 + * is valid. 1.172 + */ 1.173 +uint32_t 1.174 +JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length) 1.175 +{ 1.176 + JS_ASSERT(1 <= utf8Length && utf8Length <= 4); 1.177 + 1.178 + if (utf8Length == 1) { 1.179 + JS_ASSERT(!(*utf8Buffer & 0x80)); 1.180 + return *utf8Buffer; 1.181 + } 1.182 + 1.183 + /* from Unicode 3.1, non-shortest form is illegal */ 1.184 + static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 }; 1.185 + 1.186 + JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) == 1.187 + (0x100 - (1 << (8 - utf8Length)))); 1.188 + uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1); 1.189 + uint32_t minucs4Char = minucs4Table[utf8Length - 2]; 1.190 + while (--utf8Length) { 1.191 + JS_ASSERT((*utf8Buffer & 0xC0) == 0x80); 1.192 + ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F); 1.193 + } 1.194 + 1.195 + if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) 1.196 + return INVALID_UTF8; 1.197 + 1.198 + return ucs4Char; 1.199 +} 1.200 + 1.201 +static void 1.202 +ReportInvalidCharacter(JSContext *cx, uint32_t offset) 1.203 +{ 1.204 + char buffer[10]; 1.205 + JS_snprintf(buffer, 10, "%d", offset); 1.206 + JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, 1.207 + JSMSG_MALFORMED_UTF8_CHAR, buffer); 1.208 +} 1.209 + 1.210 +static void 1.211 +ReportBufferTooSmall(JSContext *cx, uint32_t dummy) 1.212 +{ 1.213 + JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL); 1.214 +} 1.215 + 1.216 +static void 1.217 +ReportTooBigCharacter(JSContext *cx, uint32_t v) 1.218 +{ 1.219 + char buffer[10]; 1.220 + JS_snprintf(buffer, 10, "0x%x", v + 0x10000); 1.221 + JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, 1.222 + JSMSG_UTF8_CHAR_TOO_LARGE, buffer); 1.223 +} 1.224 + 1.225 +enum InflateUTF8Action { 1.226 + CountAndReportInvalids, 1.227 + CountAndIgnoreInvalids, 1.228 + Copy 1.229 +}; 1.230 + 1.231 +static const uint32_t REPLACE_UTF8 = 0xFFFD; 1.232 + 1.233 +// If making changes to this algorithm, make sure to also update 1.234 +// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp 1.235 +template <InflateUTF8Action action> 1.236 +static bool 1.237 +InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp, 1.238 + bool *isAsciip) 1.239 +{ 1.240 + *isAsciip = true; 1.241 + 1.242 + // First, count how many jschars need to be in the inflated string. 1.243 + // |i| is the index into |src|, and |j| is the the index into |dst|. 1.244 + size_t srclen = src.length(); 1.245 + uint32_t j = 0; 1.246 + for (uint32_t i = 0; i < srclen; i++, j++) { 1.247 + uint32_t v = uint32_t(src[i]); 1.248 + if (!(v & 0x80)) { 1.249 + // ASCII code unit. Simple copy. 1.250 + if (action == Copy) 1.251 + dst[j] = jschar(v); 1.252 + 1.253 + } else { 1.254 + // Non-ASCII code unit. Determine its length in bytes (n). 1.255 + *isAsciip = false; 1.256 + uint32_t n = 1; 1.257 + while (v & (0x80 >> n)) 1.258 + n++; 1.259 + 1.260 + #define INVALID(report, arg, n2) \ 1.261 + do { \ 1.262 + if (action == CountAndReportInvalids) { \ 1.263 + report(cx, arg); \ 1.264 + return false; \ 1.265 + } else { \ 1.266 + if (action == Copy) \ 1.267 + dst[j] = jschar(REPLACE_UTF8); \ 1.268 + else \ 1.269 + JS_ASSERT(action == CountAndIgnoreInvalids); \ 1.270 + n = n2; \ 1.271 + goto invalidMultiByteCodeUnit; \ 1.272 + } \ 1.273 + } while (0) 1.274 + 1.275 + // Check the leading byte. 1.276 + if (n < 2 || n > 4) 1.277 + INVALID(ReportInvalidCharacter, i, 1); 1.278 + 1.279 + // Check that |src| is large enough to hold an n-byte code unit. 1.280 + if (i + n > srclen) 1.281 + INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); 1.282 + 1.283 + // Check the second byte. From Unicode Standard v6.2, Table 3-7 1.284 + // Well-Formed UTF-8 Byte Sequences. 1.285 + if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF 1.286 + (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F 1.287 + (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF 1.288 + (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F 1.289 + { 1.290 + INVALID(ReportInvalidCharacter, i, 1); 1.291 + } 1.292 + 1.293 + // Check the continuation bytes. 1.294 + for (uint32_t m = 1; m < n; m++) 1.295 + if ((src[i + m] & 0xC0) != 0x80) 1.296 + INVALID(ReportInvalidCharacter, i, m); 1.297 + 1.298 + // Determine the code unit's length in jschars and act accordingly. 1.299 + v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n); 1.300 + if (v < 0x10000) { 1.301 + // The n-byte UTF8 code unit will fit in a single jschar. 1.302 + if (action == Copy) 1.303 + dst[j] = jschar(v); 1.304 + 1.305 + } else { 1.306 + v -= 0x10000; 1.307 + if (v <= 0xFFFFF) { 1.308 + // The n-byte UTF8 code unit will fit in two jschars. 1.309 + if (action == Copy) 1.310 + dst[j] = jschar((v >> 10) + 0xD800); 1.311 + j++; 1.312 + if (action == Copy) 1.313 + dst[j] = jschar((v & 0x3FF) + 0xDC00); 1.314 + 1.315 + } else { 1.316 + // The n-byte UTF8 code unit won't fit in two jschars. 1.317 + INVALID(ReportTooBigCharacter, v, 1); 1.318 + } 1.319 + } 1.320 + 1.321 + invalidMultiByteCodeUnit: 1.322 + // Move i to the last byte of the multi-byte code unit; the loop 1.323 + // header will do the final i++ to move to the start of the next 1.324 + // code unit. 1.325 + i += n - 1; 1.326 + } 1.327 + } 1.328 + 1.329 + *dstlenp = j; 1.330 + 1.331 + return true; 1.332 +} 1.333 + 1.334 +typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip); 1.335 + 1.336 +static TwoByteCharsZ 1.337 +InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen) 1.338 +{ 1.339 + *outlen = 0; 1.340 + 1.341 + bool isAscii; 1.342 + if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii)) 1.343 + return TwoByteCharsZ(); 1.344 + 1.345 + jschar *dst = cx->pod_malloc<jschar>(*outlen + 1); // +1 for NUL 1.346 + if (!dst) 1.347 + return TwoByteCharsZ(); 1.348 + 1.349 + if (isAscii) { 1.350 + size_t srclen = src.length(); 1.351 + JS_ASSERT(*outlen == srclen); 1.352 + for (uint32_t i = 0; i < srclen; i++) 1.353 + dst[i] = jschar(src[i]); 1.354 + 1.355 + } else { 1.356 + JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii)); 1.357 + } 1.358 + 1.359 + dst[*outlen] = 0; // NUL char 1.360 + 1.361 + return TwoByteCharsZ(dst, *outlen); 1.362 +} 1.363 + 1.364 +TwoByteCharsZ 1.365 +JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) 1.366 +{ 1.367 + return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndReportInvalids>, 1.368 + outlen); 1.369 +} 1.370 + 1.371 +TwoByteCharsZ 1.372 +JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) 1.373 +{ 1.374 + return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndIgnoreInvalids>, 1.375 + outlen); 1.376 +} 1.377 +