js/src/vm/CharacterEncoding.cpp

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/vm/CharacterEncoding.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,374 @@
     1.4 +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
     1.5 + * vim: set ts=8 sts=4 et sw=4 tw=99:
     1.6 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.7 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.8 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.9 +
    1.10 +#include "js/CharacterEncoding.h"
    1.11 +
    1.12 +#include "jscntxt.h"
    1.13 +#include "jsprf.h"
    1.14 +
    1.15 +using namespace JS;
    1.16 +
    1.17 +Latin1CharsZ
    1.18 +JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
    1.19 +{
    1.20 +    JS_ASSERT(cx);
    1.21 +    size_t len = tbchars.length();
    1.22 +    unsigned char *latin1 = cx->pod_malloc<unsigned char>(len + 1);
    1.23 +    if (!latin1)
    1.24 +        return Latin1CharsZ();
    1.25 +    for (size_t i = 0; i < len; ++i)
    1.26 +        latin1[i] = static_cast<unsigned char>(tbchars[i]);
    1.27 +    latin1[len] = '\0';
    1.28 +    return Latin1CharsZ(latin1, len);
    1.29 +}
    1.30 +
    1.31 +static size_t
    1.32 +GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars)
    1.33 +{
    1.34 +    size_t nbytes;
    1.35 +    const jschar *end;
    1.36 +    unsigned c, c2;
    1.37 +
    1.38 +    nbytes = nchars;
    1.39 +    for (end = chars + nchars; chars != end; chars++) {
    1.40 +        c = *chars;
    1.41 +        if (c < 0x80)
    1.42 +            continue;
    1.43 +        if (0xD800 <= c && c <= 0xDFFF) {
    1.44 +            /* nbytes sets 1 length since this is surrogate pair. */
    1.45 +            if (c >= 0xDC00 || (chars + 1) == end) {
    1.46 +                nbytes += 2; /* Bad Surrogate */
    1.47 +                continue;
    1.48 +            }
    1.49 +            c2 = chars[1];
    1.50 +            if (c2 < 0xDC00 || c2 > 0xDFFF) {
    1.51 +                nbytes += 2; /* Bad Surrogate */
    1.52 +                continue;
    1.53 +            }
    1.54 +            c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
    1.55 +            nbytes--;
    1.56 +            chars++;
    1.57 +        }
    1.58 +        c >>= 11;
    1.59 +        nbytes++;
    1.60 +        while (c) {
    1.61 +            c >>= 5;
    1.62 +            nbytes++;
    1.63 +        }
    1.64 +    }
    1.65 +    return nbytes;
    1.66 +}
    1.67 +
    1.68 +static bool
    1.69 +PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp) {
    1.70 +    if (*dstlenp < 3)
    1.71 +        return false;
    1.72 +    *(*dst)++ = (char) 0xEF;
    1.73 +    *(*dst)++ = (char) 0xBF;
    1.74 +    *(*dst)++ = (char) 0xBD;
    1.75 +    *dstlenp -= 3;
    1.76 +    return true;
    1.77 +}
    1.78 +
    1.79 +/*
    1.80 + * Write up to |*dstlenp| bytes into |dst|.  Writes the number of bytes used
    1.81 + * into |*dstlenp| on success.  Returns false on failure.
    1.82 + */
    1.83 +static bool
    1.84 +DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen,
    1.85 +                          char *dst, size_t *dstlenp)
    1.86 +{
    1.87 +    size_t dstlen = *dstlenp;
    1.88 +    size_t origDstlen = dstlen;
    1.89 +
    1.90 +    while (srclen) {
    1.91 +        uint32_t v;
    1.92 +        jschar c = *src++;
    1.93 +        srclen--;
    1.94 +        if (c >= 0xDC00 && c <= 0xDFFF) {
    1.95 +            if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
    1.96 +                goto bufferTooSmall;
    1.97 +            continue;
    1.98 +        } else if (c < 0xD800 || c > 0xDBFF) {
    1.99 +            v = c;
   1.100 +        } else {
   1.101 +            if (srclen < 1) {
   1.102 +                if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
   1.103 +                    goto bufferTooSmall;
   1.104 +                continue;
   1.105 +            }
   1.106 +            jschar c2 = *src;
   1.107 +            if ((c2 < 0xDC00) || (c2 > 0xDFFF)) {
   1.108 +                if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
   1.109 +                    goto bufferTooSmall;
   1.110 +                continue;
   1.111 +            }
   1.112 +            src++;
   1.113 +            srclen--;
   1.114 +            v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
   1.115 +        }
   1.116 +        size_t utf8Len;
   1.117 +        if (v < 0x0080) {
   1.118 +            /* no encoding necessary - performance hack */
   1.119 +            if (dstlen == 0)
   1.120 +                goto bufferTooSmall;
   1.121 +            *dst++ = (char) v;
   1.122 +            utf8Len = 1;
   1.123 +        } else {
   1.124 +            uint8_t utf8buf[4];
   1.125 +            utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v);
   1.126 +            if (utf8Len > dstlen)
   1.127 +                goto bufferTooSmall;
   1.128 +            for (size_t i = 0; i < utf8Len; i++)
   1.129 +                *dst++ = (char) utf8buf[i];
   1.130 +        }
   1.131 +        dstlen -= utf8Len;
   1.132 +    }
   1.133 +    *dstlenp = (origDstlen - dstlen);
   1.134 +    return true;
   1.135 +
   1.136 +bufferTooSmall:
   1.137 +    *dstlenp = (origDstlen - dstlen);
   1.138 +    if (cx->isJSContext())
   1.139 +        JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr,
   1.140 +                             JSMSG_BUFFER_TOO_SMALL);
   1.141 +    return false;
   1.142 +}
   1.143 +
   1.144 +
   1.145 +UTF8CharsZ
   1.146 +JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
   1.147 +{
   1.148 +    JS_ASSERT(cx);
   1.149 +
   1.150 +    /* Get required buffer size. */
   1.151 +    jschar *str = tbchars.start().get();
   1.152 +    size_t len = GetDeflatedUTF8StringLength(str, tbchars.length());
   1.153 +
   1.154 +    /* Allocate buffer. */
   1.155 +    unsigned char *utf8 = cx->pod_malloc<unsigned char>(len + 1);
   1.156 +    if (!utf8)
   1.157 +        return UTF8CharsZ();
   1.158 +
   1.159 +    /* Encode to UTF8. */
   1.160 +    DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len);
   1.161 +    utf8[len] = '\0';
   1.162 +
   1.163 +    return UTF8CharsZ(utf8, len);
   1.164 +}
   1.165 +
   1.166 +static const uint32_t INVALID_UTF8 = UINT32_MAX;
   1.167 +
   1.168 +/*
   1.169 + * Convert a utf8 character sequence into a UCS-4 character and return that
   1.170 + * character.  It is assumed that the caller already checked that the sequence
   1.171 + * is valid.
   1.172 + */
   1.173 +uint32_t
   1.174 +JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length)
   1.175 +{
   1.176 +    JS_ASSERT(1 <= utf8Length && utf8Length <= 4);
   1.177 +
   1.178 +    if (utf8Length == 1) {
   1.179 +        JS_ASSERT(!(*utf8Buffer & 0x80));
   1.180 +        return *utf8Buffer;
   1.181 +    }
   1.182 +
   1.183 +    /* from Unicode 3.1, non-shortest form is illegal */
   1.184 +    static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
   1.185 +
   1.186 +    JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
   1.187 +              (0x100 - (1 << (8 - utf8Length))));
   1.188 +    uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
   1.189 +    uint32_t minucs4Char = minucs4Table[utf8Length - 2];
   1.190 +    while (--utf8Length) {
   1.191 +        JS_ASSERT((*utf8Buffer & 0xC0) == 0x80);
   1.192 +        ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
   1.193 +    }
   1.194 +
   1.195 +    if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
   1.196 +        return INVALID_UTF8;
   1.197 +
   1.198 +    return ucs4Char;
   1.199 +}
   1.200 +
   1.201 +static void
   1.202 +ReportInvalidCharacter(JSContext *cx, uint32_t offset)
   1.203 +{
   1.204 +    char buffer[10];
   1.205 +    JS_snprintf(buffer, 10, "%d", offset);
   1.206 +    JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
   1.207 +                                 JSMSG_MALFORMED_UTF8_CHAR, buffer);
   1.208 +}
   1.209 +
   1.210 +static void
   1.211 +ReportBufferTooSmall(JSContext *cx, uint32_t dummy)
   1.212 +{
   1.213 +    JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
   1.214 +}
   1.215 +
   1.216 +static void
   1.217 +ReportTooBigCharacter(JSContext *cx, uint32_t v)
   1.218 +{
   1.219 +    char buffer[10];
   1.220 +    JS_snprintf(buffer, 10, "0x%x", v + 0x10000);
   1.221 +    JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
   1.222 +                                 JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
   1.223 +}
   1.224 +
   1.225 +enum InflateUTF8Action {
   1.226 +    CountAndReportInvalids,
   1.227 +    CountAndIgnoreInvalids,
   1.228 +    Copy
   1.229 +};
   1.230 +
   1.231 +static const uint32_t REPLACE_UTF8 = 0xFFFD;
   1.232 +
   1.233 +// If making changes to this algorithm, make sure to also update
   1.234 +// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
   1.235 +template <InflateUTF8Action action>
   1.236 +static bool
   1.237 +InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp,
   1.238 +                          bool *isAsciip)
   1.239 +{
   1.240 +    *isAsciip = true;
   1.241 +
   1.242 +    // First, count how many jschars need to be in the inflated string.
   1.243 +    // |i| is the index into |src|, and |j| is the the index into |dst|.
   1.244 +    size_t srclen = src.length();
   1.245 +    uint32_t j = 0;
   1.246 +    for (uint32_t i = 0; i < srclen; i++, j++) {
   1.247 +        uint32_t v = uint32_t(src[i]);
   1.248 +        if (!(v & 0x80)) {
   1.249 +            // ASCII code unit.  Simple copy.
   1.250 +            if (action == Copy)
   1.251 +                dst[j] = jschar(v);
   1.252 +
   1.253 +        } else {
   1.254 +            // Non-ASCII code unit.  Determine its length in bytes (n).
   1.255 +            *isAsciip = false;
   1.256 +            uint32_t n = 1;
   1.257 +            while (v & (0x80 >> n))
   1.258 +                n++;
   1.259 +
   1.260 +        #define INVALID(report, arg, n2)                                \
   1.261 +            do {                                                        \
   1.262 +                if (action == CountAndReportInvalids) {                 \
   1.263 +                    report(cx, arg);                                    \
   1.264 +                    return false;                                       \
   1.265 +                } else {                                                \
   1.266 +                    if (action == Copy)                                 \
   1.267 +                        dst[j] = jschar(REPLACE_UTF8);                  \
   1.268 +                    else                                                \
   1.269 +                        JS_ASSERT(action == CountAndIgnoreInvalids);    \
   1.270 +                    n = n2;                                             \
   1.271 +                    goto invalidMultiByteCodeUnit;                      \
   1.272 +                }                                                       \
   1.273 +            } while (0)
   1.274 +
   1.275 +            // Check the leading byte.
   1.276 +            if (n < 2 || n > 4)
   1.277 +                INVALID(ReportInvalidCharacter, i, 1);
   1.278 +
   1.279 +            // Check that |src| is large enough to hold an n-byte code unit.
   1.280 +            if (i + n > srclen)
   1.281 +                INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
   1.282 +
   1.283 +            // Check the second byte.  From Unicode Standard v6.2, Table 3-7
   1.284 +            // Well-Formed UTF-8 Byte Sequences.
   1.285 +            if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
   1.286 +                (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
   1.287 +                (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
   1.288 +                (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
   1.289 +            {
   1.290 +                INVALID(ReportInvalidCharacter, i, 1);
   1.291 +            }
   1.292 +
   1.293 +            // Check the continuation bytes.
   1.294 +            for (uint32_t m = 1; m < n; m++)
   1.295 +                if ((src[i + m] & 0xC0) != 0x80)
   1.296 +                    INVALID(ReportInvalidCharacter, i, m);
   1.297 +
   1.298 +            // Determine the code unit's length in jschars and act accordingly.
   1.299 +            v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n);
   1.300 +            if (v < 0x10000) {
   1.301 +                // The n-byte UTF8 code unit will fit in a single jschar.
   1.302 +                if (action == Copy)
   1.303 +                    dst[j] = jschar(v);
   1.304 +
   1.305 +            } else {
   1.306 +                v -= 0x10000;
   1.307 +                if (v <= 0xFFFFF) {
   1.308 +                    // The n-byte UTF8 code unit will fit in two jschars.
   1.309 +                    if (action == Copy)
   1.310 +                        dst[j] = jschar((v >> 10) + 0xD800);
   1.311 +                    j++;
   1.312 +                    if (action == Copy)
   1.313 +                        dst[j] = jschar((v & 0x3FF) + 0xDC00);
   1.314 +
   1.315 +                } else {
   1.316 +                    // The n-byte UTF8 code unit won't fit in two jschars.
   1.317 +                    INVALID(ReportTooBigCharacter, v, 1);
   1.318 +                }
   1.319 +            }
   1.320 +
   1.321 +          invalidMultiByteCodeUnit:
   1.322 +            // Move i to the last byte of the multi-byte code unit;  the loop
   1.323 +            // header will do the final i++ to move to the start of the next
   1.324 +            // code unit.
   1.325 +            i += n - 1;
   1.326 +        }
   1.327 +    }
   1.328 +
   1.329 +    *dstlenp = j;
   1.330 +
   1.331 +    return true;
   1.332 +}
   1.333 +
   1.334 +typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip);
   1.335 +
   1.336 +static TwoByteCharsZ
   1.337 +InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen)
   1.338 +{
   1.339 +    *outlen = 0;
   1.340 +
   1.341 +    bool isAscii;
   1.342 +    if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii))
   1.343 +        return TwoByteCharsZ();
   1.344 +
   1.345 +    jschar *dst = cx->pod_malloc<jschar>(*outlen + 1);  // +1 for NUL
   1.346 +    if (!dst)
   1.347 +        return TwoByteCharsZ();
   1.348 +
   1.349 +    if (isAscii) {
   1.350 +        size_t srclen = src.length();
   1.351 +        JS_ASSERT(*outlen == srclen);
   1.352 +        for (uint32_t i = 0; i < srclen; i++)
   1.353 +            dst[i] = jschar(src[i]);
   1.354 +
   1.355 +    } else {
   1.356 +        JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii));
   1.357 +    }
   1.358 +
   1.359 +    dst[*outlen] = 0;    // NUL char
   1.360 +
   1.361 +    return TwoByteCharsZ(dst, *outlen);
   1.362 +}
   1.363 +
   1.364 +TwoByteCharsZ
   1.365 +JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
   1.366 +{
   1.367 +    return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndReportInvalids>,
   1.368 +                                   outlen);
   1.369 +}
   1.370 +
   1.371 +TwoByteCharsZ
   1.372 +JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
   1.373 +{
   1.374 +    return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndIgnoreInvalids>,
   1.375 +                                   outlen);
   1.376 +}
   1.377 +

mercurial