michael@0: /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- michael@0: * vim: set ts=8 sts=4 et sw=4 tw=99: michael@0: * This Source Code Form is subject to the terms of the Mozilla Public michael@0: * License, v. 2.0. If a copy of the MPL was not distributed with this michael@0: * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ michael@0: michael@0: #include "js/CharacterEncoding.h" michael@0: michael@0: #include "jscntxt.h" michael@0: #include "jsprf.h" michael@0: michael@0: using namespace JS; michael@0: michael@0: Latin1CharsZ michael@0: JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) michael@0: { michael@0: JS_ASSERT(cx); michael@0: size_t len = tbchars.length(); michael@0: unsigned char *latin1 = cx->pod_malloc(len + 1); michael@0: if (!latin1) michael@0: return Latin1CharsZ(); michael@0: for (size_t i = 0; i < len; ++i) michael@0: latin1[i] = static_cast(tbchars[i]); michael@0: latin1[len] = '\0'; michael@0: return Latin1CharsZ(latin1, len); michael@0: } michael@0: michael@0: static size_t michael@0: GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars) michael@0: { michael@0: size_t nbytes; michael@0: const jschar *end; michael@0: unsigned c, c2; michael@0: michael@0: nbytes = nchars; michael@0: for (end = chars + nchars; chars != end; chars++) { michael@0: c = *chars; michael@0: if (c < 0x80) michael@0: continue; michael@0: if (0xD800 <= c && c <= 0xDFFF) { michael@0: /* nbytes sets 1 length since this is surrogate pair. */ michael@0: if (c >= 0xDC00 || (chars + 1) == end) { michael@0: nbytes += 2; /* Bad Surrogate */ michael@0: continue; michael@0: } michael@0: c2 = chars[1]; michael@0: if (c2 < 0xDC00 || c2 > 0xDFFF) { michael@0: nbytes += 2; /* Bad Surrogate */ michael@0: continue; michael@0: } michael@0: c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; michael@0: nbytes--; michael@0: chars++; michael@0: } michael@0: c >>= 11; michael@0: nbytes++; michael@0: while (c) { michael@0: c >>= 5; michael@0: nbytes++; michael@0: } michael@0: } michael@0: return nbytes; michael@0: } michael@0: michael@0: static bool michael@0: PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp) { michael@0: if (*dstlenp < 3) michael@0: return false; michael@0: *(*dst)++ = (char) 0xEF; michael@0: *(*dst)++ = (char) 0xBF; michael@0: *(*dst)++ = (char) 0xBD; michael@0: *dstlenp -= 3; michael@0: return true; michael@0: } michael@0: michael@0: /* michael@0: * Write up to |*dstlenp| bytes into |dst|. Writes the number of bytes used michael@0: * into |*dstlenp| on success. Returns false on failure. michael@0: */ michael@0: static bool michael@0: DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen, michael@0: char *dst, size_t *dstlenp) michael@0: { michael@0: size_t dstlen = *dstlenp; michael@0: size_t origDstlen = dstlen; michael@0: michael@0: while (srclen) { michael@0: uint32_t v; michael@0: jschar c = *src++; michael@0: srclen--; michael@0: if (c >= 0xDC00 && c <= 0xDFFF) { michael@0: if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) michael@0: goto bufferTooSmall; michael@0: continue; michael@0: } else if (c < 0xD800 || c > 0xDBFF) { michael@0: v = c; michael@0: } else { michael@0: if (srclen < 1) { michael@0: if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) michael@0: goto bufferTooSmall; michael@0: continue; michael@0: } michael@0: jschar c2 = *src; michael@0: if ((c2 < 0xDC00) || (c2 > 0xDFFF)) { michael@0: if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) michael@0: goto bufferTooSmall; michael@0: continue; michael@0: } michael@0: src++; michael@0: srclen--; michael@0: v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; michael@0: } michael@0: size_t utf8Len; michael@0: if (v < 0x0080) { michael@0: /* no encoding necessary - performance hack */ michael@0: if (dstlen == 0) michael@0: goto bufferTooSmall; michael@0: *dst++ = (char) v; michael@0: utf8Len = 1; michael@0: } else { michael@0: uint8_t utf8buf[4]; michael@0: utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v); michael@0: if (utf8Len > dstlen) michael@0: goto bufferTooSmall; michael@0: for (size_t i = 0; i < utf8Len; i++) michael@0: *dst++ = (char) utf8buf[i]; michael@0: } michael@0: dstlen -= utf8Len; michael@0: } michael@0: *dstlenp = (origDstlen - dstlen); michael@0: return true; michael@0: michael@0: bufferTooSmall: michael@0: *dstlenp = (origDstlen - dstlen); michael@0: if (cx->isJSContext()) michael@0: JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr, michael@0: JSMSG_BUFFER_TOO_SMALL); michael@0: return false; michael@0: } michael@0: michael@0: michael@0: UTF8CharsZ michael@0: JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) michael@0: { michael@0: JS_ASSERT(cx); michael@0: michael@0: /* Get required buffer size. */ michael@0: jschar *str = tbchars.start().get(); michael@0: size_t len = GetDeflatedUTF8StringLength(str, tbchars.length()); michael@0: michael@0: /* Allocate buffer. */ michael@0: unsigned char *utf8 = cx->pod_malloc(len + 1); michael@0: if (!utf8) michael@0: return UTF8CharsZ(); michael@0: michael@0: /* Encode to UTF8. */ michael@0: DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len); michael@0: utf8[len] = '\0'; michael@0: michael@0: return UTF8CharsZ(utf8, len); michael@0: } michael@0: michael@0: static const uint32_t INVALID_UTF8 = UINT32_MAX; michael@0: michael@0: /* michael@0: * Convert a utf8 character sequence into a UCS-4 character and return that michael@0: * character. It is assumed that the caller already checked that the sequence michael@0: * is valid. michael@0: */ michael@0: uint32_t michael@0: JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length) michael@0: { michael@0: JS_ASSERT(1 <= utf8Length && utf8Length <= 4); michael@0: michael@0: if (utf8Length == 1) { michael@0: JS_ASSERT(!(*utf8Buffer & 0x80)); michael@0: return *utf8Buffer; michael@0: } michael@0: michael@0: /* from Unicode 3.1, non-shortest form is illegal */ michael@0: static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 }; michael@0: michael@0: JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) == michael@0: (0x100 - (1 << (8 - utf8Length)))); michael@0: uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1); michael@0: uint32_t minucs4Char = minucs4Table[utf8Length - 2]; michael@0: while (--utf8Length) { michael@0: JS_ASSERT((*utf8Buffer & 0xC0) == 0x80); michael@0: ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F); michael@0: } michael@0: michael@0: if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) michael@0: return INVALID_UTF8; michael@0: michael@0: return ucs4Char; michael@0: } michael@0: michael@0: static void michael@0: ReportInvalidCharacter(JSContext *cx, uint32_t offset) michael@0: { michael@0: char buffer[10]; michael@0: JS_snprintf(buffer, 10, "%d", offset); michael@0: JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, michael@0: JSMSG_MALFORMED_UTF8_CHAR, buffer); michael@0: } michael@0: michael@0: static void michael@0: ReportBufferTooSmall(JSContext *cx, uint32_t dummy) michael@0: { michael@0: JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL); michael@0: } michael@0: michael@0: static void michael@0: ReportTooBigCharacter(JSContext *cx, uint32_t v) michael@0: { michael@0: char buffer[10]; michael@0: JS_snprintf(buffer, 10, "0x%x", v + 0x10000); michael@0: JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, michael@0: JSMSG_UTF8_CHAR_TOO_LARGE, buffer); michael@0: } michael@0: michael@0: enum InflateUTF8Action { michael@0: CountAndReportInvalids, michael@0: CountAndIgnoreInvalids, michael@0: Copy michael@0: }; michael@0: michael@0: static const uint32_t REPLACE_UTF8 = 0xFFFD; michael@0: michael@0: // If making changes to this algorithm, make sure to also update michael@0: // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp michael@0: template michael@0: static bool michael@0: InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp, michael@0: bool *isAsciip) michael@0: { michael@0: *isAsciip = true; michael@0: michael@0: // First, count how many jschars need to be in the inflated string. michael@0: // |i| is the index into |src|, and |j| is the the index into |dst|. michael@0: size_t srclen = src.length(); michael@0: uint32_t j = 0; michael@0: for (uint32_t i = 0; i < srclen; i++, j++) { michael@0: uint32_t v = uint32_t(src[i]); michael@0: if (!(v & 0x80)) { michael@0: // ASCII code unit. Simple copy. michael@0: if (action == Copy) michael@0: dst[j] = jschar(v); michael@0: michael@0: } else { michael@0: // Non-ASCII code unit. Determine its length in bytes (n). michael@0: *isAsciip = false; michael@0: uint32_t n = 1; michael@0: while (v & (0x80 >> n)) michael@0: n++; michael@0: michael@0: #define INVALID(report, arg, n2) \ michael@0: do { \ michael@0: if (action == CountAndReportInvalids) { \ michael@0: report(cx, arg); \ michael@0: return false; \ michael@0: } else { \ michael@0: if (action == Copy) \ michael@0: dst[j] = jschar(REPLACE_UTF8); \ michael@0: else \ michael@0: JS_ASSERT(action == CountAndIgnoreInvalids); \ michael@0: n = n2; \ michael@0: goto invalidMultiByteCodeUnit; \ michael@0: } \ michael@0: } while (0) michael@0: michael@0: // Check the leading byte. michael@0: if (n < 2 || n > 4) michael@0: INVALID(ReportInvalidCharacter, i, 1); michael@0: michael@0: // Check that |src| is large enough to hold an n-byte code unit. michael@0: if (i + n > srclen) michael@0: INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1); michael@0: michael@0: // Check the second byte. From Unicode Standard v6.2, Table 3-7 michael@0: // Well-Formed UTF-8 Byte Sequences. michael@0: if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF michael@0: (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F michael@0: (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF michael@0: (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F michael@0: { michael@0: INVALID(ReportInvalidCharacter, i, 1); michael@0: } michael@0: michael@0: // Check the continuation bytes. michael@0: for (uint32_t m = 1; m < n; m++) michael@0: if ((src[i + m] & 0xC0) != 0x80) michael@0: INVALID(ReportInvalidCharacter, i, m); michael@0: michael@0: // Determine the code unit's length in jschars and act accordingly. michael@0: v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n); michael@0: if (v < 0x10000) { michael@0: // The n-byte UTF8 code unit will fit in a single jschar. michael@0: if (action == Copy) michael@0: dst[j] = jschar(v); michael@0: michael@0: } else { michael@0: v -= 0x10000; michael@0: if (v <= 0xFFFFF) { michael@0: // The n-byte UTF8 code unit will fit in two jschars. michael@0: if (action == Copy) michael@0: dst[j] = jschar((v >> 10) + 0xD800); michael@0: j++; michael@0: if (action == Copy) michael@0: dst[j] = jschar((v & 0x3FF) + 0xDC00); michael@0: michael@0: } else { michael@0: // The n-byte UTF8 code unit won't fit in two jschars. michael@0: INVALID(ReportTooBigCharacter, v, 1); michael@0: } michael@0: } michael@0: michael@0: invalidMultiByteCodeUnit: michael@0: // Move i to the last byte of the multi-byte code unit; the loop michael@0: // header will do the final i++ to move to the start of the next michael@0: // code unit. michael@0: i += n - 1; michael@0: } michael@0: } michael@0: michael@0: *dstlenp = j; michael@0: michael@0: return true; michael@0: } michael@0: michael@0: typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip); michael@0: michael@0: static TwoByteCharsZ michael@0: InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen) michael@0: { michael@0: *outlen = 0; michael@0: michael@0: bool isAscii; michael@0: if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii)) michael@0: return TwoByteCharsZ(); michael@0: michael@0: jschar *dst = cx->pod_malloc(*outlen + 1); // +1 for NUL michael@0: if (!dst) michael@0: return TwoByteCharsZ(); michael@0: michael@0: if (isAscii) { michael@0: size_t srclen = src.length(); michael@0: JS_ASSERT(*outlen == srclen); michael@0: for (uint32_t i = 0; i < srclen; i++) michael@0: dst[i] = jschar(src[i]); michael@0: michael@0: } else { michael@0: JS_ALWAYS_TRUE(InflateUTF8StringToBuffer(cx, src, dst, outlen, &isAscii)); michael@0: } michael@0: michael@0: dst[*outlen] = 0; // NUL char michael@0: michael@0: return TwoByteCharsZ(dst, *outlen); michael@0: } michael@0: michael@0: TwoByteCharsZ michael@0: JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) michael@0: { michael@0: return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer, michael@0: outlen); michael@0: } michael@0: michael@0: TwoByteCharsZ michael@0: JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) michael@0: { michael@0: return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer, michael@0: outlen); michael@0: } michael@0: