// NOTE: Requires testharness.js
// http://www.w3.org/2008/webapps/wiki/Harness

/**
 * Extension to the testharness.js API which avoids logging enormous strings
 * on a coding failure.
 *
 * On mismatch, the failure message names only the string lengths or the
 * first differing code unit rather than embedding both strings.
 *
 * @param {string} actual - String produced by the code under test.
 * @param {string} expected - String the test expects.
 * @param {string} description - Prefix used for every assertion message.
 */
function assert_string_equals(actual, expected, description) {
  // short circuit success case
  if (actual === expected) {
    assert_true(true, description + ": === ");
    return;
  }

  // length check
  assert_equals(actual.length, expected.length,
                description + ": string lengths");

  var i, a, b;
  for (i = 0; i < actual.length; i++) {
    a = actual.charCodeAt(i);
    b = expected.charCodeAt(i);
    if (a !== b) {
      assert_true(false,
                  description +
                  ": code unit " + i.toString() + " unequal: " +
                  cpname(a) + " != " + cpname(b)); // doesn't return
    }
  }

  // It should be impossible to get here, because the initial
  // comparison failed, so either the length comparison or the
  // codeunit-by-codeunit comparison should also fail.
  assert_true(false, description + ": failed to detect string difference");
}

// Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html

/**
 * Encodes a string as UTF-8 using the encodeURIComponent/unescape idiom.
 *
 * @param {string} string - Text to encode.
 * @returns {Uint8Array} One element per UTF-8 octet.
 */
function encode_utf8(string) {
  // unescape() turns each %XX escape into a single char whose code is the
  // octet value, yielding a "binary string" of the UTF-8 bytes.
  var utf8 = unescape(encodeURIComponent(string));
  var octets = new Uint8Array(utf8.length), i;
  for (i = 0; i < utf8.length; i += 1) {
    octets[i] = utf8.charCodeAt(i);
  }
  return octets;
}

/**
 * Decodes UTF-8 octets into a string using the escape/decodeURIComponent
 * idiom.
 *
 * @param {Uint8Array|number[]} octets - Array-like of UTF-8 octet values.
 * @returns {string} The decoded text.
 */
function decode_utf8(octets) {
  // Build the binary string in bounded chunks: passing a very large array
  // through Function.prototype.apply can exceed the engine's argument limit.
  var CHUNK = 0x8000;
  var utf8 = "", i;
  for (i = 0; i < octets.length; i += CHUNK) {
    utf8 += String.fromCharCode.apply(
        null, Array.prototype.slice.call(octets, i, i + CHUNK));
  }
  return decodeURIComponent(escape(utf8));
}

// Helpers for test_utf_roundtrip.
/**
 * Formats a code point (or code unit) value for failure messages.
 *
 * @param {*} n - Value to format.
 * @returns {string} "U+XXXX" (4 hex digits up to 0xFFFF, 6 above) for
 *     numbers; otherwise the value's own toString() result.
 */
function cpname(n) {
  // n + 0 !== n holds for anything that is not a plain number (and for NaN).
  if (n + 0 !== n) {
    return n.toString();
  }
  var w = (n <= 0xFFFF) ? 4 : 6;
  return 'U+' + ('000000' + n.toString(16).toUpperCase()).slice(-w);
}

/**
 * Builds a string holding every code point in [from, from + len), skipping
 * the surrogate range U+D800..U+DFFF.
 *
 * @param {number} from - First code point of the block.
 * @param {number} len - Number of consecutive code points to cover.
 * @returns {string} The code points as UTF-16 code units.
 */
function genblock(from, len) {
  var i, j, point, offset;
  var size, block;

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  function overlap(min1, max1, min2, max2) {
    // Length of the intersection of two half-open ranges, never negative.
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }
  size = (overlap(from, from + len, 0x000000, 0x00D800) +
          overlap(from, from + len, 0x00E000, 0x010000) +
          overlap(from, from + len, 0x010000, 0x110000) * 2);

  block = new Uint16Array(size);
  for (i = 0, j = 0; i < len; i++) {
    point = from + i;
    if (0xD800 <= point && point <= 0xDFFF) {
      continue;
    } else if (point <= 0xFFFF) {
      block[j++] = point;
    } else {
      // Split a supplementary code point into a surrogate pair.
      offset = point - 0x10000;
      block[j++] = 0xD800 + (offset >> 10);
      block[j++] = 0xDC00 + (offset & 0x3FF);
    }
  }
  return String.fromCharCode.apply(null, block);
}

/**
 * Round-trips every code point from U+0000 through U+10FFFF, one block of
 * BLOCK_SIZE code points at a time, through UTF-16LE, UTF-16BE and UTF-8,
 * and checks the UTF-8 results against encode_utf8/decode_utf8.
 *
 * NOTE(review): the current Encoding Standard's TextEncoder is UTF-8 only;
 * this test presumably targets an earlier API draft in which the
 * constructor accepted an encoding label -- confirm before relying on the
 * UTF-16 cases in a modern engine.
 */
function test_utf_roundtrip () {
  var MIN_CODEPOINT = 0;
  var MAX_CODEPOINT = 0x10FFFF;
  var BLOCK_SIZE = 0x1000;

  var block, block_tag, i, encoded, decoded, exp_encoded, exp_decoded;

  var TE_U16LE = new TextEncoder("UTF-16LE");
  var TD_U16LE = new TextDecoder("UTF-16LE");

  var TE_U16BE = new TextEncoder("UTF-16BE");
  var TD_U16BE = new TextDecoder("UTF-16BE");

  var TE_U8    = new TextEncoder("UTF-8");
  var TD_U8    = new TextDecoder("UTF-8");

  for (i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
    block = genblock(i, BLOCK_SIZE);

    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
    encoded = TE_U16LE.encode(block);
    decoded = TD_U16LE.decode(encoded);
    assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag);

    encoded = TE_U16BE.encode(block);
    decoded = TD_U16BE.decode(encoded);
    assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag);

    encoded = TE_U8.encode(block);
    decoded = TD_U8.decode(encoded);
    assert_string_equals(block, decoded, "UTF-8 round trip " + block_tag);

    // test TextEncoder(UTF-8) against the older idiom
    exp_encoded = encode_utf8(block);
    assert_array_equals(encoded, exp_encoded,
                        "UTF-8 reference encoding " + block_tag);

    exp_decoded = decode_utf8(exp_encoded);
    assert_string_equals(decoded, exp_decoded,
                         "UTF-8 reference decoding " + block_tag);
  }
}

/**
 * Encodes and decodes one fixed sample string in each listed encoding and
 * compares the results against hard-coded byte sequences.
 */
function test_utf_samples () {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
  var cases = [
    { encoding: "utf-8",
      expected: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD] },
    { encoding: "utf-16le",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16be",
      expected: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD] }
  ];

  cases.forEach(
    function(t) {
      var encoded = new TextEncoder(t.encoding).encode(sample);
      assert_array_equals(encoded, t.expected,
                          "expected equal encodings - " + t.encoding);

      var decoded = new TextDecoder(t.encoding)
                        .decode(new Uint8Array(t.expected));
      assert_equals(decoded, sample,
                    "expected equal decodings - " + t.encoding);
    });
}

test(test_utf_samples,
     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample");

test(test_utf_roundtrip,
     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and "+
     "agreement with encode/decodeURIComponent");