// NOTE: Requires testharness.js
// http://www.w3.org/2008/webapps/wiki/Harness

/**
 * Extension to the testharness.js API which avoids logging enormous strings
 * on a coding failure.
 *
 * Instead of handing both complete strings to the harness, this reports
 * either a length mismatch or the first differing code unit (formatted with
 * cpname(), which is defined elsewhere in this file).
 *
 * @param {string} actual - The string produced by the code under test.
 * @param {string} expected - The string it is expected to equal.
 * @param {string} description - Prefix for every assertion message.
 */
function assert_string_equals(actual, expected, description) {
  // short circuit success case
  if (actual === expected) {
    assert_true(true, description + ": <actual> === <expected>");
    return;
  }

  // length check
  assert_equals(actual.length, expected.length,
                description + ": string lengths");

  var i, a, b;
  for (i = 0; i < actual.length; i++) {
    a = actual.charCodeAt(i);
    b = expected.charCodeAt(i);
    if (a !== b) {
      assert_true(false,
                  description +
                  ": code unit " + i.toString() + " unequal: " +
                  cpname(a) + " != " + cpname(b)); // doesn't return
    }
  }

  // It should be impossible to get here, because the initial
  // comparison failed, so either the length comparison or the
  // codeunit-by-codeunit comparison should also fail.
  assert_true(false, description + ": failed to detect string difference");
}

// Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html

/**
 * Reference UTF-8 encoder built on the older encodeURIComponent/unescape
 * idiom.
 *
 * @param {string} string - The text to encode.
 * @returns {Uint8Array} The UTF-8 octets of `string`.
 */
function encode_utf8(string) {
  // encodeURIComponent percent-escapes the UTF-8 bytes; unescape turns each
  // escape back into a single character whose code is that byte's value.
  var utf8 = unescape(encodeURIComponent(string));
  var octets = new Uint8Array(utf8.length), i;
  for (i = 0; i < utf8.length; i += 1) {
    octets[i] = utf8.charCodeAt(i);
  }
  return octets;
}

/**
 * Reference UTF-8 decoder built on the older escape/decodeURIComponent
 * idiom.
 *
 * @param {Uint8Array|Array<number>} octets - UTF-8 octets to decode.
 * @returns {string} The decoded text.
 */
function decode_utf8(octets) {
  // Build the one-character-per-octet string in bounded chunks: passing the
  // whole array to String.fromCharCode.apply() makes every octet a call
  // argument, which throws a RangeError once the input exceeds the engine's
  // argument limit.
  var CHUNK_SIZE = 0x2000;
  var utf8 = "", i;
  for (i = 0; i < octets.length; i += CHUNK_SIZE) {
    utf8 += String.fromCharCode.apply(
      null, Array.prototype.slice.call(octets, i, i + CHUNK_SIZE));
  }
  return decodeURIComponent(escape(utf8));
}

// Helpers for test_utf_roundtrip.
/**
 * Format a code point (or code unit) for use in assertion messages.
 *
 * @param {*} n - Value to format.
 * @returns {string} "U+XXXX" (4 hex digits up to 0xFFFF, otherwise 6) when
 *     `n` is a number; otherwise `n.toString()`.
 */
function cpname(n) {
  // n+0 is identical to n only for numbers (a string would be concatenated).
  if (n+0 !== n)
    return n.toString();
  var w = (n <= 0xFFFF) ? 4 : 6;
  return 'U+' + ('000000' + n.toString(16).toUpperCase()).slice(-w);
}

/**
 * Build a string containing every code point in [from, from + len),
 * skipping the surrogate range U+D800..U+DFFF and encoding code points
 * above U+FFFF as surrogate pairs.
 *
 * @param {number} from - First code point of the block.
 * @param {number} len - Number of code points covered by the block.
 * @returns {string} The generated string.
 */
function genblock(from, len) {
  var CHUNK_SIZE = 0x2000;
  var i, j, point, offset;
  var size, block, result;

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  function overlap(min1, max1, min2, max2) {
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }
  size = (overlap(from, from+len, 0x000000, 0x00D800) +
          overlap(from, from+len, 0x00E000, 0x010000) +
          overlap(from, from+len, 0x010000, 0x110000)*2);

  block = new Uint16Array(size);
  for (i = 0, j = 0; i < len; i++) {
    point = from + i;
    if (0xD800 <= point && point <= 0xDFFF)
      continue;
    else if (point <= 0xFFFF)
      block[j++] = point;
    else {
      offset = point - 0x10000;
      block[j++] = 0xD800 + (offset >> 10);
      block[j++] = 0xDC00 + (offset & 0x3FF);
    }
  }

  // Convert in bounded chunks: String.fromCharCode.apply() over the whole
  // block turns every code unit into a call argument and throws a
  // RangeError for large blocks. Splitting a surrogate pair across two
  // chunks is harmless because the code units are simply concatenated.
  result = "";
  for (i = 0; i < size; i += CHUNK_SIZE) {
    result += String.fromCharCode.apply(null,
                                        block.subarray(i, i + CHUNK_SIZE));
  }
  return result;
}

/**
 * Round-trip every non-surrogate code point, one BLOCK_SIZE block at a time,
 * through UTF-16LE, UTF-16BE and UTF-8 encoder/decoder pairs, and check the
 * UTF-8 results against encode_utf8()/decode_utf8().
 *
 * NOTE(review): this constructs TextEncoder with UTF-16 labels; confirm the
 * implementation under test still accepts an encoding label, since the
 * current Encoding Standard defines TextEncoder as UTF-8 only.
 */
function test_utf_roundtrip () {
  var MIN_CODEPOINT = 0;
  var MAX_CODEPOINT = 0x10FFFF;
  var BLOCK_SIZE = 0x1000;

  var block, block_tag, i, encoded, decoded, exp_encoded, exp_decoded;

  var TE_U16LE = new TextEncoder("UTF-16LE");
  var TD_U16LE = new TextDecoder("UTF-16LE");

  var TE_U16BE = new TextEncoder("UTF-16BE");
  var TD_U16BE = new TextDecoder("UTF-16BE");

  var TE_U8    = new TextEncoder("UTF-8");
  var TD_U8    = new TextDecoder("UTF-8");

  for (i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
    block = genblock(i, BLOCK_SIZE);

    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
    encoded = TE_U16LE.encode(block);
    decoded = TD_U16LE.decode(encoded);
    assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag);

    encoded = TE_U16BE.encode(block);
    decoded = TD_U16BE.decode(encoded);
    assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag);

    encoded = TE_U8.encode(block);
    decoded = TD_U8.decode(encoded);
    assert_string_equals(block, decoded, "UTF-8 round trip " + block_tag);

    // test TextEncoder(UTF-8) against the older idiom
    // (`encoded`/`decoded` still hold the UTF-8 results from just above)
    exp_encoded = encode_utf8(block);
    assert_array_equals(encoded, exp_encoded,
                        "UTF-8 reference encoding " + block_tag);

    exp_decoded = decode_utf8(exp_encoded);
    assert_string_equals(decoded, exp_decoded,
                         "UTF-8 reference decoding " + block_tag);
  }
}

/**
 * Encode and decode one fixed sample string in each listed encoding and
 * compare against hand-written expected byte sequences.
 */
function test_utf_samples () {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
  var cases = [
    { encoding: "utf-8",
      expected: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD] },
    { encoding: "utf-16le",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16be",
      expected: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD] }
  ];

  cases.forEach(
    function(t) {
      var encoded = new TextEncoder(t.encoding).encode(sample);
      assert_array_equals(encoded, t.expected,
                          "expected equal encodings - " + t.encoding);

      var decoded = new TextDecoder(t.encoding)
                        .decode(new Uint8Array(t.expected));
      assert_equals(decoded, sample,
                    "expected equal decodings - " + t.encoding);
    });
}

test(test_utf_samples,
     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample");

test(test_utf_roundtrip,
     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and "+
     "agreement with encode/decodeURIComponent");