dom/encoding/test/unit/test_misc.js

Fri, 16 Jan 2015 18:13:44 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 18:13:44 +0100
branch
TOR_BUG_9701
changeset 14
925c144e1f1f
permissions
-rw-r--r--

Integrate suggestion from review to improve consistency with existing code.

michael@0 1 // NOTE: Requires testharness.js
michael@0 2 // http://www.w3.org/2008/webapps/wiki/Harness
michael@0 3
// Round-trips strings containing unpaired or mis-ordered UTF-16 surrogates
// through a UTF-8 encode/decode and expects each offending code unit to come
// back as U+FFFD.
test(
  function() {
    var badStrings = [
      { input: '\ud800', expected: '\ufffd' }, // Surrogate half
      { input: '\udc00', expected: '\ufffd' }, // Surrogate half
      { input: 'abc\ud800def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: 'abc\udc00def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: '\udc00\ud800', expected: '\ufffd\ufffd' } // Wrong order
    ];

    badStrings.forEach(
      function(t) {
        var encoded = new TextEncoder('utf-8').encode(t.input);
        var decoded = new TextDecoder('utf-8').decode(encoded);
        // testharness.js takes (actual, expected); keeping that order makes
        // the failure message report the right value as "expected".
        assert_equals(decoded, t.expected);
      });
  },
  "bad data"
);
michael@0 23
// With {fatal: true}, decoding each malformed byte sequence below is expected
// to throw an EncodingError.
test(
  function() {
    // Pairs of [encoding label, list of malformed byte sequences].
    var malformedByEncoding = [
      ['utf-8', [
        [0xC0], // ends early
        [0xC0, 0x00], // invalid trail
        [0xC0, 0xC0], // invalid trail
        [0xE0], // ends early
        [0xE0, 0x00], // invalid trail
        [0xE0, 0xC0], // invalid trail
        [0xE0, 0x80, 0x00], // invalid trail
        [0xE0, 0x80, 0xC0], // invalid trail
        [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80] // > 0x10FFFF
      ]],
      ['utf-16le', [
        [0x00], // truncated code unit
        [0x00, 0xd8], // surrogate half
        [0x00, 0xd8, 0x00, 0x00], // surrogate half
        [0x00, 0xdc, 0x00, 0x00], // trail surrogate
        [0x00, 0xdc, 0x00, 0xd8] // swapped surrogates
      ]]
      // TODO: Single byte encoding cases
    ];

    malformedByEncoding.forEach(function(group) {
      var label = group[0];
      var sequences = group[1];

      sequences.forEach(function(bytes) {
        var decodeFatally = function () {
          new TextDecoder(label, {fatal: true}).decode(new Uint8Array(bytes));
        };
        assert_throws({name: 'EncodingError'}, decodeFatally);
      });
    });
  },
  "fatal flag"
);
michael@0 53
// A label must resolve to the same canonical encoding name whether it is
// given in lower case or upper case.
test(
  function() {
    // Pairs of [label, expected TextDecoder.encoding].
    var labelToName = [
      ['utf-8', 'utf-8'],
      ['utf-16', 'utf-16le'],
      ['utf-16le', 'utf-16le'],
      ['utf-16be', 'utf-16be'],
      ['ascii', 'windows-1252'],
      ['iso-8859-1', 'windows-1252']
    ];

    for (var i = 0; i < labelToName.length; ++i) {
      var label = labelToName[i][0];
      var canonical = labelToName[i][1];

      var lowerDecoder = new TextDecoder(label.toLowerCase());
      assert_equals(lowerDecoder.encoding, canonical);

      var upperDecoder = new TextDecoder(label.toUpperCase());
      assert_equals(upperDecoder.encoding, canonical);
    }
  },
  "Encoding names are case insensitive"
);
michael@0 73
// Checks BOM handling for utf-8, utf-16le and utf-16be: input without a BOM,
// with the matching BOM, with the matching BOM split across streamed chunks,
// and with another encoding's BOM (which must not decode to the sample).
test(
  function() {
    var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; // z, cent, CJK water, G-Clef, Private-use character

    // The same sample text serialized in each encoding, plus that encoding's BOM.
    var forms = [
      { label: 'utf-8',
        bom: [0xEF, 0xBB, 0xBF],
        body: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD] },
      { label: 'utf-16le',
        bom: [0xff, 0xfe],
        body: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
      { label: 'utf-16be',
        bom: [0xfe, 0xff],
        body: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD] }
    ];

    // One-shot decode of a plain byte array with a fresh decoder.
    function decodeAll(label, bytes) {
      return new TextDecoder(label).decode(new Uint8Array(bytes));
    }

    // missing BOMs
    forms.forEach(function(form) {
      assert_equals(decodeAll(form.label, form.body), string);
    });

    // matching BOMs
    forms.forEach(function(form) {
      assert_equals(decodeAll(form.label, form.bom.concat(form.body)), string);
    });

    // matching BOMs split
    forms.forEach(function(form) {
      // One decoder per encoding, reused for every split point of its BOM.
      var decoder = new TextDecoder(form.label);
      for (var cut = 1; cut < form.bom.length; ++cut) {
        var head = new Uint8Array(form.bom.slice(0, cut));
        assert_equals(decoder.decode(head, {stream: true}), '');
        var tail = new Uint8Array(form.bom.slice(cut).concat(form.body));
        assert_equals(decoder.decode(tail), string);
      }
    });

    // mismatching BOMs
    forms.forEach(function(form) {
      forms.forEach(function(other) {
        if (other === form) {
          return;
        }
        assert_not_equals(decodeAll(form.label, other.bom.concat(form.body)), string);
      });
    });
  },
  "Byte-order marks"
);
michael@0 120
// TextDecoder.encoding must report the canonical lower-case name for the
// label it was constructed with.
test(
  function () {
    // Pairs of [label passed to the constructor, expected .encoding].
    var expectations = [
      ["utf-8", "utf-8"], // canonical case
      ["UTF-16", "utf-16le"], // canonical case and name
      ["UTF-16BE", "utf-16be"], // canonical case and name
      ["iso8859-1", "windows-1252"], // canonical case and name
      ["iso-8859-1", "windows-1252"] // canonical case and name
    ];

    expectations.forEach(function (pair) {
      var decoder = new TextDecoder(pair[0]);
      assert_equals(decoder.encoding, pair[1]);
    });
  },
  "Encoding names"
);
michael@0 131
// Encodes a sample string, then feeds the bytes to a streaming decoder in
// chunks of 1 to 5 bytes; the concatenated output must equal the sample.
test(
  function () {
    var sample = "\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF";

    ["utf-8", "utf-16le", "utf-16be"].forEach(function (encoding) {
      var bytes = new TextEncoder(encoding).encode(sample);

      for (var chunkSize = 1; chunkSize <= 5; ++chunkSize) {
        var decoder = new TextDecoder(encoding);
        var pieces = [];

        for (var start = 0; start < bytes.length; start += chunkSize) {
          // subarray() clamps the end index, so the last chunk may be short.
          var chunk = new Uint8Array(bytes.subarray(start, start + chunkSize));
          pieces.push(decoder.decode(chunk, {stream: true}));
        }
        // Final call without {stream: true} ends the stream.
        pieces.push(decoder.decode());

        assert_equals(pieces.join(""), sample, "streaming decode " + encoding);
      }
    });
  },
  "Streaming Decode"
);
michael@0 154
// Decodes a fixed six-byte Shift_JIS sequence and compares it with the
// expected three-character string.
test(
  function () {
    var decoder = new TextDecoder("shift_jis");
    var decoded = decoder.decode(new Uint8Array([0x82, 0xC9, 0x82, 0xD9, 0x82, 0xF1]));
    assert_equals(decoded, "\u306B\u307B\u3093"); // Nihon
  },
  "Shift_JIS Decode"
);
michael@0 163
// For every listed encoding, decoding the bytes 0x00-0x7F (minus that
// encoding's escape byte, if any) must give back the same code points.
// Only the decode direction is asserted.
test(
  function () {
    var encodings = [
      "utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4",
      "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i",
      "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15",
      "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874",
      "windows-1250", "windows-1251", "windows-1252", "windows-1253",
      "windows-1254", "windows-1255", "windows-1256", "windows-1257",
      "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312",
      "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"
    ];

    encodings.forEach(function (encoding) {
      var string = '';
      for (var i = 0; i < 128; ++i) {

        // Encodings that have escape codes in 0x00-0x7F
        if (encoding === "hz-gb-2312" && i === 0x7E) {
          continue;
        }
        if (encoding === "iso-2022-jp" && i === 0x1B) {
          continue;
        }

        string += String.fromCharCode(i);
      }
      // The sample is pure ASCII, so its UTF-8 encoding is one byte per
      // character with the same value as the code point.
      var ascii_encoded = new TextEncoder('utf-8').encode(string);
      assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding);
    });
  },
  "Supersets of ASCII decode ASCII correctly"
);
michael@0 188
// For input that is invalid or truncated at end of stream, a fatal decoder
// must throw EncodingError and a non-fatal decoder must simply return.
test(
  function () {
    var truncatedInputs = [
      { label: "utf-8", bytes: [0xff] },
      { label: "utf-16", bytes: [0x00] },
      { label: "utf-16be", bytes: [0x00] }
    ];

    truncatedInputs.forEach(function (sample) {
      assert_throws({name: 'EncodingError'}, function() {
        new TextDecoder(sample.label, {fatal: true}).decode(new Uint8Array(sample.bytes));
      });

      // This should not hang:
      new TextDecoder(sample.label).decode(new Uint8Array(sample.bytes));
    });
  },
  "Non-fatal errors at EOF"
);
michael@0 205
// UTF encodings must be accepted by both TextDecoder and TextEncoder; every
// legacy encoding must be accepted by TextDecoder but make the TextEncoder
// constructor throw a TypeError.
test(
  function () {
    var unicodeLabels = ["utf-8", "utf-16le", "utf-16be"];

    var legacyLabels = [
      "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5",
      "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10",
      "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r",
      "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251",
      "windows-1252", "windows-1253", "windows-1254", "windows-1255",
      "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk",
      "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis",
      "euc-kr", "x-user-defined"
    ];

    var i, label;

    for (i = 0; i < unicodeLabels.length; ++i) {
      label = unicodeLabels[i];
      assert_equals(new TextDecoder(label).encoding, label);
      assert_equals(new TextEncoder(label).encoding, label);
    }

    legacyLabels.forEach(function (legacy) {
      assert_equals(new TextDecoder(legacy).encoding, legacy);

      var constructEncoder = function() { new TextEncoder(legacy); };
      assert_throws({name: 'TypeError'}, constructEncoder);
    });
  },
  "Non-UTF encodings supported only for decode, not encode"
);

mercurial