Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | // NOTE: Requires testharness.js |
michael@0 | 2 | // http://www.w3.org/2008/webapps/wiki/Harness |
michael@0 | 3 | |
// Round-trips strings containing unpaired or mis-ordered surrogate code
// units through a UTF-8 TextEncoder and TextDecoder. The table below expects
// every such code unit to come back as U+FFFD.
test(
  function() {
    var badStrings = [
      { input: '\ud800', expected: '\ufffd' }, // Surrogate half
      { input: '\udc00', expected: '\ufffd' }, // Surrogate half
      { input: 'abc\ud800def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: 'abc\udc00def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: '\udc00\ud800', expected: '\ufffd\ufffd' } // Wrong order
    ];

    badStrings.forEach(
      function(t) {
        var encoded = new TextEncoder('utf-8').encode(t.input);
        var decoded = new TextDecoder('utf-8').decode(encoded);
        // testharness.js takes (actual, expected); keeping that order makes
        // a failure report name the right value as the expectation.
        assert_equals(decoded, t.expected);
      });
  },
  "bad data"
);
michael@0 | 23 | |
// Each byte sequence below is malformed for the encoding it is listed under;
// decoding it with {fatal: true} is asserted to throw an EncodingError.
test(
  function() {
    var malformedUtf8 = [
      [0xC0], // ends early
      [0xC0, 0x00], // invalid trail
      [0xC0, 0xC0], // invalid trail
      [0xE0], // ends early
      [0xE0, 0x00], // invalid trail
      [0xE0, 0xC0], // invalid trail
      [0xE0, 0x80, 0x00], // invalid trail
      [0xE0, 0x80, 0xC0], // invalid trail
      [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80] // > 0x10FFFF
    ];

    var malformedUtf16le = [
      [0x00], // truncated code unit
      [0x00, 0xd8], // surrogate half
      [0x00, 0xd8, 0x00, 0x00], // surrogate half
      [0x00, 0xdc, 0x00, 0x00], // trail surrogate
      [0x00, 0xdc, 0x00, 0xd8] // swapped surrogates
    ];
    // TODO: Single byte encoding cases

    // The decoder is constructed inside the callback so that a throw from
    // the constructor is also observed by assert_throws.
    function expectFatalFailure(label, octets) {
      assert_throws({name: 'EncodingError'}, function () {
        var strict = new TextDecoder(label, {fatal: true});
        strict.decode(new Uint8Array(octets));
      });
    }

    malformedUtf8.forEach(function(octets) {
      expectFatalFailure('utf-8', octets);
    });
    malformedUtf16le.forEach(function(octets) {
      expectFatalFailure('utf-16le', octets);
    });
  },
  "fatal flag"
);
michael@0 | 53 | |
// Constructs a TextDecoder from the lower-cased and upper-cased form of each
// label and checks that the reported encoding name is the same in both cases.
test(
  function() {
    // Each entry is [label, expected value of TextDecoder#encoding].
    var labelToName = [
      ['utf-8', 'utf-8'],
      ['utf-16', 'utf-16le'],
      ['utf-16le', 'utf-16le'],
      ['utf-16be', 'utf-16be'],
      ['ascii', 'windows-1252'],
      ['iso-8859-1', 'windows-1252']
    ];

    for (var i = 0; i < labelToName.length; ++i) {
      var label = labelToName[i][0];
      var canonical = labelToName[i][1];
      assert_equals(new TextDecoder(label.toLowerCase()).encoding, canonical);
      assert_equals(new TextDecoder(label.toUpperCase()).encoding, canonical);
    }
  },
  "Encoding names are case insensitive"
);
michael@0 | 73 | |
// Checks how utf-8, utf-16le and utf-16be decoders treat a leading byte-order
// mark: absent, matching (whole and split across streamed chunks), and
// belonging to one of the other two encodings.
test(
  function() {
    // z, cent, CJK water, G-Clef, Private-use character
    var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";

    // The same text in each encoding, alongside that encoding's BOM bytes.
    var samples = [
      {
        label: 'utf-8',
        bom: [0xEF, 0xBB, 0xBF],
        bytes: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD]
      },
      {
        label: 'utf-16le',
        bom: [0xff, 0xfe],
        bytes: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF]
      },
      {
        label: 'utf-16be',
        bom: [0xfe, 0xff],
        bytes: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD]
      }
    ];

    // One-shot decode of a plain byte array with a fresh decoder.
    function decodeWith(label, octets) {
      return new TextDecoder(label).decode(new Uint8Array(octets));
    }

    // missing BOMs
    samples.forEach(function(sample) {
      assert_equals(decodeWith(sample.label, sample.bytes), string);
    });

    // matching BOMs
    samples.forEach(function(sample) {
      assert_equals(decodeWith(sample.label, sample.bom.concat(sample.bytes)), string);
    });

    // matching BOMs split: one decoder per encoding is reused for every
    // split point inside the BOM (1 and 2 for utf-8, 1 for utf-16).
    samples.forEach(function(sample) {
      var decoder = new TextDecoder(sample.label);
      for (var cut = 1; cut < sample.bom.length; ++cut) {
        var head = sample.bom.slice(0, cut);
        var tail = sample.bom.slice(cut).concat(sample.bytes);
        assert_equals(decoder.decode(new Uint8Array(head), {stream: true}), '');
        assert_equals(decoder.decode(new Uint8Array(tail)), string);
      }
    });

    // mismatching BOMs: prefix each payload with the BOM of every other encoding
    samples.forEach(function(sample) {
      samples.forEach(function(other) {
        if (other === sample) {
          return;
        }
        assert_not_equals(decodeWith(sample.label, other.bom.concat(sample.bytes)), string);
      });
    });
  },
  "Byte-order marks"
);
michael@0 | 120 | |
// Checks the value of TextDecoder#encoding for a handful of labels, covering
// differences in letter case and in the name itself.
test(
  function () {
    // Each entry is [label passed to the constructor, expected .encoding].
    var expectations = [
      ["utf-8", "utf-8"], // canonical case
      ["UTF-16", "utf-16le"], // canonical case and name
      ["UTF-16BE", "utf-16be"], // canonical case and name
      ["iso8859-1", "windows-1252"], // canonical case and name
      ["iso-8859-1", "windows-1252"] // canonical case and name
    ];

    expectations.forEach(function (pair) {
      var decoder = new TextDecoder(pair[0]);
      assert_equals(decoder.encoding, pair[1]);
    });
  },
  "Encoding names"
);
michael@0 | 131 | |
// Encodes a sample string, feeds the bytes to a streaming decoder in chunks
// of 1 to 5 bytes, and checks that the concatenated output equals the sample.
test(
  function () {
    var sample = "\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF";

    ["utf-8", "utf-16le", "utf-16be"].forEach(function (encoding) {
      var bytes = new TextEncoder(encoding).encode(sample);

      for (var chunkSize = 1; chunkSize <= 5; ++chunkSize) {
        var streamer = new TextDecoder(encoding);
        var pieces = [];

        for (var offset = 0; offset < bytes.length; offset += chunkSize) {
          // Copy the chunk into a plain array so the decoder receives a
          // standalone Uint8Array; slice clamps the end at bytes.length.
          var chunk = Array.prototype.slice.call(bytes, offset, offset + chunkSize);
          pieces.push(streamer.decode(new Uint8Array(chunk), {stream: true}));
        }

        // Final call without input or the stream option ends the stream.
        pieces.push(streamer.decode());
        assert_equals(pieces.join(""), sample, "streaming decode " + encoding);
      }
    });
  },
  "Streaming Decode"
);
michael@0 | 154 | |
// Decodes a fixed six-byte Shift_JIS sequence and compares the result with
// the expected three-character string.
test(
  function () {
    var decoder = new TextDecoder("shift_jis");
    var decoded = decoder.decode(new Uint8Array([0x82, 0xC9, 0x82, 0xD9, 0x82, 0xF1]));
    assert_equals(decoded, "\u306B\u307B\u3093"); // Nihon
  },
  "Shift_JIS Decode"
);
michael@0 | 163 | |
// For every listed encoding, builds a string of the code points 0x00-0x7F
// (minus that encoding's escape byte, where one is listed), encodes it as
// UTF-8 and checks that a decoder for the encoding returns the same string.
// Only the decode direction is asserted.
test(
  function () {
    var encodings = ["utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];

    encodings.forEach(function (encoding) {
      var string = '';
      for (var i = 0; i < 128; ++i) {
        // Encodings that have escape codes in 0x00-0x7F
        var isEscapeByte =
          (encoding === "hz-gb-2312" && i === 0x7E) ||
          (encoding === "iso-2022-jp" && i === 0x1B);
        if (!isEscapeByte) {
          string += String.fromCharCode(i);
        }
      }

      var ascii_encoded = new TextEncoder('utf-8').encode(string);
      assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding);
    });
  },
  "Supersets of ASCII decode ASCII correctly"
);
michael@0 | 188 | |
// For each label/input pair: decoding with {fatal: true} is asserted to
// throw an EncodingError, and the same input is then decoded without the
// fatal flag, where the call is only required to return.
test(
  function () {
    var cases = [
      { label: "utf-8", bytes: [0xff] },
      { label: "utf-16", bytes: [0x00] },
      { label: "utf-16be", bytes: [0x00] }
    ];

    cases.forEach(function (c) {
      assert_throws({name: 'EncodingError'}, function() {
        new TextDecoder(c.label, {fatal: true}).decode(new Uint8Array(c.bytes));
      });

      // This should not hang:
      new TextDecoder(c.label).decode(new Uint8Array(c.bytes));
    });
  },
  "Non-fatal errors at EOF"
);
michael@0 | 205 | |
// UTF labels must be accepted by both TextDecoder and TextEncoder and be
// reported back unchanged; every legacy label must be accepted by
// TextDecoder, while constructing a TextEncoder with it must throw TypeError.
test(
  function () {
    var utf_encodings = ["utf-8", "utf-16le", "utf-16be"];

    var legacy_encodings = ["ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];

    var i;
    var name;

    for (i = 0; i < utf_encodings.length; ++i) {
      name = utf_encodings[i];
      assert_equals(new TextDecoder(name).encoding, name);
      assert_equals(new TextEncoder(name).encoding, name);
    }

    legacy_encodings.forEach(function(legacy) {
      assert_equals(new TextDecoder(legacy).encoding, legacy);
      assert_throws({name: 'TypeError'}, function() {
        new TextEncoder(legacy);
      });
    });
  },
  "Non-UTF encodings supported only for decode, not encode"
);