// NOTE: Requires testharness.js
// http://www.w3.org/2008/webapps/wiki/Harness

// Strings containing lone or misordered surrogates are encoded as UTF-8 and
// decoded again; each bad surrogate is expected to come back as U+FFFD.
test(
  function() {
    var badStrings = [
      { input: '\ud800', expected: '\ufffd' }, // Surrogate half
      { input: '\udc00', expected: '\ufffd' }, // Surrogate half
      { input: 'abc\ud800def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: 'abc\udc00def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: '\udc00\ud800', expected: '\ufffd\ufffd' } // Wrong order
    ];

    badStrings.forEach(
      function(t) {
        var encoded = new TextEncoder('utf-8').encode(t.input);
        var decoded = new TextDecoder('utf-8').decode(encoded);
        // testharness.js takes (actual, expected); keep that order so failure
        // messages report the right value as the expectation.
        assert_equals(decoded, t.expected);
      });
  },
  "bad data"
);

// Each malformed byte sequence must make a decoder created with
// {fatal: true} throw an EncodingError.
test(
  function() {
    var bad = [
      { encoding: 'utf-8', input: [0xC0] }, // ends early
      { encoding: 'utf-8', input: [0xC0, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xC0, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0] }, // ends early
      { encoding: 'utf-8', input: [0xE0, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0x80, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80] }, // > 0x10FFFF
      { encoding: 'utf-16le', input: [0x00] }, // truncated code unit
      { encoding: 'utf-16le', input: [0x00, 0xd8] }, // surrogate half
      { encoding: 'utf-16le', input: [0x00, 0xd8, 0x00, 0x00] }, // surrogate half
      { encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0x00] }, // trail surrogate
      { encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0xd8] } // swapped surrogates
      // TODO: Single byte encoding cases
    ];

    bad.forEach(
      function(t) {
        assert_throws({name: 'EncodingError'}, function () {
          new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input));
        });
      });
  },
  "fatal flag"
);

// A label must resolve to the same canonical encoding name whether it is
// passed in lower case or upper case.
test(
  function() {
    var encodings = [
      { label: 'utf-8', encoding: 'utf-8' },
      { label: 'utf-16', encoding: 'utf-16le' },
      { label: 'utf-16le', encoding: 'utf-16le' },
      { label: 'utf-16be', encoding: 'utf-16be' },
      { label: 'ascii', encoding: 'windows-1252' },
      { label: 'iso-8859-1', encoding: 'windows-1252' }
    ];

    encodings.forEach(
      // Parameter is named `t` (not `test`) so it does not shadow the
      // harness's global test() function.
      function(t) {
        assert_equals(new TextDecoder(t.label.toLowerCase()).encoding, t.encoding);
        assert_equals(new TextDecoder(t.label.toUpperCase()).encoding, t.encoding);
      });
  },
  "Encoding names are case insensitive"
);

// Decodes the same text from UTF-8 / UTF-16LE / UTF-16BE bytes with no BOM,
// with the matching BOM (whole and split across streamed chunks), and with a
// BOM belonging to a different encoding.
test(
  function() {
    var utf8_bom = [0xEF, 0xBB, 0xBF];
    var utf8 = [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD];

    var utf16le_bom = [0xff, 0xfe];
    var utf16le = [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF];

    var utf16be_bom = [0xfe, 0xff];
    var utf16be = [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD];

    var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; // z, cent, CJK water, G-Clef, Private-use character

    // missing BOMs
    assert_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf8)), string);
    assert_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16le)), string);
    assert_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16be)), string);

    // matching BOMs
    assert_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf8_bom.concat(utf8))), string);
    assert_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16le_bom.concat(utf16le))), string);
    assert_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16be_bom.concat(utf16be))), string);

    // matching BOMs split
    // The first chunk holds only part of the BOM and is decoded with
    // {stream: true}; it must produce no output. The second chunk carries the
    // rest of the BOM plus the payload.
    var decoder8 = new TextDecoder('utf-8');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(1).concat(utf8))), string);
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(0, 2)), {stream: true}), '');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(2).concat(utf8))), string);
    var decoder16le = new TextDecoder('utf-16le');
    assert_equals(decoder16le.decode(new Uint8Array(utf16le_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder16le.decode(new Uint8Array(utf16le_bom.slice(1).concat(utf16le))), string);
    var decoder16be = new TextDecoder('utf-16be');
    assert_equals(decoder16be.decode(new Uint8Array(utf16be_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder16be.decode(new Uint8Array(utf16be_bom.slice(1).concat(utf16be))), string);

    // mismatching BOMs
    assert_not_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf16le_bom.concat(utf8))), string);
    assert_not_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf16be_bom.concat(utf8))), string);
    assert_not_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf8_bom.concat(utf16le))), string);
    assert_not_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16be_bom.concat(utf16le))), string);
    assert_not_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf8_bom.concat(utf16be))), string);
    assert_not_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16le_bom.concat(utf16be))), string);
  },
  "Byte-order marks"
);

// The `encoding` attribute must report the canonical lower-case name for the
// label that was passed to the constructor.
test(
  function () {
    assert_equals(new TextDecoder("utf-8").encoding, "utf-8"); // canonical case
    assert_equals(new TextDecoder("UTF-16").encoding, "utf-16le"); // canonical case and name
    assert_equals(new TextDecoder("UTF-16BE").encoding, "utf-16be"); // canonical case and name
    assert_equals(new TextDecoder("iso8859-1").encoding, "windows-1252"); // canonical case and name
    assert_equals(new TextDecoder("iso-8859-1").encoding, "windows-1252"); // canonical case and name
  },
  "Encoding names"
);

// Encodes a sample string, then feeds the bytes back to a streaming decoder
// in chunks of 1 to 5 bytes; the concatenated output must equal the input.
test(
  function () {
    ["utf-8", "utf-16le", "utf-16be"].forEach(function (encoding) {
      var string = "\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF";
      var encoded = new TextEncoder(encoding).encode(string);

      for (var len = 1; len <= 5; ++len) {
        var out = "", decoder = new TextDecoder(encoding);
        for (var i = 0; i < encoded.length; i += len) {
          var sub = [];
          for (var j = i; j < encoded.length && j < i + len; ++j) {
            sub.push(encoded[j]);
          }
          out += decoder.decode(new Uint8Array(sub), {stream: true});
        }
        // Final call without {stream: true} and without input ends the stream.
        out += decoder.decode();
        assert_equals(out, string, "streaming decode " + encoding);
      }
    });
  },
  "Streaming Decode"
);

// Decodes a fixed Shift_JIS byte sequence and compares it with the expected
// hiragana string.
test(
  function () {
    var jis = [0x82, 0xC9, 0x82, 0xD9, 0x82, 0xF1];
    var expected = "\u306B\u307B\u3093"; // Nihon
    assert_equals(new TextDecoder("shift_jis").decode(new Uint8Array(jis)), expected);
  },
  "Shift_JIS Decode"
);

// For every listed encoding, the bytes 0x00-0x7F (minus that encoding's
// escape byte, if any) must decode to the identical ASCII characters.
test(
  function () {
    var encodings = ["utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];

    encodings.forEach(function (encoding) {
      var string = '', bytes = [];
      for (var i = 0; i < 128; ++i) {

        // Encodings that have escape codes in 0x00-0x7F
        if (encoding === "hz-gb-2312" && i === 0x7E)
          continue;
        if (encoding === "iso-2022-jp" && i === 0x1B)
          continue;

        string += String.fromCharCode(i);
        bytes.push(i);
      }
      var ascii_encoded = new TextEncoder('utf-8').encode(string);
      assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding);
      // Encode-side check is disabled; the last test in this file expects
      // TextEncoder to reject non-UTF encodings. `bytes` is kept for it.
      //assert_array_equals(new TextEncoder(encoding).encode(string), bytes, encoding);
    });
  },
  "Supersets of ASCII decode ASCII correctly"
);

// Truncated input must throw in fatal mode and must return (rather than
// hang) in the default non-fatal mode.
test(
  function () {
    assert_throws({name: 'EncodingError'}, function() { new TextDecoder("utf-8", {fatal: true}).decode(new Uint8Array([0xff])); });
    // This should not hang:
    new TextDecoder("utf-8").decode(new Uint8Array([0xff]));

    assert_throws({name: 'EncodingError'}, function() { new TextDecoder("utf-16", {fatal: true}).decode(new Uint8Array([0x00])); });
    // This should not hang:
    new TextDecoder("utf-16").decode(new Uint8Array([0x00]));

    assert_throws({name: 'EncodingError'}, function() { new TextDecoder("utf-16be", {fatal: true}).decode(new Uint8Array([0x00])); });
    // This should not hang:
    new TextDecoder("utf-16be").decode(new Uint8Array([0x00]));
  },
  "Non-fatal errors at EOF"
);

// UTF encodings must be accepted by both TextDecoder and TextEncoder; every
// legacy encoding must be accepted by TextDecoder but make TextEncoder throw
// a TypeError.
test(
  function () {

    var utf_encodings = ["utf-8", "utf-16le", "utf-16be"];

    var legacy_encodings = ["ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254", "windows-1255", "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"];

    utf_encodings.forEach(function(encoding) {
      assert_equals(new TextDecoder(encoding).encoding, encoding);
      assert_equals(new TextEncoder(encoding).encoding, encoding);
    });

    legacy_encodings.forEach(function(encoding) {
      assert_equals(new TextDecoder(encoding).encoding, encoding);
      assert_throws({name: 'TypeError'}, function() { new TextEncoder(encoding); });
    });
  },
  "Non-UTF encodings supported only for decode, not encode"
);