|
// NOTE: Requires testharness.js
// http://www.w3.org/2008/webapps/wiki/Harness
|
3 |
|
// "bad data": strings containing unpaired or mis-ordered surrogates are
// encoded as UTF-8 and decoded again; the test expects every offending code
// unit to come back as U+FFFD.
test(
  function() {
    var badStrings = [
      { input: '\ud800', expected: '\ufffd' }, // Surrogate half
      { input: '\udc00', expected: '\ufffd' }, // Surrogate half
      { input: 'abc\ud800def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: 'abc\udc00def', expected: 'abc\ufffddef' }, // Surrogate half
      { input: '\udc00\ud800', expected: '\ufffd\ufffd' } // Wrong order
    ];

    badStrings.forEach(
      function(t, index) {
        var encoded = new TextEncoder('utf-8').encode(t.input);
        var decoded = new TextDecoder('utf-8').decode(encoded);
        // testharness.js takes (actual, expected, description); passing them
        // the other way round produces a misleading failure message.
        assert_equals(decoded, t.expected, 'badStrings[' + index + ']');
      });
  },
  "bad data"
);
|
23 |
|
// "fatal flag": each malformed byte sequence below is decoded with
// {fatal: true}; the test expects every decode() call to throw an exception
// named 'EncodingError'.
test(
  function() {
    var bad = [
      { encoding: 'utf-8', input: [0xC0] }, // ends early
      { encoding: 'utf-8', input: [0xC0, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xC0, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0] }, // ends early
      { encoding: 'utf-8', input: [0xE0, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0x80, 0x00] }, // invalid trail
      { encoding: 'utf-8', input: [0xE0, 0x80, 0xC0] }, // invalid trail
      { encoding: 'utf-8', input: [0xFC, 0x80, 0x80, 0x80, 0x80, 0x80] }, // > 0x10FFFF
      { encoding: 'utf-16le', input: [0x00] }, // truncated code unit
      { encoding: 'utf-16le', input: [0x00, 0xd8] }, // surrogate half
      { encoding: 'utf-16le', input: [0x00, 0xd8, 0x00, 0x00] }, // surrogate half
      { encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0x00] }, // trail surrogate
      { encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0xd8] } // swapped surrogates
      // TODO: Single byte encoding cases
    ];

    bad.forEach(
      function(t) {
        // The description identifies which of the cases failed.
        var description = 'fatal decode of ' + t.encoding + ' bytes [' + t.input.join(', ') + ']';
        assert_throws({name: 'EncodingError'}, function () {
          new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input));
        }, description);
      });
  },
  "fatal flag"
);
|
53 |
|
// "Encoding names are case insensitive": every label is passed to
// TextDecoder in lower case and in upper case, and the resulting `encoding`
// attribute is compared with the expected name.
test(
  function() {
    var encodings = [
      { label: 'utf-8', encoding: 'utf-8' },
      { label: 'utf-16', encoding: 'utf-16le' },
      { label: 'utf-16le', encoding: 'utf-16le' },
      { label: 'utf-16be', encoding: 'utf-16be' },
      { label: 'ascii', encoding: 'windows-1252' },
      { label: 'iso-8859-1', encoding: 'windows-1252' }
    ];

    encodings.forEach(
      // The parameter is named `t` so that it does not shadow the harness's
      // global test() function inside the callback.
      function(t) {
        var lower = t.label.toLowerCase();
        var upper = t.label.toUpperCase();
        assert_equals(new TextDecoder(lower).encoding, t.encoding, lower);
        assert_equals(new TextDecoder(upper).encoding, t.encoding, upper);
      });
  },
  "Encoding names are case insensitive"
);
|
73 |
|
// "Byte-order marks": the same five-character string is supplied as UTF-8,
// UTF-16LE and UTF-16BE bytes, with no BOM, with the matching BOM (whole and
// split across two decode() calls), and with a BOM belonging to a different
// encoding.
test(
  function() {
    var utf8_bom = [0xEF, 0xBB, 0xBF];
    var utf8 = [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD];

    var utf16le_bom = [0xff, 0xfe];
    var utf16le = [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF];

    var utf16be_bom = [0xfe, 0xff];
    var utf16be = [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD];

    var string = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD"; // z, cent, CJK water, G-Clef, Private-use character

    // missing BOMs
    assert_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf8)), string);
    assert_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16le)), string);
    assert_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16be)), string);

    // matching BOMs
    assert_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf8_bom.concat(utf8))), string);
    assert_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16le_bom.concat(utf16le))), string);
    assert_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16be_bom.concat(utf16be))), string);

    // matching BOMs split
    // The first part of each BOM is fed with {stream: true} and must produce
    // no output; the remainder follows in a non-streaming call. decoder8 is
    // reused for a second split point after that non-streaming call.
    var decoder8 = new TextDecoder('utf-8');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(1).concat(utf8))), string);
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(0, 2)), {stream: true}), '');
    assert_equals(decoder8.decode(new Uint8Array(utf8_bom.slice(2).concat(utf8))), string);
    var decoder16le = new TextDecoder('utf-16le');
    assert_equals(decoder16le.decode(new Uint8Array(utf16le_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder16le.decode(new Uint8Array(utf16le_bom.slice(1).concat(utf16le))), string);
    var decoder16be = new TextDecoder('utf-16be');
    assert_equals(decoder16be.decode(new Uint8Array(utf16be_bom.slice(0, 1)), {stream: true}), '');
    assert_equals(decoder16be.decode(new Uint8Array(utf16be_bom.slice(1).concat(utf16be))), string);

    // mismatching BOMs
    assert_not_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf16le_bom.concat(utf8))), string);
    assert_not_equals(new TextDecoder('utf-8').decode(new Uint8Array(utf16be_bom.concat(utf8))), string);
    assert_not_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf8_bom.concat(utf16le))), string);
    assert_not_equals(new TextDecoder('utf-16le').decode(new Uint8Array(utf16be_bom.concat(utf16le))), string);
    assert_not_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf8_bom.concat(utf16be))), string);
    assert_not_equals(new TextDecoder('utf-16be').decode(new Uint8Array(utf16le_bom.concat(utf16be))), string);
  },
  "Byte-order marks"
);
|
120 |
|
// "Encoding names": the `encoding` attribute of a TextDecoder is compared
// with the expected lower-case name for each label used to construct it.
test(
  function () {
    var names = [
      { label: "utf-8", encoding: "utf-8" }, // canonical case
      { label: "UTF-16", encoding: "utf-16le" }, // canonical case and name
      { label: "UTF-16BE", encoding: "utf-16be" }, // canonical case and name
      { label: "iso8859-1", encoding: "windows-1252" }, // canonical case and name
      { label: "iso-8859-1", encoding: "windows-1252" } // canonical case and name
    ];

    names.forEach(function (t) {
      // The label is passed as the description so a failure names its input.
      assert_equals(new TextDecoder(t.label).encoding, t.encoding, t.label);
    });
  },
  "Encoding names"
);
|
131 |
|
// "Streaming Decode": for each UTF encoding the sample string is encoded
// once and then decoded in chunks of 1 to 5 bytes with {stream: true},
// followed by an argument-less decode() call; the concatenated output must
// equal the original string.
test(
  function () {
    // The sample does not depend on the encoding, so it is built once.
    var string = "\x00123ABCabc\x80\xFF\u0100\u1000\uFFFD\uD800\uDC00\uDBFF\uDFFF";

    ["utf-8", "utf-16le", "utf-16be"].forEach(function (encoding) {
      var encoded = new TextEncoder(encoding).encode(string);

      for (var len = 1; len <= 5; ++len) {
        var out = "";
        var decoder = new TextDecoder(encoding);
        for (var i = 0; i < encoded.length; i += len) {
          // subarray() clamps at the end of the buffer, so the last chunk may
          // be shorter than len. The view is copied so the decoder receives a
          // standalone array for each chunk.
          var chunk = new Uint8Array(encoded.subarray(i, i + len));
          out += decoder.decode(chunk, {stream: true});
        }
        out += decoder.decode();
        assert_equals(out, string, "streaming decode " + encoding);
      }
    });
  },
  "Streaming Decode"
);
|
154 |
|
// "Shift_JIS Decode": six bytes are decoded with the "shift_jis" label and
// compared with a three-character hiragana string.
test(
  function () {
    var shiftJisBytes = [0x82, 0xC9, 0x82, 0xD9, 0x82, 0xF1];
    var expected = "\u306B\u307B\u3093"; // Nihon
    assert_equals(new TextDecoder("shift_jis").decode(new Uint8Array(shiftJisBytes)), expected, "shift_jis");
  },
  "Shift_JIS Decode"
);
|
163 |
|
// "Supersets of ASCII decode ASCII correctly": for every listed encoding the
// code points 0x00-0x7F (minus that encoding's escape byte, if any) are
// encoded as UTF-8 and decoded with the encoding under test; the result must
// equal the input string. Only the decode direction is asserted.
test(
  function () {
    var encodings = [
      "utf-8", "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4",
      "iso-8859-5", "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i",
      "iso-8859-10", "iso-8859-13", "iso-8859-14", "iso-8859-15",
      "iso-8859-16", "koi8-r", "koi8-u", "macintosh", "windows-874",
      "windows-1250", "windows-1251", "windows-1252", "windows-1253",
      "windows-1254", "windows-1255", "windows-1256", "windows-1257",
      "windows-1258", "x-mac-cyrillic", "gbk", "gb18030", "hz-gb-2312",
      "big5", "euc-jp", "iso-2022-jp", "shift_jis", "euc-kr", "x-user-defined"
    ];

    encodings.forEach(function (encoding) {
      var string = '';
      for (var i = 0; i < 128; ++i) {

        // Encodings that have escape codes in 0x00-0x7F
        if (encoding === "hz-gb-2312" && i === 0x7E) {
          continue;
        }
        if (encoding === "iso-2022-jp" && i === 0x1B) {
          continue;
        }

        string += String.fromCharCode(i);
      }
      var ascii_encoded = new TextEncoder('utf-8').encode(string);
      assert_equals(new TextDecoder(encoding).decode(ascii_encoded), string, encoding);
    });
  },
  "Supersets of ASCII decode ASCII correctly"
);
|
188 |
|
// "Non-fatal errors at EOF": each input is decoded twice. With {fatal: true}
// the test expects an exception named 'EncodingError'; without the flag the
// call only has to return (no assertion is made on its result).
test(
  function () {
    var cases = [
      { encoding: "utf-8", input: [0xff] },
      { encoding: "utf-16", input: [0x00] },
      { encoding: "utf-16be", input: [0x00] }
    ];

    cases.forEach(function (t) {
      assert_throws({name: 'EncodingError'}, function() {
        new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input));
      }, "fatal decode of " + t.encoding);
      // This should not hang:
      new TextDecoder(t.encoding).decode(new Uint8Array(t.input));
    });
  },
  "Non-fatal errors at EOF"
);
|
205 |
|
// "Non-UTF encodings supported only for decode, not encode": the UTF
// encodings must be accepted by both TextDecoder and TextEncoder; every
// legacy encoding must be accepted by TextDecoder, while constructing a
// TextEncoder for it is expected to throw a TypeError.
test(
  function () {

    var utf_encodings = ["utf-8", "utf-16le", "utf-16be"];

    var legacy_encodings = [
      "ibm866", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5",
      "iso-8859-6", "iso-8859-7", "iso-8859-8", "iso-8859-8-i", "iso-8859-10",
      "iso-8859-13", "iso-8859-14", "iso-8859-15", "iso-8859-16", "koi8-r",
      "koi8-u", "macintosh", "windows-874", "windows-1250", "windows-1251",
      "windows-1252", "windows-1253", "windows-1254", "windows-1255",
      "windows-1256", "windows-1257", "windows-1258", "x-mac-cyrillic", "gbk",
      "gb18030", "hz-gb-2312", "big5", "euc-jp", "iso-2022-jp", "shift_jis",
      "euc-kr", "x-user-defined"
    ];

    utf_encodings.forEach(function(encoding) {
      assert_equals(new TextDecoder(encoding).encoding, encoding, "decoder " + encoding);
      assert_equals(new TextEncoder(encoding).encoding, encoding, "encoder " + encoding);
    });

    legacy_encodings.forEach(function(encoding) {
      assert_equals(new TextDecoder(encoding).encoding, encoding, "decoder " + encoding);
      assert_throws({name: 'TypeError'}, function() { new TextEncoder(encoding); }, "encoder " + encoding);
    });
  },
  "Non-UTF encodings supported only for decode, not encode"
);