|
// Shorthand aliases for the XPCOM component registries.
const Cc = Components.classes,
      Ci = Components.interfaces;

// Error value thrown by the UTF8/UTF16 readers below on malformed input.
const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// Stream constructors; all of these are assigned in run_test().
var BIS;
var BOS;
var _Pipe;
var COS;
var FIS;
var _SS;
var CIS;

// Directory holding the test data files; assigned in run_test().
var dataDir;
|
9 |
|
/**
 * Test entry point: builds the stream constructors used by the helpers in
 * this file, locates the data directory, then runs every test in order.
 */
function run_test() {
  // Each call passes a contract ID, an interface name and the name of the
  // initializer method that the resulting constructor forwards its
  // arguments to.
  BIS = Components.Constructor(
    "@mozilla.org/binaryinputstream;1", "nsIBinaryInputStream", "setInputStream");
  BOS = Components.Constructor(
    "@mozilla.org/binaryoutputstream;1", "nsIBinaryOutputStream", "setOutputStream");
  _Pipe = Components.Constructor(
    "@mozilla.org/pipe;1", "nsIPipe", "init");
  COS = Components.Constructor(
    "@mozilla.org/intl/converter-output-stream;1", "nsIConverterOutputStream", "init");
  FIS = Components.Constructor(
    "@mozilla.org/network/file-input-stream;1", "nsIFileInputStream", "init");
  _SS = Components.Constructor(
    "@mozilla.org/storagestream;1", "nsIStorageStream", "init");
  CIS = Components.Constructor(
    "@mozilla.org/intl/converter-input-stream;1", "nsIConverterInputStream", "init");

  dataDir = do_get_file("data/");

  // Run the individual tests in the same fixed order every time.
  var tests = [
    test_utf8_1,
    test_utf16_1,
    test_utf16_2,
    test_utf16_3,
    test_cross_conversion
  ];
  for (var t = 0; t < tests.length; t++) {
    var test = tests[t];
    test();
  }
}
|
42 |
|
// Sample strings pushed through the converter output streams by the
// test_utf8_* / test_utf16_* functions.
const UNICODE_STRINGS = [
  '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

  // One string touching each UTF-8 encoded-length range of the BMP.
  [
    'AZaz09 \u007F ',               // U+000000 to U+00007F
    '\u0080 \u0398 \u03BB \u0725 ', // U+000080 to U+0007FF
    '\u0964 \u0F5F \u20AC \uFFFB'   // U+000800 to U+00FFFF
  ].join('')

  // Strings containing non-BMP code points are deliberately absent: JS
  // strings are sequences of 16-bit code units and
  // String.prototype.charCodeAt() only ever returns one such unit, so
  // covering non-BMP characters would need extra surrogate handling.
];
|
58 |
|
// Conversion-equality fixtures. Each key names a data file; all files hold
// equivalent Unicode data. Each value is that file's encoding, spelled the
// way nsIConverter(In|Out)putStream.init expects it.
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt":    "UTF-8",
  "unicode-conversion.utf16.txt":   "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE"
};
|
69 |
|
/**
 * Writes every UNICODE_STRINGS entry through a "UTF-8" converter output
 * stream into a pipe, then decodes the pipe's bytes with the UTF8 reader and
 * checks that the decoded values match the string's own values.
 */
function test_utf8_1() {
  UNICODE_STRINGS.forEach(function(text, index) {
    var pipe = Pipe();

    // Encode the string into the pipe.
    var encoder = new COS(pipe.outputStream, "UTF-8", 1024, 0x0);
    do_check_true(encoder.writeString(text));
    encoder.close();

    // Decode independently and compare against the source string.
    var decoded = new UTF8(pipe.inputStream);
    var expected = stringToCodePoints(text);
    if (!equal(decoded, expected))
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
  });
}
|
84 |
|
/**
 * Writes every UNICODE_STRINGS entry through a "UTF-16" converter output
 * stream into a pipe, then decodes the pipe's bytes with a UTF16 reader that
 * determines byte order from the leading byte-order mark.
 */
function test_utf16_1() {
  var index = 0;
  while (index < UNICODE_STRINGS.length) {
    var text = UNICODE_STRINGS[index];
    var pipe = Pipe();

    var encoder = new COS(pipe.outputStream, "UTF-16", 1024, 0x0);
    do_check_true(encoder.writeString(text));
    encoder.close();

    // No endianness argument: the reader consumes the BOM in its constructor.
    var decoded = new UTF16(pipe.inputStream);
    var expected = stringToCodePoints(text);
    if (!equal(decoded, expected))
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");

    index++;
  }
}
|
99 |
|
/**
 * Writes every UNICODE_STRINGS entry through a "UTF-16LE" converter output
 * stream into a pipe, then decodes the pipe's bytes with a little-endian
 * UTF16 reader and checks the values round-trip.
 */
function test_utf16_2() {
  for (var n = 0; n < UNICODE_STRINGS.length; n++) {
    var pipe = Pipe();
    var out = new COS(pipe.outputStream, "UTF-16LE", 1024, 0x0);
    do_check_true(out.writeString(UNICODE_STRINGS[n]));
    out.close();

    // bigEndian === false selects little-endian decoding (no BOM expected).
    var matches = equal(new UTF16(pipe.inputStream, false),
                        stringToCodePoints(UNICODE_STRINGS[n]));
    if (matches)
      continue;

    do_throw("UNICODE_STRINGS[" + n + "] not handled correctly");
  }
}
|
114 |
|
/**
 * Writes every UNICODE_STRINGS entry through a "UTF-16BE" converter output
 * stream into a pipe, then decodes the pipe's bytes with a big-endian UTF16
 * reader and checks the values round-trip.
 */
function test_utf16_3() {
  var total = UNICODE_STRINGS.length;
  for (var idx = 0; idx < total; idx++) {
    var source = UNICODE_STRINGS[idx];
    var pipe = Pipe();

    var converter = new COS(pipe.outputStream, "UTF-16BE", 1024, 0x0);
    var wrote = converter.writeString(source);
    do_check_true(wrote);
    converter.close();

    // bigEndian === true selects big-endian decoding (no BOM expected).
    var reader = new UTF16(pipe.inputStream, true);
    var wanted = stringToCodePoints(source);
    var same = equal(reader, wanted);
    if (!same)
      do_throw("UNICODE_STRINGS[" + idx + "] not handled correctly");
  }
}
|
129 |
|
130 |
|
/**
 * Checks that every file in UNICODE_FILES decodes to the same text as every
 * other file (including itself).
 *
 * For each file fn1 the raw bytes are first copied into an in-memory storage
 * stream; that copy is then decoded with fn1's encoding once per file fn2 and
 * compared against fn2 decoded straight from disk with fn2's encoding.
 * Fails the test via do_throw() on the first mismatch.
 */
function test_cross_conversion() {
  for (var fn1 in UNICODE_FILES) {
    // Copy the raw bytes of fn1 into a storage stream so they can be
    // re-read from the start once per comparison below.
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();

    var bos = new BOS(ss.getOutputStream(0));
    var av;
    while ((av = fin.available()) > 0) {
      var data = fin.readByteArray(av);
      bos.writeByteArray(data, data.length);
    }
    fin.close();
    bos.close();

    for (var fn2 in UNICODE_FILES) {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      try {
        // Fresh reader over the stored bytes, decoded as fn1's encoding.
        var unichar = new CIS(ss.newInputStream(0),
                              UNICODE_FILES[fn1], 8192, 0x0);
        try {
          if (!equalUnicharStreams(unichar, fin2))
            do_throw("unequal streams: " +
                     UNICODE_FILES[fn1] + ", " +
                     UNICODE_FILES[fn2]);
        } finally {
          // Release the per-comparison reader even when do_throw() fires.
          unichar.close();
        }
      } finally {
        fin2.close();
      }
    }
  }
}
|
161 |
|
162 |
|
163 // utility functions |
|
164 |
|
/**
 * Creates a new storage stream through the _SS constructor (which forwards
 * its arguments to nsIStorageStream's init method; see run_test()).
 *
 * @returns the newly constructed storage stream
 */
function StorageStream() {
  var firstArg = 8192;
  var secondArg = 0xFFFFFFFF; // 2^32 - 1
  return new _SS(firstArg, secondArg, null);
}
|
169 |
|
/**
 * Opens a file from the test data directory as a character stream.
 *
 * @param filename
 *   name of the file, relative to dataDir
 * @param encoding
 *   charset name handed to the converter input stream
 * @returns a CIS (converter input stream) wrapping a read-only file stream
 */
function getUnicharInputStream(filename, encoding) {
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // Octal 644 (rw-r--r--). Spelled with parseInt because the legacy octal
  // literal form (a leading zero) is a SyntaxError in strict-mode code.
  const PERMS_FILE = parseInt("644", 8);

  var fis = new FIS(file, PR_RDONLY, PERMS_FILE,
                    Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new CIS(fis, encoding, 8192, 0x0);
}
|
179 |
|
/**
 * Opens a file from the test data directory as a binary stream.
 *
 * @param filename
 *   name of the file, relative to dataDir
 * @param encoding
 *   unused; accepted only so existing call sites keep working
 * @returns a BIS (binary input stream) wrapping a read-only file stream
 */
function getBinaryInputStream(filename, encoding) {
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // Octal 644 (rw-r--r--). Spelled with parseInt because the legacy octal
  // literal form (a leading zero) is a SyntaxError in strict-mode code.
  const PERMS_FILE = parseInt("644", 8);

  var fis = new FIS(file, PR_RDONLY, PERMS_FILE,
                    Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new BIS(fis);
}
|
189 |
|
/**
 * Compares the values produced by a reader against an expected array.
 *
 * @param stream
 *   object with a readUnit() method that returns the next numeric value, or
 *   a negative number once the stream is exhausted
 * @param codePoints
 *   array of the expected numeric values, in order
 * @returns true iff the reader yields exactly the values in codePoints and
 *   then ends; false on the first mismatch, or if either side is longer
 */
function equal(stream, codePoints) {
  var currIndex = 0;
  while (true) {
    var unit = stream.readUnit();

    // End of stream: equal only if every expected value was consumed.
    if (unit < 0)
      return currIndex === codePoints.length;

    // Reading past the end of codePoints compares against undefined, so a
    // stream that is too long is reported as unequal here.
    if (unit !== codePoints[currIndex++])
      return false;
  }
}
|
205 |
|
/**
 * Reads a character stream to its end and returns everything it produced.
 *
 * @param stream
 *   object with readString(count, holder), which stores the characters read
 *   in holder.value and returns how many were read (0 at end of stream)
 * @returns the concatenation of all chunks read
 */
function drainUnicharStream(stream) {
  var holder = {};
  var pieces = [];
  while (stream.readString(1024, holder) > 0)
    pieces.push(holder.value);
  return pieces.join("");
}

/**
 * Tests whether two character streams decode to the same text.
 *
 * Both streams are read to the end and the complete results are compared, so
 * the answer does not depend on the two streams happening to hand back
 * chunks of identical size from each readString() call.
 *
 * @param s1
 *   first character stream (consumed entirely)
 * @param s2
 *   second character stream (consumed entirely)
 * @returns true iff both streams produced identical text
 */
function equalUnicharStreams(s1, s2) {
  var text1 = drainUnicharStream(s1);
  var text2 = drainUnicharStream(s2);

  if (text1 === text2)
    return true;

  // Diagnostics for the test log.
  print("stream lengths: " + text1.length + ", " + text2.length);
  return false;
}
|
229 |
|
/**
 * Converts a JS string into an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * the single non-BMP code point it encodes; every other 16-bit unit
 * (including an unpaired surrogate) is returned unchanged. For strings made
 * only of BMP characters the result equals the per-character charCodeAt()
 * values.
 *
 * @param str
 *   the string to convert
 * @returns array of numeric code points, in string order
 */
function stringToCodePoints(str) {
  var codePoints = [];
  for (var i = 0; i < str.length; i++) {
    var unit = str.charCodeAt(i);

    var isHighSurrogate = unit >= 0xD800 && unit <= 0xDBFF;
    if (isHighSurrogate && i + 1 < str.length) {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF) {
        // Ten payload bits from each half, offset by 0x10000.
        codePoints.push(0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00));
        i++; // the low surrogate has been consumed as well
        continue;
      }
    }

    codePoints.push(unit);
  }
  return codePoints;
}
|
234 |
|
/**
 * Builds a mask with the n lowest bits set.
 *
 * @param n
 *   number of low bits wanted
 * @returns 2^n - 1
 */
function lowbits(n) {
  var powerOfTwo = Math.pow(2, n);
  return powerOfTwo - 1;
}
|
239 |
|
/**
 * Creates a new pipe through the _Pipe constructor (which forwards its
 * arguments to nsIPipe's init method; see run_test()).
 *
 * @returns the newly constructed pipe
 */
function Pipe() {
  var initArgs = [false, false, 1024, 10, null];
  return new _Pipe(initArgs[0], initArgs[1], initArgs[2], initArgs[3], initArgs[4]);
}
|
244 |
|
245 |
|
246 // complex charset readers |
|
247 |
|
/**
 * Reader that decodes a UTF-8 byte stream one Unicode code point at a time.
 *
 * @param stream
 *   the stream to wrap; it is read through a BIS (binary input stream)
 */
function UTF8(stream) {
  this._stream = new BIS(stream);
}
UTF8.prototype = {
  /**
   * Decodes the next code point from the stream.
   *
   * @returns the numeric code point, or -1 if the very first byte could not
   *   be read (treated as end of stream)
   * @throws NS_ERROR_ILLEGAL_VALUE if the bytes are not a valid, shortest-form
   *   UTF-8 sequence; a failure to read a byte after the first propagates
   *   whatever the underlying stream throws
   */
  readUnit: function() {
    var input = this._stream;

    // Only a failure on the FIRST byte means a clean end of stream; failing
    // later, in the middle of a character, must not be mistaken for it.
    var lead;
    try {
      lead = input.read8();
    } catch (e) {
      return -1;
    }

    // 0xxxxxxx: single-byte sequence
    if (lead < 0x80)
      return lead;

    // 10xxxxxx: a continuation byte cannot start a sequence
    if (lead < 0xC0)
      throw NS_ERROR_ILLEGAL_VALUE;

    // Reads one continuation byte (10xxxxxx) and returns its six payload bits.
    var nextPayload = function() {
      var b = input.read8();
      if (b < 0x80 || b >= 0xC0)
        throw NS_ERROR_ILLEGAL_VALUE;
      return b & 0x3F;
    };

    var codePoint;
    var p2 = nextPayload();

    if (lead < 0xE0) {
      // 110xxxxx: two bytes, U+000080 to U+0007FF
      codePoint = ((lead & 0x1F) << 6) + p2;
      // below U+0080 would be an overlong encoding; the bit widths already
      // cap the upper end
      if (codePoint < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE;
      return codePoint;
    }

    var p3 = nextPayload();

    if (lead < 0xF0) {
      // 1110xxxx: three bytes, U+000800 to U+00FFFF minus the surrogates
      codePoint = ((lead & 0x0F) << 12) + (p2 << 6) + p3;
      var isOverlong = codePoint < 0x800;
      var isSurrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
      if (isOverlong || isSurrogate)
        throw NS_ERROR_ILLEGAL_VALUE;
      return codePoint;
    }

    var p4 = nextPayload();

    if (lead < 0xF8) {
      // 11110xxx: four bytes, U+010000 to U+10FFFF
      codePoint = ((lead & 0x07) << 18) + (p2 << 12) + (p3 << 6) + p4;
      // explicit upper bound needed: 0x10FFFF is not of the form 2^n - 1
      if (codePoint < 0x10000 || codePoint > 0x10FFFF)
        throw NS_ERROR_ILLEGAL_VALUE;
      return codePoint;
    }

    // 11111xxx: no UTF-8 mapping
    throw NS_ERROR_ILLEGAL_VALUE;
  }
};
|
346 |
|
/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap; it is read through a BIS (binary input stream)
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark (the mark is then consumed here, and the test is
 *   failed via do_throw() if it is missing)
 */
function UTF16(stream, bigEndian) {
  this._stream = new BIS(stream);
  // arguments.length (not a check for undefined) is what distinguishes
  // "endianness given" from "detect it from the BOM".
  if (arguments.length > 1) {
    this._bigEndian = bigEndian;
  } else {
    // NOTE(review): relies on read16() combining the two bytes big-endian,
    // so the byte sequence FE FF reads back as 0xFEFF -- confirm.
    var bom = this._stream.read16();
    if (bom == 0xFEFF)
      this._bigEndian = true;
    else if (bom == 0xFFFE)
      this._bigEndian = false;
    else
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype = {
  // combines two bytes, given in stream order, into one 16-bit code unit
  // according to this reader's byte order
  _toUnit: function(b1, b2) {
    return this._bigEndian
         ? (b1 << 8) + b2
         : (b2 << 8) + b1;
  },

  // returns numeric code point at front of stream encoded in UTF-16,
  // -1 if at end of stream, or throws NS_ERROR_ILLEGAL_VALUE if the code
  // units there do not form a valid UTF-16 code point
  readUnit: function() {
    var str = this._stream;

    // if at end of stream, must distinguish failure to read any bytes
    // (correct behavior) from failure to read some byte after the first
    // in the character
    var b1;
    try {
      b1 = str.read8();
    } catch (e) {
      return -1;
    }
    var b2 = str.read8();

    var w1 = this._toUnit(b1, b2);

    if (w1 > 0xDBFF && w1 < 0xE000) {
      // low (second) surrogate, but expecting none or a high (first) one
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    if (w1 > 0xD7FF && w1 < 0xDC00) {
      // high surrogate: a non-BMP code point, so a low surrogate must follow
      b1 = str.read8();
      b2 = str.read8();
      var w2 = this._toUnit(b1, b2);
      if (w2 < 0xDC00 || w2 > 0xDFFF)
        throw NS_ERROR_ILLEGAL_VALUE;

      // The high surrogate carries the upper ten bits and the low surrogate
      // the lower ten, offset by 0x10000. The largest possible result is
      // 0x10000 + 0xFFFFF == 0x10FFFF, so no upper bounds-check is needed.
      return 0x10000 +
             ((lowbits(10) & w1) << 10) +
             (lowbits(10) & w2);
    }

    // non-surrogate
    return w1;
  }
};