/*
 * Unit tests for charset conversion through nsIConverterOutputStream and
 * nsIConverterInputStream
 * (intl/uconv/tests/unit/test_charset_conversion.js).
 */

const Cc = Components.classes;
const Ci = Components.interfaces;

const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// Constructors for the XPCOM components used by the tests; they are bound in
// run_test() and used by the helpers throughout this file.
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Directory containing the test data files; set in run_test().
var dataDir;

/**
 * Test entry point: binds the component constructors, locates the data
 * directory and then runs every test in this file in a fixed order.
 */
function run_test()
{
  BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
                               "nsIBinaryInputStream",
                               "setInputStream");
  BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
                               "nsIBinaryOutputStream",
                               "setOutputStream");
  _Pipe = Components.Constructor("@mozilla.org/pipe;1",
                                 "nsIPipe",
                                 "init");
  COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
                               "nsIConverterOutputStream",
                               "init");
  FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
                               "nsIFileInputStream",
                               "init");
  _SS = Components.Constructor("@mozilla.org/storagestream;1",
                               "nsIStorageStream",
                               "init");
  CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                               "nsIConverterInputStream",
                               "init");

  dataDir = do_get_file("data/");

  test_utf8_1();
  test_utf16_1();
  test_utf16_2();
  test_utf16_3();
  test_cross_conversion();
}

// Strings that are written through a converter output stream and then decoded
// again by the readers in this file.
const UNICODE_STRINGS =
  [
    '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

    'AZaz09 \u007F ' +               // U+000000 to U+00007F
    '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
    '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF

    // No strings with non-BMP code points are included here: JS strings are
    // sequences of 16-bit code units (String.prototype.charCodeAt() returns
    // surrogates individually), which makes such data awkward to express and
    // compare.
  ];

// test conversion equality -- keys are names of files containing equivalent
// Unicode data, values are the encoding of the file in the format expected by
// nsIConverter(In|Out)putStream.init
const UNICODE_FILES =
  {
    "unicode-conversion.utf8.txt":    "UTF-8",
    "unicode-conversion.utf16.txt":   "UTF-16",
    "unicode-conversion.utf16le.txt": "UTF-16LE",
    "unicode-conversion.utf16be.txt": "UTF-16BE"
  };

/**
 * Writes every entry of UNICODE_STRINGS through a converter output stream
 * created for |charset| and checks that the bytes which come out of the pipe
 * decode to the values expected for that string.
 *
 * @param charset
 *   the encoding name handed to the converter output stream
 * @param makeReader
 *   function which receives the input end of the pipe and returns an object
 *   with a readUnit() method, as consumed by equal()
 */
function check_string_round_trip(charset, makeReader)
{
  for (var i = 0; i < UNICODE_STRINGS.length; i++)
  {
    var pipe = Pipe();
    var conv = new COS(pipe.outputStream, charset, 1024, 0x0);
    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    conv.close();

    if (!equal(makeReader(pipe.inputStream),
               stringToCodePoints(UNICODE_STRINGS[i])))
      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
  }
}

/** Round-trips UNICODE_STRINGS through a "UTF-8" converter output stream. */
function test_utf8_1()
{
  check_string_round_trip("UTF-8", function(stream)
  {
    return new UTF8(stream);
  });
}

/**
 * Round-trips UNICODE_STRINGS through a "UTF-16" converter output stream; the
 * reader is constructed with a single argument so that it determines the byte
 * order from the byte-order mark.
 */
function test_utf16_1()
{
  check_string_round_trip("UTF-16", function(stream)
  {
    return new UTF16(stream);
  });
}

/** Round-trips UNICODE_STRINGS through "UTF-16LE" (little-endian reader). */
function test_utf16_2()
{
  check_string_round_trip("UTF-16LE", function(stream)
  {
    return new UTF16(stream, false);
  });
}

/** Round-trips UNICODE_STRINGS through "UTF-16BE" (big-endian reader). */
function test_utf16_3()
{
  check_string_round_trip("UTF-16BE", function(stream)
  {
    return new UTF16(stream, true);
  });
}


/**
 * For every file in UNICODE_FILES, copies the file's raw bytes into a storage
 * stream, then decodes those bytes with that file's encoding and checks that
 * the resulting text equals the decoded contents of each file in
 * UNICODE_FILES (including itself).
 */
function test_cross_conversion()
{
  for (var fn1 in UNICODE_FILES)
  {
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();

    // copy the whole file into the storage stream so that it can be re-read
    // from the beginning once per comparison below
    var bos = new BOS(ss.getOutputStream(0));
    var av;
    while ((av = fin.available()) > 0)
    {
      var data = fin.readByteArray(av);
      bos.writeByteArray(data, data.length);
    }
    fin.close();
    bos.close();

    for (var fn2 in UNICODE_FILES)
    {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      var unichar = new CIS(ss.newInputStream(0),
                            UNICODE_FILES[fn1], 8192, 0x0);

      if (!equalUnicharStreams(unichar, fin2))
        do_throw("unequal streams: " +
                 UNICODE_FILES[fn1] + ", " +
                 UNICODE_FILES[fn2]);
    }
  }
}


// utility functions

/** Returns a newly initialized storage stream. */
function StorageStream()
{
  return new _SS(8192, Math.pow(2, 32) - 1, null);
}

/**
 * Opens |filename| from the data directory and wraps it in a converter input
 * stream which decodes it as |encoding|.
 *
 * @param filename
 *   leaf name of the file inside dataDir
 * @param encoding
 *   encoding name handed to the converter input stream
 * @returns the converter input stream reading the file
 */
function getUnicharInputStream(filename, encoding)
{
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // 0o644 rather than the legacy literal 0644: the legacy octal form is a
  // SyntaxError in strict-mode code.
  var fis = new FIS(file, PR_RDONLY, 0o644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new CIS(fis, encoding, 8192, 0x0);
}

/**
 * Opens |filename| from the data directory as a binary input stream.
 *
 * @param filename
 *   leaf name of the file inside dataDir
 * @param encoding
 *   not used by this function
 * @returns a BIS wrapping a read-only file input stream for the file
 */
function getBinaryInputStream(filename, encoding)
{
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // 0o644 rather than the legacy literal 0644: the legacy octal form is a
  // SyntaxError in strict-mode code.
  var fis = new FIS(file, PR_RDONLY, 0o644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new BIS(fis);
}

/**
 * Checks that the values produced by |stream| are exactly |codePoints|.
 *
 * @param stream
 *   object with a readUnit() method returning the next value, or a negative
 *   number once the stream is exhausted
 * @param codePoints
 *   array of the expected values, in order
 * @returns true if the stream yields every element of |codePoints| and
 *   nothing else, false otherwise
 */
function equal(stream, codePoints)
{
  var currIndex = 0;
  while (true)
  {
    var unit = stream.readUnit();
    if (unit < 0)
      return currIndex === codePoints.length; // both must end together
    if (unit !== codePoints[currIndex++])
      return false;
  }
}

/**
 * Compares two unichar input streams chunk by chunk.
 *
 * @param s1
 *   first stream; must provide readString(count, outparam)
 * @param s2
 *   second stream; must provide readString(count, outparam)
 * @returns true if every pair of reads returns the same count and the same
 *   text until both reads return 0, false (after printing diagnostics) at the
 *   first difference
 */
function equalUnicharStreams(s1, s2)
{
  var r1, r2;
  var str1 = {}, str2 = {};
  while (true)
  {
    r1 = s1.readString(1024, str1);
    r2 = s2.readString(1024, str2);

    if (r1 !== r2 || str1.value !== str2.value)
    {
      print("r1: " + r1 + ", r2: " + r2);
      print(str1.value.length);
      print(str2.value.length);
      return false;
    }
    // r1 === r2 here, so one check covers both streams
    if (r1 === 0)
      return true;
  }
}

/**
 * Converts a JS string into an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * the single non-BMP code point it encodes, which is what the UTF8 and UTF16
 * readers in this file return for such data.  Unpaired surrogates are passed
 * through as their 16-bit values.
 *
 * @param str
 *   the string to convert
 * @returns array of numbers, one per code point
 */
function stringToCodePoints(str)
{
  var codePoints = [];
  for (var i = 0; i < str.length; i++)
  {
    var unit = str.charCodeAt(i);
    if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < str.length)
    {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF)
      {
        unit = 0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00);
        i++; // the low surrogate has been consumed as well
      }
    }
    codePoints.push(unit);
  }
  return codePoints;
}

/**
 * Returns a number whose |n| least significant bits are set (2**n - 1).
 */
function lowbits(n)
{
  return Math.pow(2, n) - 1;
}

/** Returns a newly initialized pipe. */
function Pipe()
{
  return new _Pipe(false, false, 1024, 10, null);
}


// complex charset readers

/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 */
function UTF8(stream)
{
  this._stream = new BIS(stream);
}
UTF8.prototype =
  {
    // returns numeric code point at front of stream encoded in UTF-8, -1 if at
    // end of stream, or throws if valid (and properly encoded!) code point not
    // found
    readUnit: function()
    {
      var str = this._stream;

      var c, c2, c3, c4, rv;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        c = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      if (c < 0x80)
        return c;

      if (c < 0xC0) // c < 11000000
      {
        // byte doesn't have enough leading ones (must be at least two)
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      if (c >= 0xF8) // c >= 11111000
      {
        // no UTF-8 mapping; reject before consuming any further bytes so
        // that a short stream cannot mask this error with a read failure
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c2 = str.read8();
      if (c2 >= 0xC0 || c2 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xE0) // c < 11100000
      {
        // two-byte between U+000080 and U+0007FF
        rv = ((lowbits(5) & c) << 6) +
             (lowbits(6) & c2);
        // no upper bounds-check needed, by previous lines; the lower bound
        // rejects overlong encodings
        if (rv >= 0x80)
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c3 = str.read8();
      if (c3 >= 0xC0 || c3 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xF0) // c < 11110000
      {
        // three-byte between U+000800 and U+00FFFF
        rv = ((lowbits(4) & c) << 12) +
             ((lowbits(6) & c2) << 6) +
             (lowbits(6) & c3);
        // no upper bounds-check needed, by previous lines; values below
        // 0x800 (overlong) and surrogates (0xD800-0xDFFF) are rejected
        if (rv >= 0xE000 ||
            (rv >= 0x800 && rv <= 0xD7FF))
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c4 = str.read8();
      if (c4 >= 0xC0 || c4 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      // four-byte between U+010000 and U+10FFFF (c < 11111000 was
      // established above)
      rv = ((lowbits(3) & c) << 18) +
           ((lowbits(6) & c2) << 12) +
           ((lowbits(6) & c3) << 6) +
           (lowbits(6) & c4);
      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
      if (rv >= 0x10000 && rv <= 0x10FFFF)
        return rv;
      throw NS_ERROR_ILLEGAL_VALUE;
    }
  };

/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark
 */
function UTF16(stream, bigEndian)
{
  this._stream = new BIS(stream);
  // the argument count (not the value) selects BOM detection, so an explicit
  // second argument is always honored
  if (arguments.length > 1)
  {
    this._bigEndian = bigEndian;
  }
  else
  {
    // the comparisons below rely on read16() assembling the two bytes with
    // the first byte as the high-order byte
    var bom = this._stream.read16();
    if (bom == 0xFEFF)
      this._bigEndian = true;
    else if (bom == 0xFFFE)
      this._bigEndian = false;
    else
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype =
  {
    // combines two bytes, in stream order, into one 16-bit code unit
    // according to the byte order of this stream
    _makeWord: function(first, second)
    {
      return this._bigEndian
           ? (first << 8) + second
           : (second << 8) + first;
    },

    // returns numeric code point at front of stream encoded in UTF-16,
    // -1 if at end of stream, or throws if UTF-16 code point not found
    readUnit: function()
    {
      var str = this._stream;
      var b1, b2;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        b1 = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      b2 = str.read8();
      var w1 = this._makeWord(b1, b2);

      if (w1 > 0xDBFF && w1 < 0xE000)
      {
        // second surrogate, but expecting none or first
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      if (w1 > 0xD7FF && w1 < 0xDC00)
      {
        // non-BMP, use surrogate pair
        b1 = str.read8();
        b2 = str.read8();
        var w2 = this._makeWord(b1, b2);
        if (w2 < 0xDC00 || w2 > 0xDFFF)
          throw NS_ERROR_ILLEGAL_VALUE;

        // the first (high) surrogate carries the upper ten bits and the
        // second (low) surrogate the lower ten bits of (code point - 0x10000);
        // the largest possible result is exactly 0x10FFFF, so no upper
        // bounds-check is needed
        return 0x10000 +
               ((lowbits(10) & w1) << 10) +
               (lowbits(10) & w2);
      }

      // non-surrogate
      return w1;
    }
  };