intl/uconv/tests/unit/test_charset_conversion.js

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

// Shorthand handles for the XPCOM class and interface tables.
const Cc = Components.classes;
const Ci = Components.interfaces;

// Value thrown by the UTF8/UTF16 readers in this file on malformed input.
const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// Stream constructor shortcuts; each one is assigned in run_test().
var BIS;
var BOS;
var _Pipe;
var COS;
var FIS;
var _SS;
var CIS;

// Directory holding the test data files; assigned in run_test().
var dataDir;
michael@0 9
/**
 * Test entry point: binds the stream constructor shortcuts, resolves the
 * data directory and then runs every conversion test in order.
 */
function run_test() {
  // Each shortcut is bound to (contract ID, interface name, init method).
  BIS = Components.Constructor(
    "@mozilla.org/binaryinputstream;1", "nsIBinaryInputStream", "setInputStream");
  BOS = Components.Constructor(
    "@mozilla.org/binaryoutputstream;1", "nsIBinaryOutputStream", "setOutputStream");
  _Pipe = Components.Constructor(
    "@mozilla.org/pipe;1", "nsIPipe", "init");
  COS = Components.Constructor(
    "@mozilla.org/intl/converter-output-stream;1", "nsIConverterOutputStream", "init");
  FIS = Components.Constructor(
    "@mozilla.org/network/file-input-stream;1", "nsIFileInputStream", "init");
  _SS = Components.Constructor(
    "@mozilla.org/storagestream;1", "nsIStorageStream", "init");
  CIS = Components.Constructor(
    "@mozilla.org/intl/converter-input-stream;1", "nsIConverterInputStream", "init");

  dataDir = do_get_file("data/");

  // Run the individual tests sequentially, in the same order as before.
  [
    test_utf8_1,
    test_utf16_1,
    test_utf16_2,
    test_utf16_3,
    test_cross_conversion
  ].forEach(function (runCase) {
    runCase();
  });
}
michael@0 42
// Sample strings pushed through the converter output stream by the
// test_utf8_1 / test_utf16_* tests.
const UNICODE_STRINGS = [
  '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

  [
    'AZaz09 \u007F ',                // U+000000 to U+00007F
    '\u0080 \u0398 \u03BB \u0725 ',  // U+000080 to U+0007FF
    '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF
  ].join('')

  // No samples with non-BMP code points are listed: JS strings expose
  // 16-bit units only, and String.prototype.charCodeAt() returns the
  // individual surrogate halves rather than the decoded code point, so
  // comparing against decoded streams would need extra gymnastics.
];
michael@0 58
// Conversion-equality fixtures. Each key names a data file holding the same
// Unicode text; each value is that file's encoding, spelled the way
// nsIConverter(In|Out)putStream.init expects it.
const UNICODE_FILES = {
  "unicode-conversion.utf8.txt":    "UTF-8",
  "unicode-conversion.utf16.txt":   "UTF-16",
  "unicode-conversion.utf16le.txt": "UTF-16LE",
  "unicode-conversion.utf16be.txt": "UTF-16BE"
};
michael@0 69
/**
 * Writes every sample string through a UTF-8 converter output stream into a
 * pipe, decodes the piped bytes with the UTF8 reader and checks that the
 * decoded units match the units of the original string.
 */
function test_utf8_1() {
  UNICODE_STRINGS.forEach(function (sample, index) {
    var pipe = Pipe();
    var writer = new COS(pipe.outputStream, "UTF-8", 1024, 0x0);
    do_check_true(writer.writeString(sample));
    writer.close();

    var decoder = new UTF8(pipe.inputStream);
    var roundTripped = equal(decoder, stringToCodePoints(sample));
    if (!roundTripped) {
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
    }
  });
}
michael@0 84
/**
 * Writes every sample string through a "UTF-16" converter output stream into
 * a pipe and decodes it with the UTF16 reader. No endianness is passed, so
 * the reader determines it from the byte-order mark at the start of the
 * stream.
 */
function test_utf16_1() {
  UNICODE_STRINGS.forEach(function (sample, index) {
    var pipe = Pipe();
    var writer = new COS(pipe.outputStream, "UTF-16", 1024, 0x0);
    do_check_true(writer.writeString(sample));
    writer.close();

    var decoder = new UTF16(pipe.inputStream);
    var roundTripped = equal(decoder, stringToCodePoints(sample));
    if (!roundTripped) {
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
    }
  });
}
michael@0 99
/**
 * Writes every sample string through a "UTF-16LE" converter output stream
 * into a pipe and decodes it with the UTF16 reader in little-endian mode.
 */
function test_utf16_2() {
  UNICODE_STRINGS.forEach(function (sample, index) {
    var pipe = Pipe();
    var writer = new COS(pipe.outputStream, "UTF-16LE", 1024, 0x0);
    do_check_true(writer.writeString(sample));
    writer.close();

    var decoder = new UTF16(pipe.inputStream, false);
    var roundTripped = equal(decoder, stringToCodePoints(sample));
    if (!roundTripped) {
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
    }
  });
}
michael@0 114
/**
 * Writes every sample string through a "UTF-16BE" converter output stream
 * into a pipe and decodes it with the UTF16 reader in big-endian mode.
 */
function test_utf16_3() {
  UNICODE_STRINGS.forEach(function (sample, index) {
    var pipe = Pipe();
    var writer = new COS(pipe.outputStream, "UTF-16BE", 1024, 0x0);
    do_check_true(writer.writeString(sample));
    writer.close();

    var decoder = new UTF16(pipe.inputStream, true);
    var roundTripped = equal(decoder, stringToCodePoints(sample));
    if (!roundTripped) {
      do_throw("UNICODE_STRINGS[" + index + "] not handled correctly");
    }
  });
}
michael@0 129
michael@0 130
/**
 * For every pair of files in UNICODE_FILES, decodes the raw bytes of the
 * first file (buffered in a storage stream) using the first file's encoding,
 * decodes the second file directly from disk, and checks that both produce
 * the same text.
 *
 * Every stream opened here is closed again, including when a comparison
 * throws, so no stream is left open between iterations.
 */
function test_cross_conversion() {
  for (var fn1 in UNICODE_FILES) {
    // Copy the raw bytes of fn1 into a storage stream so they can be re-read
    // once per inner iteration.
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();
    var bos = new BOS(ss.getOutputStream(0));
    try {
      var av;
      while ((av = fin.available()) > 0) {
        var data = fin.readByteArray(av);
        bos.writeByteArray(data, data.length);
      }
    } finally {
      fin.close();
      bos.close();
    }

    for (var fn2 in UNICODE_FILES) {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      var same;
      try {
        var unichar = new CIS(ss.newInputStream(0),
                              UNICODE_FILES[fn1], 8192, 0x0);
        try {
          same = equalUnicharStreams(unichar, fin2);
        } finally {
          unichar.close();
        }
      } finally {
        fin2.close();
      }

      if (!same) {
        do_throw("unequal streams: " +
                 UNICODE_FILES[fn1] + ", " +
                 UNICODE_FILES[fn2]);
      }
    }
  }
}
michael@0 161
michael@0 162
michael@0 163 // utility functions
michael@0 164
/**
 * Creates a storage stream initialised with (8192, 2^32 - 1, null).
 *
 * @returns a new instance built with the _SS constructor shortcut
 */
function StorageStream() {
  var firstArg = 8192;
  var secondArg = 0xFFFFFFFF; // same value as Math.pow(2, 32) - 1
  return new _SS(firstArg, secondArg, null);
}
michael@0 169
/**
 * Opens a file from the test data directory and wraps it in a converter
 * input stream that decodes it with the given encoding.
 *
 * @param filename
 *   name of the file, relative to dataDir
 * @param encoding
 *   encoding name passed to the converter input stream
 * @returns a CIS instance reading from the file
 */
function getUnicharInputStream(filename, encoding) {
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // Written with the 0o prefix: the legacy form 0644 is rejected as a
  // SyntaxError in strict-mode code. The value (420) is unchanged.
  const FILE_PERMS = 0o644;
  var fis = new FIS(file, PR_RDONLY, FILE_PERMS,
                    Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new CIS(fis, encoding, 8192, 0x0);
}
michael@0 179
/**
 * Opens a file from the test data directory and wraps it in a binary input
 * stream.
 *
 * @param filename
 *   name of the file, relative to dataDir
 * @param encoding
 *   unused; kept so existing call sites keep working
 * @returns a BIS instance reading from the file
 */
function getBinaryInputStream(filename, encoding) {
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // Written with the 0o prefix: the legacy form 0644 is rejected as a
  // SyntaxError in strict-mode code. The value (420) is unchanged.
  const FILE_PERMS = 0o644;
  var fis = new FIS(file, PR_RDONLY, FILE_PERMS,
                    Ci.nsIFileInputStream.CLOSE_ON_EOF);
  return new BIS(fis);
}
michael@0 189
/**
 * Checks that a reader yields exactly the expected sequence of units.
 *
 * @param stream
 *   object whose readUnit() returns the next unit, or a negative number once
 *   the stream is exhausted
 * @param codePoints
 *   array of expected unit values
 * @returns true if the stream produced every expected value, in order, and
 *   nothing else; false otherwise
 */
function equal(stream, codePoints) {
  var matched = 0;
  var unit = stream.readUnit();

  // A negative unit signals end of stream.
  while (!(unit < 0)) {
    if (unit !== codePoints[matched]) {
      return false;
    }
    matched++;
    unit = stream.readUnit();
  }

  // The stream is exhausted: it matches only if nothing expected is left.
  return matched == codePoints.length;
}
michael@0 205
/**
 * Checks whether two character streams yield the same text.
 *
 * Both streams are read to exhaustion and the complete texts are compared,
 * so the result does not depend on how each stream happens to split its
 * content across individual readString() calls.
 *
 * @param s1
 *   first stream; must provide readString(count, outObj), which stores the
 *   characters read in outObj.value and returns how many were read (0 at
 *   end of stream)
 * @param s2
 *   second stream, same contract as s1
 * @returns true if both streams produced identical text, false otherwise
 */
function equalUnicharStreams(s1, s2) {
  // Reads |stream| until readString() reports 0 and returns all text read.
  function drain(stream) {
    var chunk = {};
    var text = "";
    while (stream.readString(1024, chunk) > 0) {
      text += chunk.value;
    }
    return text;
  }

  var text1 = drain(s1);
  var text2 = drain(s2);

  if (text1 != text2) {
    // Diagnostic output: total number of characters read from each stream.
    print("r1: " + text1.length + ", r2: " + text2.length);
    return false;
  }
  return true;
}
michael@0 229
/**
 * Converts a string into an array of numbers, one per 16-bit unit.
 *
 * Note that the values come from charCodeAt(), so a character outside the
 * BMP contributes its two surrogate halves, not one code point.
 *
 * @param str
 *   the string to convert
 * @returns array of numeric unit values, in string order
 */
function stringToCodePoints(str) {
  var units = [];
  for (var pos = 0; pos < str.length; pos++) {
    units.push(str.charCodeAt(pos));
  }
  return units;
}
michael@0 234
/**
 * Builds a mask with the n lowest bits set.
 *
 * @param n
 *   number of low bits to set
 * @returns 2^n - 1
 */
function lowbits(n) {
  var powerOfTwo = Math.pow(2, n);
  return powerOfTwo - 1;
}
michael@0 239
/**
 * Creates a pipe initialised with (false, false, 1024, 10, null).
 *
 * @returns a new instance built with the _Pipe constructor shortcut
 */
function Pipe() {
  var initArgs = [false, false, 1024, 10, null];
  return new _Pipe(initArgs[0], initArgs[1], initArgs[2], initArgs[3], initArgs[4]);
}
michael@0 244
michael@0 245
michael@0 246 // complex charset readers
michael@0 247
/**
 * Reader that exposes the Unicode code points encoded in a UTF-8 byte stream.
 *
 * @param stream
 *   the byte stream to wrap
 */
function UTF8(stream) {
  this._stream = new BIS(stream);
}
UTF8.prototype = {
  /**
   * Decodes the next code point from the stream.
   *
   * @returns the numeric code point, or -1 if the stream has no more bytes
   * @throws NS_ERROR_ILLEGAL_VALUE if the bytes at the front of the stream
   *   are not a valid, properly (shortest-form) encoded code point; an error
   *   raised by the stream itself when it ends in the middle of a character
   *   propagates unchanged
   */
  readUnit: function () {
    var input = this._stream;

    // Only a failure to read the very first byte means "end of stream"; a
    // failure on any later byte of the character must surface as an error.
    var lead;
    try {
      lead = input.read8();
    } catch (e) {
      return -1;
    }

    // 0xxxxxxx: single-byte character.
    if (lead < 0x80) {
      return lead;
    }

    // 10xxxxxx: a lead byte needs at least two leading one bits.
    if (lead < 0xC0) {
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    // Reads one trailing byte, insists on the 10xxxxxx form and returns its
    // six payload bits.
    var readTrail = function () {
      var trail = input.read8();
      if (trail < 0x80 || trail >= 0xC0) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }
      return lowbits(6) & trail;
    };

    var code;
    var bits2 = readTrail();

    if (lead < 0xE0) {
      // 110xxxxx: two bytes, U+000080 to U+0007FF. The bit layout caps the
      // value at 0x7FF, so only the lower bound (overlong form) is checked.
      code = ((lowbits(5) & lead) << 6) + bits2;
      if (code < 0x80) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }
      return code;
    }

    var bits3 = readTrail();

    if (lead < 0xF0) {
      // 1110xxxx: three bytes, U+000800 to U+00FFFF minus the surrogate
      // range. The bit layout caps the value at 0xFFFF.
      code = ((lowbits(4) & lead) << 12) + (bits2 << 6) + bits3;
      var overlong = code < 0x800;
      var surrogate = code > 0xD7FF && code < 0xE000;
      if (overlong || surrogate) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }
      return code;
    }

    var bits4 = readTrail();

    if (lead < 0xF8) {
      // 11110xxx: four bytes, U+010000 to U+10FFFF. The upper bound needs an
      // explicit check because 0x10FFFF is not of the form 2^n - 1.
      code = ((lowbits(3) & lead) << 18) + (bits2 << 12) + (bits3 << 6) + bits4;
      if (code < 0x10000 || code > 0x10FFFF) {
        throw NS_ERROR_ILLEGAL_VALUE;
      }
      return code;
    }

    // 11111000 or greater: no UTF-8 mapping.
    throw NS_ERROR_ILLEGAL_VALUE;
  }
};
michael@0 346
/**
 * Reader that exposes the Unicode code points encoded in a UTF-16 byte
 * stream.
 *
 * @param stream
 *   the byte stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE; omit the argument entirely for
 *   UTF-16 with a byte-order mark, in which case the mark is read from the
 *   stream immediately and decides the byte order
 */
function UTF16(stream, bigEndian) {
  this._stream = new BIS(stream);

  // The number of arguments (not the value) decides the mode, so that an
  // explicit |false| is not mistaken for "detect from BOM".
  if (arguments.length > 1) {
    this._bigEndian = bigEndian;
  } else {
    var bom = this._stream.read16();
    if (bom == 0xFEFF) {
      this._bigEndian = true;
    } else if (bom == 0xFFFE) {
      this._bigEndian = false;
    } else {
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
    }
  }
}
UTF16.prototype = {
  /**
   * Decodes the next code point from the stream.
   *
   * @returns the numeric code point, or -1 if the stream has no more bytes
   * @throws NS_ERROR_ILLEGAL_VALUE on a lone trail surrogate or on a lead
   *   surrogate that is not followed by a trail surrogate; an error raised
   *   by the stream itself when it ends in the middle of a character
   *   propagates unchanged
   */
  readUnit: function () {
    var str = this._stream;
    var bigEndian = this._bigEndian;

    // Combines two bytes, given in stream order, into one 16-bit word.
    var toWord = function (first, second) {
      return bigEndian ? (first << 8) + second
                       : (second << 8) + first;
    };

    // Only a failure to read the very first byte means "end of stream"; a
    // failure on any later byte of the character must surface as an error.
    var b1;
    try {
      b1 = str.read8();
    } catch (e) {
      return -1;
    }
    var b2 = str.read8();
    var w1 = toWord(b1, b2);

    if (w1 > 0xDBFF && w1 < 0xE000) {
      // trail surrogate where a lead surrogate or a BMP unit was expected
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    if (w1 < 0xD800 || w1 > 0xDBFF) {
      // not a surrogate: the word is the code point
      return w1;
    }

    // Lead surrogate: the next word must be the matching trail surrogate.
    b1 = str.read8();
    b2 = str.read8();
    var w2 = toWord(b1, b2);
    if (w2 < 0xDC00 || w2 > 0xDFFF) {
      throw NS_ERROR_ILLEGAL_VALUE;
    }

    // The lead surrogate carries the high ten bits and the trail surrogate
    // the low ten bits of (code point - 0x10000). The result is at most
    // 0x10FFFF by construction, so no upper-bound check is needed.
    return 0x10000 + ((w1 & 0x3FF) << 10) + (w2 & 0x3FF);
  }
};

mercurial