const Cc = Components.classes;
const Ci = Components.interfaces;

const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;

// Component constructors; assigned in run_test() before any test uses them.
var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;

// Directory holding the unicode-conversion.* data files; set in run_test().
var dataDir;

/**
 * Test entry point: builds the component constructors, locates the data
 * directory and then runs every test in this file in sequence.
 */
function run_test()
{
  BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
                               "nsIBinaryInputStream",
                               "setInputStream");
  BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
                               "nsIBinaryOutputStream",
                               "setOutputStream");
  _Pipe = Components.Constructor("@mozilla.org/pipe;1",
                                 "nsIPipe",
                                 "init");
  COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
                               "nsIConverterOutputStream",
                               "init");
  FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
                               "nsIFileInputStream",
                               "init");
  _SS = Components.Constructor("@mozilla.org/storagestream;1",
                               "nsIStorageStream",
                               "init");
  CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
                               "nsIConverterInputStream",
                               "init");

  dataDir = do_get_file("data/");

  test_utf8_1();
  test_utf16_1();
  test_utf16_2();
  test_utf16_3();
  test_cross_conversion();
}

const UNICODE_STRINGS =
  [
    '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',

    'AZaz09 \u007F ' +               // U+000000 to U+00007F
    '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
    '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF

    // No strings containing non-BMP code points are listed here.  JS strings
    // are sequences of 16-bit code units, so a non-BMP code point appears in
    // a string as a surrogate pair; stringToCodePoints() combines such pairs
    // into a single code point so the expected values line up with what the
    // UTF8/UTF16 readers below return.
    // NOTE(review): whether the converter streams round-trip non-BMP text is
    // not exercised by this file -- confirm before adding such strings.
  ];

// test conversion equality -- keys are names of files containing equivalent
// Unicode data, values are the encoding of the file in the format expected by
// nsIConverter(In|Out)putStream.init
const UNICODE_FILES =
  {
    "unicode-conversion.utf8.txt":    "UTF-8",
    "unicode-conversion.utf16.txt":   "UTF-16",
    "unicode-conversion.utf16le.txt": "UTF-16LE",
    "unicode-conversion.utf16be.txt": "UTF-16BE"
  };

/**
 * Writes every entry of UNICODE_STRINGS through a converter output stream
 * using the given charset and checks that decoding the produced bytes yields
 * the code points of the original string.
 *
 * @param charset
 *   charset name passed to the converter output stream
 * @param makeReader
 *   function taking the pipe's input stream and returning an object with a
 *   readUnit() method (a UTF8 or UTF16 wrapper)
 */
function checkEncodedStrings(charset, makeReader)
{
  for (var i = 0; i < UNICODE_STRINGS.length; i++)
  {
    var pipe = Pipe();
    var conv = new COS(pipe.outputStream, charset, 1024, 0x0);
    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    // closing the converter lets the reader see end-of-stream
    conv.close();

    if (!equal(makeReader(pipe.inputStream),
               stringToCodePoints(UNICODE_STRINGS[i])))
      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
  }
}

/** Checks conversion of UNICODE_STRINGS to UTF-8. */
function test_utf8_1()
{
  checkEncodedStrings("UTF-8", function(input)
  {
    return new UTF8(input);
  });
}

/**
 * Checks conversion of UNICODE_STRINGS to "UTF-16"; the reader is built
 * without an endianness argument, so it requires a byte-order mark.
 */
function test_utf16_1()
{
  checkEncodedStrings("UTF-16", function(input)
  {
    return new UTF16(input);
  });
}

/** Checks conversion of UNICODE_STRINGS to UTF-16LE (no byte-order mark). */
function test_utf16_2()
{
  checkEncodedStrings("UTF-16LE", function(input)
  {
    return new UTF16(input, false);
  });
}

/** Checks conversion of UNICODE_STRINGS to UTF-16BE (no byte-order mark). */
function test_utf16_3()
{
  checkEncodedStrings("UTF-16BE", function(input)
  {
    return new UTF16(input, true);
  });
}


/**
 * For every pair of files in UNICODE_FILES, checks that decoding the first
 * file (from an in-memory copy of its bytes) produces the same text as
 * decoding the second file directly from disk.
 */
function test_cross_conversion()
{
  for (var fn1 in UNICODE_FILES)
  {
    // copy the raw bytes of fn1 into a storage stream so that they can be
    // re-read from the start once per comparison below
    var fin = getBinaryInputStream(fn1);
    var ss = StorageStream();

    var bos = new BOS(ss.getOutputStream(0));
    var av;
    while ((av = fin.available()) > 0)
    {
      var data = fin.readByteArray(av);
      bos.writeByteArray(data, data.length);
    }
    fin.close();
    bos.close();

    for (var fn2 in UNICODE_FILES)
    {
      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
      var unichar = new CIS(ss.newInputStream(0),
                            UNICODE_FILES[fn1], 8192, 0x0);

      if (!equalUnicharStreams(unichar, fin2))
        do_throw("unequal streams: " +
                 UNICODE_FILES[fn1] + ", " +
                 UNICODE_FILES[fn2]);
    }
  }
}


// utility functions

/**
 * Creates a storage stream with 8192-byte segments and a maximum size of
 * 2**32 - 1 bytes.
 */
function StorageStream()
{
  return new _SS(8192, Math.pow(2, 32) - 1, null);
}

/**
 * Opens the named file in the data directory as a read-only file input
 * stream which closes itself at end of file.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @returns
 *   the opened file input stream
 */
function openDataFile(filename)
{
  var file = dataDir.clone();
  file.append(filename);

  const PR_RDONLY = 0x1;
  // rw-r--r--; spelled via parseInt because legacy octal literals (0644) are
  // deprecated and are a syntax error in strict-mode code
  const PERMS_FILE = parseInt("644", 8);
  return new FIS(file, PR_RDONLY, PERMS_FILE,
                 Ci.nsIFileInputStream.CLOSE_ON_EOF);
}

/**
 * Returns a converter input stream decoding the named data file.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @param encoding
 *   charset name used to decode the file
 */
function getUnicharInputStream(filename, encoding)
{
  return new CIS(openDataFile(filename), encoding, 8192, 0x0);
}

/**
 * Returns a binary input stream over the raw bytes of the named data file.
 *
 * @param filename
 *   leaf name of a file inside dataDir
 * @param encoding
 *   ignored; kept so the signature parallels getUnicharInputStream
 */
function getBinaryInputStream(filename, encoding)
{
  return new BIS(openDataFile(filename));
}

/**
 * Compares the code points produced by a reader against an expected list.
 *
 * @param stream
 *   object whose readUnit() returns the next code point, or a negative
 *   number at end of stream
 * @param codePoints
 *   array of expected code points
 * @returns
 *   true iff the reader yields exactly the expected code points and then
 *   ends
 */
function equal(stream, codePoints)
{
  var currIndex = 0;
  while (true)
  {
    var unit = stream.readUnit();
    if (unit < 0)
      return currIndex == codePoints.length;
    // running past the end of codePoints compares against undefined, which
    // is never strictly equal to a number
    if (unit !== codePoints[currIndex++])
      return false;
  }
}

/**
 * Reads a unichar stream until readString() reports zero units and returns
 * everything read as one string.
 *
 * @param stream
 *   object providing readString(count, outparam)
 */
function readAllUnichars(stream)
{
  var chunk = {};
  var text = "";
  while (stream.readString(1024, chunk) > 0)
    text += chunk.value;
  return text;
}

/**
 * Checks whether two unichar streams contain the same text.
 *
 * Both streams are read to the end before comparing: a single readString()
 * call may return fewer units than requested, so comparing the two streams
 * chunk-by-chunk could report a mismatch for streams with equal contents.
 *
 * @param s1
 *   first unichar stream
 * @param s2
 *   second unichar stream
 * @returns
 *   true iff both streams produced identical text
 */
function equalUnicharStreams(s1, s2)
{
  var text1 = readAllUnichars(s1);
  var text2 = readAllUnichars(s2);

  if (text1 != text2)
  {
    // diagnostics for the test log
    print("length 1: " + text1.length + ", length 2: " + text2.length);
    return false;
  }
  return true;
}

/**
 * Converts a JS string to an array of Unicode code points.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * the single non-BMP code point the pair encodes; every other code unit
 * (including an unpaired surrogate) is returned unchanged.
 *
 * @param str
 *   the string to convert
 * @returns
 *   array of numeric code points
 */
function stringToCodePoints(str)
{
  var codePoints = [];
  for (var i = 0; i < str.length; i++)
  {
    var unit = str.charCodeAt(i);
    if (unit >= 0xD800 && unit <= 0xDBFF && i + 1 < str.length)
    {
      var next = str.charCodeAt(i + 1);
      if (next >= 0xDC00 && next <= 0xDFFF)
      {
        unit = 0x10000 + ((unit - 0xD800) << 10) + (next - 0xDC00);
        i++; // the low surrogate has been consumed as well
      }
    }
    codePoints.push(unit);
  }
  return codePoints;
}

/**
 * Returns a number whose lowest n bits are set (2**n - 1), for use as a mask.
 */
function lowbits(n)
{
  return Math.pow(2, n) - 1;
}

/**
 * Creates a pipe with blocking input and output, using 10 segments of 1024
 * bytes each.
 */
function Pipe()
{
  return new _Pipe(false, false, 1024, 10, null);
}


// complex charset readers

/**
 * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 */
function UTF8(stream)
{
  this._stream = new BIS(stream);
}
UTF8.prototype =
  {
    // returns numeric code point at front of stream encoded in UTF-8, -1 if at
    // end of stream, or throws if valid (and properly encoded!) code point not
    // found
    readUnit: function()
    {
      var str = this._stream;

      var c, c2, c3, c4, rv;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        c = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      if (c < 0x80)
        return c;

      if (c < 0xC0) // c < 11000000
      {
        // byte doesn't have enough leading ones (must be at least two)
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      if (c >= 0xF8) // c >= 11111000
      {
        // no UTF-8 mapping; reject before consuming any further bytes
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c2 = str.read8();
      if (c2 >= 0xC0 || c2 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xE0) // c < 11100000
      {
        // two-byte between U+000080 and U+0007FF
        rv = ((lowbits(5) & c) << 6) +
              (lowbits(6) & c2);
        // no upper bounds-check needed, by previous lines; the lower bound
        // rejects overlong encodings
        if (rv >= 0x80)
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c3 = str.read8();
      if (c3 >= 0xC0 || c3 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      if (c < 0xF0) // c < 11110000
      {
        // three-byte between U+000800 and U+00FFFF
        rv = ((lowbits(4) & c) << 12) +
             ((lowbits(6) & c2) << 6) +
              (lowbits(6) & c3);
        // no upper bounds-check needed, by previous lines; overlong
        // encodings (< U+0800) and surrogates (U+D800..U+DFFF) are rejected
        if (rv >= 0xE000 ||
            (rv >= 0x800 && rv <= 0xD7FF))
          return rv;
        throw NS_ERROR_ILLEGAL_VALUE;
      }


      c4 = str.read8();
      if (c4 >= 0xC0 || c4 < 0x80)
        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx

      // c < 11111000 here (larger lead bytes were rejected above):
      // four-byte between U+010000 and U+10FFFF
      rv = ((lowbits(3) & c) << 18) +
           ((lowbits(6) & c2) << 12) +
           ((lowbits(6) & c3) << 6) +
            (lowbits(6) & c4);
      // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
      if (rv >= 0x10000 && rv <= 0x10FFFF)
        return rv;
      throw NS_ERROR_ILLEGAL_VALUE;
    }
  };

/**
 * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
 *
 * @param stream
 *   the stream to wrap
 * @param bigEndian
 *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
 *   a byte-order mark
 */
function UTF16(stream, bigEndian)
{
  this._stream = new BIS(stream);
  if (arguments.length > 1)
  {
    this._bigEndian = bigEndian;
  }
  else
  {
    // a read16() result of 0xFEFF is taken to mean big-endian data and 0xFFFE
    // little-endian data
    // NOTE(review): this presumes read16() assembles the two bytes in
    // big-endian order -- confirm against nsIBinaryInputStream
    var bom = this._stream.read16();
    if (bom == 0xFEFF)
      this._bigEndian = true;
    else if (bom == 0xFFFE)
      this._bigEndian = false;
    else
      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
  }
}
UTF16.prototype =
  {
    // reads two bytes and combines them into one 16-bit code unit according
    // to the endianness of this stream
    _readWord: function(firstByte)
    {
      var b1 = firstByte;
      var b2 = this._stream.read8();
      return this._bigEndian
           ? (b1 << 8) + b2
           : (b2 << 8) + b1;
    },

    // returns numeric code point at front of stream encoded in UTF-16,
    // -1 if at end of stream, or throws if UTF-16 code point not found
    readUnit: function()
    {
      var str = this._stream;
      var first;

      // if at end of stream, must distinguish failure to read any bytes
      // (correct behavior) from failure to read some byte after the first
      // in the character
      try
      {
        first = str.read8();
      }
      catch (e)
      {
        return -1;
      }

      var w1 = this._readWord(first);

      if (w1 > 0xDBFF && w1 < 0xE000)
      {
        // second surrogate, but expecting none or first
        throw NS_ERROR_ILLEGAL_VALUE;
      }

      if (w1 > 0xD7FF && w1 < 0xDC00)
      {
        // non-BMP, use surrogate pair
        var w2 = this._readWord(str.read8());
        if (w2 < 0xDC00 || w2 > 0xDFFF)
          throw NS_ERROR_ILLEGAL_VALUE;

        // the high surrogate (w1) carries the upper ten bits and the low
        // surrogate (w2) the lower ten bits of (code point - 0x10000); the
        // result therefore always lies in U+10000..U+10FFFF
        return 0x10000 +
               ((lowbits(10) & w1) << 10) +
                (lowbits(10) & w2);
      }

      // non-surrogate
      return w1;
    }
  };