michael@0: const Cc = Components.classes;    // short alias for Components.classes
michael@0: const Ci = Components.interfaces; // short alias for Components.interfaces
michael@0: 
michael@0: const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE; // thrown by the UTF8/UTF16 readers in this file on malformed input
michael@0: 
michael@0: var BIS, BOS, _Pipe, COS, FIS, _SS, CIS; // component constructors; assigned in run_test()
michael@0: 
michael@0: var dataDir; // assigned in run_test() from do_get_file("data/")
michael@0: 
michael@0: function run_test()
michael@0: {
michael@0:   BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
michael@0:                                "nsIBinaryInputStream",
michael@0:                                "setInputStream");
michael@0:   BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
michael@0:                                "nsIBinaryOutputStream",
michael@0:                                "setOutputStream");
michael@0:   _Pipe = Components.Constructor("@mozilla.org/pipe;1",
michael@0:                                  "nsIPipe",
michael@0:                                  "init");
michael@0:   COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
michael@0:                                "nsIConverterOutputStream",
michael@0:                                "init");
michael@0:   FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
michael@0:                                "nsIFileInputStream",
michael@0:                                "init");
michael@0:   _SS = Components.Constructor("@mozilla.org/storagestream;1",
michael@0:                                "nsIStorageStream",
michael@0:                                "init");
michael@0:   CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
michael@0:                                "nsIConverterInputStream",
michael@0:                                "init");
michael@0: 
michael@0:   dataDir = do_get_file("data/");
michael@0: 
michael@0:   test_utf8_1();
michael@0:   test_utf16_1();
michael@0:   test_utf16_2();
michael@0:   test_utf16_3();
michael@0:   test_cross_conversion();
michael@0: }
michael@0: 
michael@0: const UNICODE_STRINGS = // BMP-only strings written and re-read by the test_utf* functions
michael@0:   [
michael@0:     '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',
michael@0: 
michael@0:     'AZaz09 \u007F ' +               // U+000000 to U+00007F
michael@0:     '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
michael@0:     '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF
michael@0: 
michael@0:     // There would be strings containing non-BMP code points here, but
michael@0:     // unfortunately JS strings are sequences of 16-bit code units (and the
michael@0:     // spec treats them as plain 16-bit values), so working with non-BMP
michael@0:     // text needs gymnastics -- manual surrogate decoding doesn't work
michael@0:     // because String.prototype.charCodeAt() ignores surrogate pairs and
michael@0:     // only returns 16-bit values.
michael@0:   ];
michael@0: 
michael@0: // Test conversion equality -- keys are names of files (looked up in dataDir)
michael@0: // containing equivalent Unicode data; values are the encoding of each file,
michael@0: // in the format expected by nsIConverter(In|Out)putStream.init.
michael@0: const UNICODE_FILES =
michael@0:   {
michael@0:     "unicode-conversion.utf8.txt":            "UTF-8",
michael@0:     "unicode-conversion.utf16.txt":           "UTF-16",
michael@0:     "unicode-conversion.utf16le.txt":         "UTF-16LE",
michael@0:     "unicode-conversion.utf16be.txt":         "UTF-16BE"
michael@0:   };
michael@0: 
michael@0: function test_utf8_1()
michael@0: {
michael@0:   for (var i = 0; i < UNICODE_STRINGS.length; i++)
michael@0:   {
michael@0:     var pipe = Pipe();
michael@0:     var conv = new COS(pipe.outputStream, "UTF-8", 1024, 0x0);
michael@0:     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
michael@0:     conv.close();
michael@0: 
michael@0:     if (!equal(new UTF8(pipe.inputStream),
michael@0:                stringToCodePoints(UNICODE_STRINGS[i])))
michael@0:       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
michael@0:   }
michael@0: }
michael@0: 
michael@0: function test_utf16_1()
michael@0: {
michael@0:   for (var i = 0; i < UNICODE_STRINGS.length; i++)
michael@0:   {
michael@0:     var pipe = Pipe();
michael@0:     var conv = new COS(pipe.outputStream, "UTF-16", 1024, 0x0);
michael@0:     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
michael@0:     conv.close();
michael@0: 
michael@0:     if (!equal(new UTF16(pipe.inputStream),
michael@0:                stringToCodePoints(UNICODE_STRINGS[i])))
michael@0:       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
michael@0:   }
michael@0: }
michael@0: 
michael@0: function test_utf16_2()
michael@0: {
michael@0:   for (var i = 0; i < UNICODE_STRINGS.length; i++)
michael@0:   {
michael@0:     var pipe = Pipe();
michael@0:     var conv = new COS(pipe.outputStream, "UTF-16LE", 1024, 0x0);
michael@0:     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
michael@0:     conv.close();
michael@0: 
michael@0:     if (!equal(new UTF16(pipe.inputStream, false),
michael@0:                stringToCodePoints(UNICODE_STRINGS[i])))
michael@0:       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
michael@0:   }
michael@0: }
michael@0: 
michael@0: function test_utf16_3()
michael@0: {
michael@0:   for (var i = 0; i < UNICODE_STRINGS.length; i++)
michael@0:   {
michael@0:     var pipe = Pipe();
michael@0:     var conv = new COS(pipe.outputStream, "UTF-16BE", 1024, 0x0);
michael@0:     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
michael@0:     conv.close();
michael@0: 
michael@0:     if (!equal(new UTF16(pipe.inputStream, true),
michael@0:                stringToCodePoints(UNICODE_STRINGS[i])))
michael@0:       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
michael@0:   }
michael@0: }
michael@0: 
michael@0: 
michael@0: function test_cross_conversion()
michael@0: {
michael@0:   for (var fn1 in UNICODE_FILES)
michael@0:   {
michael@0:     var fin = getBinaryInputStream(fn1);
michael@0:     var ss = StorageStream();
michael@0: 
michael@0:     var bos = new BOS(ss.getOutputStream(0));
michael@0:     var av;
michael@0:     while ((av = fin.available()) > 0)
michael@0:     {
michael@0:       var data = fin.readByteArray(av);
michael@0:       bos.writeByteArray(data, data.length);
michael@0:     }
michael@0:     fin.close();
michael@0:     bos.close();
michael@0: 
michael@0:     for (var fn2 in UNICODE_FILES)
michael@0:     {
michael@0:       var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
michael@0:       var unichar = new CIS(ss.newInputStream(0),
michael@0:                             UNICODE_FILES[fn1], 8192, 0x0);
michael@0: 
michael@0:       if (!equalUnicharStreams(unichar, fin2))
michael@0:         do_throw("unequal streams: " +
michael@0:                  UNICODE_FILES[fn1] + ", " +
michael@0:                  UNICODE_FILES[fn2]);
michael@0:     }
michael@0:   }
michael@0: }
michael@0: 
michael@0: 
michael@0: // utility functions
michael@0: 
michael@0: function StorageStream()
michael@0: {
michael@0:   return new _SS(8192, Math.pow(2, 32) - 1, null);
michael@0: }
michael@0: 
michael@0: function getUnicharInputStream(filename, encoding)
michael@0: {
michael@0:   var file = dataDir.clone();
michael@0:   file.append(filename);
michael@0: 
michael@0:   const PR_RDONLY = 0x1;
michael@0:   var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
michael@0:   return new CIS(fis, encoding, 8192, 0x0);
michael@0: }
michael@0: 
michael@0: function getBinaryInputStream(filename, encoding)
michael@0: {
michael@0:   var file = dataDir.clone();
michael@0:   file.append(filename);
michael@0: 
michael@0:   const PR_RDONLY = 0x1;
michael@0:   var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
michael@0:   return new BIS(fis);
michael@0: }
michael@0: 
michael@0: function equal(stream, codePoints)
michael@0: {
michael@0:   var sz, currIndex = 0;
michael@0:   while (true)
michael@0:   {
michael@0:     var unit = stream.readUnit();
michael@0:     if (unit < 0)
michael@0:       return currIndex == codePoints.length;
michael@0:     if (unit !== codePoints[currIndex++])
michael@0:       return false;
michael@0:   }
michael@0: 
michael@0:   do_throw("not reached");
michael@0:   return false;
michael@0: }
michael@0: 
michael@0: function equalUnicharStreams(s1, s2)
michael@0: {
michael@0:   var r1, r2;
michael@0:   var str1 = {}, str2 = {};
michael@0:   while (true)
michael@0:   {
michael@0:     r1 = s1.readString(1024, str1);
michael@0:     r2 = s2.readString(1024, str2);
michael@0: 
michael@0:     if (r1 != r2 || str1.value != str2.value)
michael@0:     {
michael@0:       print("r1: " + r1 + ", r2: " + r2);
michael@0:       print(str1.value.length);
michael@0:       print(str2.value.length);
michael@0:       return false;
michael@0:     }
michael@0:     if (r1 == 0 && r2 == 0)
michael@0:       return true;
michael@0:   }
michael@0: 
michael@0:   // not reached
michael@0:   return false;
michael@0: }
michael@0: 
michael@0: function stringToCodePoints(str)
michael@0: {
michael@0:   return str.split('').map(function(v){ return v.charCodeAt(0); });
michael@0: }
michael@0: 
michael@0: function lowbits(n)
michael@0: {
michael@0:   return Math.pow(2, n) - 1;
michael@0: }
michael@0: 
michael@0: function Pipe()
michael@0: {
michael@0:   return new _Pipe(false, false, 1024, 10, null);
michael@0: }
michael@0: 
michael@0: 
michael@0: // complex charset readers
michael@0: 
michael@0: /**
michael@0:  * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
michael@0:  *
michael@0:  * @param stream
michael@0:  *   the stream to wrap
michael@0:  */
michael@0: function UTF8(stream)
michael@0: {
michael@0:   this._stream = new BIS(stream);
michael@0: }
michael@0: UTF8.prototype =
michael@0:   {
michael@0:     // returns numeric code point at front of stream encoded in UTF-8, -1 if at
michael@0:     // end of stream, or throws if valid (and properly encoded!) code point not
michael@0:     // found
michael@0:     readUnit: function()
michael@0:     {
michael@0:       var str = this._stream;
michael@0: 
michael@0:       var c, c2, c3, c4, rv;
michael@0: 
michael@0:       // if at end of stream, must distinguish failure to read any bytes
michael@0:       // (correct behavior) from failure to read some byte after the first
michael@0:       // in the character
michael@0:       try
michael@0:       {
michael@0:         c = str.read8();
michael@0:       }
michael@0:       catch (e)
michael@0:       {
michael@0:         return -1;
michael@0:       }
michael@0: 
michael@0:       if (c < 0x80)
michael@0:         return c;
michael@0: 
michael@0:       if (c < 0xC0) // c < 11000000
michael@0:       {
michael@0:         // byte doesn't have enough leading ones (must be at least two)
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       c2 = str.read8();
michael@0:       if (c2 >= 0xC0 || c2 < 0x80)
michael@0:         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
michael@0: 
michael@0:       if (c < 0xE0) // c < 11100000
michael@0:       {
michael@0:         // two-byte between U+000080 and U+0007FF
michael@0:         rv = ((lowbits(5) & c) << 6) +
michael@0:               (lowbits(6) & c2);
michael@0:         // no upper bounds-check needed, by previous lines
michael@0:         if (rv >= 0x80)
michael@0:           return rv;
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       c3 = str.read8();
michael@0:       if (c3 >= 0xC0 || c3 < 0x80)
michael@0:         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
michael@0: 
michael@0:       if (c < 0xF0) // c < 11110000
michael@0:       {
michael@0:         // three-byte between U+000800 and U+00FFFF
michael@0:         rv = ((lowbits(4) & c)  << 12) +
michael@0:              ((lowbits(6) & c2) <<  6) +
michael@0:               (lowbits(6) & c3);
michael@0:         // no upper bounds-check needed, by previous lines
michael@0:         if (rv >= 0xE000 ||
michael@0:             (rv >= 0x800 && rv <= 0xD7FF))
michael@0:           return rv;
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0: 
michael@0:       c4 = str.read8();
michael@0:       if (c4 >= 0xC0 || c4 < 0x80)
michael@0:         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
michael@0: 
michael@0:       if (c < 0xF8) // c < 11111000
michael@0:       {
michael@0:         // four-byte between U+010000 and U+10FFFF
michael@0:         rv = ((lowbits(3) & c)  << 18) +
michael@0:              ((lowbits(6) & c2) << 12) +
michael@0:              ((lowbits(6) & c3) <<  6) +
michael@0:               (lowbits(6) & c4);
michael@0:         // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
michael@0:         if (rv >= 0x10000 && rv <= 0x10FFFF)
michael@0:           return rv;
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0:       // 11111000 or greater -- no UTF-8 mapping
michael@0:       throw NS_ERROR_ILLEGAL_VALUE;
michael@0:     }
michael@0:   };
michael@0: 
michael@0: /**
michael@0:  * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
michael@0:  *
michael@0:  * @param stream
michael@0:  *   the stream to wrap
michael@0:  * @param bigEndian
michael@0:  *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
michael@0:  *   a byte-order mark
michael@0:  */
michael@0: function UTF16(stream, bigEndian)
michael@0: {
michael@0:   this._stream = new BIS(stream);
michael@0:   if (arguments.length > 1)
michael@0:   {
michael@0:     this._bigEndian = bigEndian;
michael@0:   }
michael@0:   else
michael@0:   {
michael@0:     var bom = this._stream.read16();
michael@0:     if (bom == 0xFEFF)
michael@0:       this._bigEndian = true;
michael@0:     else if (bom == 0xFFFE)
michael@0:       this._bigEndian = false;
michael@0:     else
michael@0:       do_throw("missing BOM: " + bom.toString(16).toUpperCase());
michael@0:   }
michael@0: }
michael@0: UTF16.prototype =
michael@0:   {
michael@0:     // returns numeric code point at front of stream encoded in UTF-16,
michael@0:     // -1 if at end of stream, or throws if UTF-16 code point not found
michael@0:     readUnit: function()
michael@0:     {
michael@0:       var str = this._stream;
michael@0: 
michael@0:       // if at end of stream, must distinguish failure to read any bytes
michael@0:       // (correct behavior) from failure to read some byte after the first
michael@0:       // in the character
michael@0:       try
michael@0:       {
michael@0:         var b1 = str.read8();
michael@0:       }
michael@0:       catch (e)
michael@0:       {
michael@0:         return -1;
michael@0:       }
michael@0: 
michael@0:       var b2 = str.read8();
michael@0: 
michael@0:       var w1 = this._bigEndian
michael@0:              ? (b1 << 8) + b2
michael@0:              : (b2 << 8) + b1;
michael@0: 
michael@0:       if (w1 > 0xDBFF && w1 < 0xE000)
michael@0:       {
michael@0:         // second surrogate, but expecting none or first
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0:       if (w1 > 0xD7FF && w1 < 0xDC00)
michael@0:       {
michael@0:         // non-BMP, use surrogate pair
michael@0:         b1 = str.read8();
michael@0:         b2 = str.read8();
michael@0:         var w2 = this._bigEndian
michael@0:                ? (b1 << 8) + b2
michael@0:                : (b2 << 8) + b1;
michael@0:         if (w2 < 0xDC00 || w2 > 0xDFFF)
michael@0:           throw NS_ERROR_ILLEGAL_VALUE;
michael@0: 
michael@0:         var rv = 0x100000 +
michael@0:                  ((lowbits(10) & w2) << 10) +
michael@0:                   (lowbits(10) & w1);
michael@0:         if (rv <= 0x10FFFF)
michael@0:           return rv;
michael@0:         throw NS_ERROR_ILLEGAL_VALUE;
michael@0:       }
michael@0: 
michael@0:       // non-surrogate
michael@0:       return w1;
michael@0:     }
michael@0:   };