intl/uconv/tests/unit/test_charset_conversion.js

Wed, 31 Dec 2014 07:22:50 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 07:22:50 +0100
branch
TOR_BUG_3246
changeset 4
fc2d59ddac77
permissions
-rw-r--r--

Correct previous dual key logic pending first delivery installment.

     1 const Cc = Components.classes;
     2 const Ci = Components.interfaces;
     4 const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;
     6 var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;
     8 var dataDir;
    10 function run_test()
    11 {
    12   BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
    13                                "nsIBinaryInputStream",
    14                                "setInputStream");
    15   BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
    16                                "nsIBinaryOutputStream",
    17                                "setOutputStream");
    18   _Pipe = Components.Constructor("@mozilla.org/pipe;1",
    19                                  "nsIPipe",
    20                                  "init");
    21   COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
    22                                "nsIConverterOutputStream",
    23                                "init");
    24   FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
    25                                "nsIFileInputStream",
    26                                "init");
    27   _SS = Components.Constructor("@mozilla.org/storagestream;1",
    28                                "nsIStorageStream",
    29                                "init");
    30   CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
    31                                "nsIConverterInputStream",
    32                                "init");
    34   dataDir = do_get_file("data/");
    36   test_utf8_1();
    37   test_utf16_1();
    38   test_utf16_2();
    39   test_utf16_3();
    40   test_cross_conversion();
    41 }
    43 const UNICODE_STRINGS =
    44   [
    45     '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',
    47     'AZaz09 \u007F ' +               // U+000000 to U+00007F
    48     '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
    49     '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF
    51     // there would be strings containing non-BMP code points here, but
    52     // unfortunately JS strings are UCS-2 (and worse yet are treated as
    53     // 16-bit values by the spec), so we have to do gymnastics to work
    54     // with non-BMP -- manual surrogate decoding doesn't work because
    55     // String.prototype.charCodeAt() ignores surrogate pairs and only
    56     // returns 16-bit values
    57   ];
    59 // test conversion equality -- keys are names of files containing equivalent
    60 // Unicode data, values are the encoding of the file in the format expected by
    61 // nsIConverter(In|Out)putStream.init
    62 const UNICODE_FILES =
    63   {
    64     "unicode-conversion.utf8.txt":            "UTF-8",
    65     "unicode-conversion.utf16.txt":           "UTF-16",
    66     "unicode-conversion.utf16le.txt":         "UTF-16LE",
    67     "unicode-conversion.utf16be.txt":         "UTF-16BE"
    68   };
    70 function test_utf8_1()
    71 {
    72   for (var i = 0; i < UNICODE_STRINGS.length; i++)
    73   {
    74     var pipe = Pipe();
    75     var conv = new COS(pipe.outputStream, "UTF-8", 1024, 0x0);
    76     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    77     conv.close();
    79     if (!equal(new UTF8(pipe.inputStream),
    80                stringToCodePoints(UNICODE_STRINGS[i])))
    81       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
    82   }
    83 }
    85 function test_utf16_1()
    86 {
    87   for (var i = 0; i < UNICODE_STRINGS.length; i++)
    88   {
    89     var pipe = Pipe();
    90     var conv = new COS(pipe.outputStream, "UTF-16", 1024, 0x0);
    91     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    92     conv.close();
    94     if (!equal(new UTF16(pipe.inputStream),
    95                stringToCodePoints(UNICODE_STRINGS[i])))
    96       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
    97   }
    98 }
   100 function test_utf16_2()
   101 {
   102   for (var i = 0; i < UNICODE_STRINGS.length; i++)
   103   {
   104     var pipe = Pipe();
   105     var conv = new COS(pipe.outputStream, "UTF-16LE", 1024, 0x0);
   106     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
   107     conv.close();
   109     if (!equal(new UTF16(pipe.inputStream, false),
   110                stringToCodePoints(UNICODE_STRINGS[i])))
   111       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
   112   }
   113 }
   115 function test_utf16_3()
   116 {
   117   for (var i = 0; i < UNICODE_STRINGS.length; i++)
   118   {
   119     var pipe = Pipe();
   120     var conv = new COS(pipe.outputStream, "UTF-16BE", 1024, 0x0);
   121     do_check_true(conv.writeString(UNICODE_STRINGS[i]));
   122     conv.close();
   124     if (!equal(new UTF16(pipe.inputStream, true),
   125                stringToCodePoints(UNICODE_STRINGS[i])))
   126       do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
   127   }
   128 }
   131 function test_cross_conversion()
   132 {
   133   for (var fn1 in UNICODE_FILES)
   134   {
   135     var fin = getBinaryInputStream(fn1);
   136     var ss = StorageStream();
   138     var bos = new BOS(ss.getOutputStream(0));
   139     var av;
   140     while ((av = fin.available()) > 0)
   141     {
   142       var data = fin.readByteArray(av);
   143       bos.writeByteArray(data, data.length);
   144     }
   145     fin.close();
   146     bos.close();
   148     for (var fn2 in UNICODE_FILES)
   149     {
   150       var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
   151       var unichar = new CIS(ss.newInputStream(0),
   152                             UNICODE_FILES[fn1], 8192, 0x0);
   154       if (!equalUnicharStreams(unichar, fin2))
   155         do_throw("unequal streams: " +
   156                  UNICODE_FILES[fn1] + ", " +
   157                  UNICODE_FILES[fn2]);
   158     }
   159   }
   160 }
   163 // utility functions
   165 function StorageStream()
   166 {
   167   return new _SS(8192, Math.pow(2, 32) - 1, null);
   168 }
   170 function getUnicharInputStream(filename, encoding)
   171 {
   172   var file = dataDir.clone();
   173   file.append(filename);
   175   const PR_RDONLY = 0x1;
   176   var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   177   return new CIS(fis, encoding, 8192, 0x0);
   178 }
   180 function getBinaryInputStream(filename, encoding)
   181 {
   182   var file = dataDir.clone();
   183   file.append(filename);
   185   const PR_RDONLY = 0x1;
   186   var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   187   return new BIS(fis);
   188 }
   190 function equal(stream, codePoints)
   191 {
   192   var sz, currIndex = 0;
   193   while (true)
   194   {
   195     var unit = stream.readUnit();
   196     if (unit < 0)
   197       return currIndex == codePoints.length;
   198     if (unit !== codePoints[currIndex++])
   199       return false;
   200   }
   202   do_throw("not reached");
   203   return false;
   204 }
   206 function equalUnicharStreams(s1, s2)
   207 {
   208   var r1, r2;
   209   var str1 = {}, str2 = {};
   210   while (true)
   211   {
   212     r1 = s1.readString(1024, str1);
   213     r2 = s2.readString(1024, str2);
   215     if (r1 != r2 || str1.value != str2.value)
   216     {
   217       print("r1: " + r1 + ", r2: " + r2);
   218       print(str1.value.length);
   219       print(str2.value.length);
   220       return false;
   221     }
   222     if (r1 == 0 && r2 == 0)
   223       return true;
   224   }
   226   // not reached
   227   return false;
   228 }
   230 function stringToCodePoints(str)
   231 {
   232   return str.split('').map(function(v){ return v.charCodeAt(0); });
   233 }
   235 function lowbits(n)
   236 {
   237   return Math.pow(2, n) - 1;
   238 }
   240 function Pipe()
   241 {
   242   return new _Pipe(false, false, 1024, 10, null);
   243 }
   246 // complex charset readers
   248 /**
   249  * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
   250  *
   251  * @param stream
   252  *   the stream to wrap
   253  */
   254 function UTF8(stream)
   255 {
   256   this._stream = new BIS(stream);
   257 }
   258 UTF8.prototype =
   259   {
   260     // returns numeric code point at front of stream encoded in UTF-8, -1 if at
   261     // end of stream, or throws if valid (and properly encoded!) code point not
   262     // found
   263     readUnit: function()
   264     {
   265       var str = this._stream;
   267       var c, c2, c3, c4, rv;
   269       // if at end of stream, must distinguish failure to read any bytes
   270       // (correct behavior) from failure to read some byte after the first
   271       // in the character
   272       try
   273       {
   274         c = str.read8();
   275       }
   276       catch (e)
   277       {
   278         return -1;
   279       }
   281       if (c < 0x80)
   282         return c;
   284       if (c < 0xC0) // c < 11000000
   285       {
   286         // byte doesn't have enough leading ones (must be at least two)
   287         throw NS_ERROR_ILLEGAL_VALUE;
   288       }
   291       c2 = str.read8();
   292       if (c2 >= 0xC0 || c2 < 0x80)
   293         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   295       if (c < 0xE0) // c < 11100000
   296       {
   297         // two-byte between U+000080 and U+0007FF
   298         rv = ((lowbits(5) & c) << 6) +
   299               (lowbits(6) & c2);
   300         // no upper bounds-check needed, by previous lines
   301         if (rv >= 0x80)
   302           return rv;
   303         throw NS_ERROR_ILLEGAL_VALUE;
   304       }
   307       c3 = str.read8();
   308       if (c3 >= 0xC0 || c3 < 0x80)
   309         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   311       if (c < 0xF0) // c < 11110000
   312       {
   313         // three-byte between U+000800 and U+00FFFF
   314         rv = ((lowbits(4) & c)  << 12) +
   315              ((lowbits(6) & c2) <<  6) +
   316               (lowbits(6) & c3);
   317         // no upper bounds-check needed, by previous lines
   318         if (rv >= 0xE000 ||
   319             (rv >= 0x800 && rv <= 0xD7FF))
   320           return rv;
   321         throw NS_ERROR_ILLEGAL_VALUE;
   322       }
   325       c4 = str.read8();
   326       if (c4 >= 0xC0 || c4 < 0x80)
   327         throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   329       if (c < 0xF8) // c < 11111000
   330       {
   331         // four-byte between U+010000 and U+10FFFF
   332         rv = ((lowbits(3) & c)  << 18) +
   333              ((lowbits(6) & c2) << 12) +
   334              ((lowbits(6) & c3) <<  6) +
   335               (lowbits(6) & c4);
   336         // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
   337         if (rv >= 0x10000 && rv <= 0x10FFFF)
   338           return rv;
   339         throw NS_ERROR_ILLEGAL_VALUE;
   340       }
   342       // 11111000 or greater -- no UTF-8 mapping
   343       throw NS_ERROR_ILLEGAL_VALUE;
   344     }
   345   };
   347 /**
   348  * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
   349  *
   350  * @param stream
   351  *   the stream to wrap
   352  * @param bigEndian
   353  *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
   354  *   a byte-order mark
   355  */
   356 function UTF16(stream, bigEndian)
   357 {
   358   this._stream = new BIS(stream);
   359   if (arguments.length > 1)
   360   {
   361     this._bigEndian = bigEndian;
   362   }
   363   else
   364   {
   365     var bom = this._stream.read16();
   366     if (bom == 0xFEFF)
   367       this._bigEndian = true;
   368     else if (bom == 0xFFFE)
   369       this._bigEndian = false;
   370     else
   371       do_throw("missing BOM: " + bom.toString(16).toUpperCase());
   372   }
   373 }
   374 UTF16.prototype =
   375   {
   376     // returns numeric code point at front of stream encoded in UTF-16,
   377     // -1 if at end of stream, or throws if UTF-16 code point not found
   378     readUnit: function()
   379     {
   380       var str = this._stream;
   382       // if at end of stream, must distinguish failure to read any bytes
   383       // (correct behavior) from failure to read some byte after the first
   384       // in the character
   385       try
   386       {
   387         var b1 = str.read8();
   388       }
   389       catch (e)
   390       {
   391         return -1;
   392       }
   394       var b2 = str.read8();
   396       var w1 = this._bigEndian
   397              ? (b1 << 8) + b2
   398              : (b2 << 8) + b1;
   400       if (w1 > 0xDBFF && w1 < 0xE000)
   401       {
   402         // second surrogate, but expecting none or first
   403         throw NS_ERROR_ILLEGAL_VALUE;
   404       }
   406       if (w1 > 0xD7FF && w1 < 0xDC00)
   407       {
   408         // non-BMP, use surrogate pair
   409         b1 = str.read8();
   410         b2 = str.read8();
   411         var w2 = this._bigEndian
   412                ? (b1 << 8) + b2
   413                : (b2 << 8) + b1;
   414         if (w2 < 0xDC00 || w2 > 0xDFFF)
   415           throw NS_ERROR_ILLEGAL_VALUE;
   417         var rv = 0x100000 +
   418                  ((lowbits(10) & w2) << 10) +
   419                   (lowbits(10) & w1);
   420         if (rv <= 0x10FFFF)
   421           return rv;
   422         throw NS_ERROR_ILLEGAL_VALUE;
   423       }
   425       // non-surrogate
   426       return w1;
   427     }
   428   };

mercurial