intl/uconv/tests/unit/test_charset_conversion.js

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/intl/uconv/tests/unit/test_charset_conversion.js	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,428 @@
     1.4 +const Cc = Components.classes;
     1.5 +const Ci = Components.interfaces;
     1.6 +
     1.7 +const NS_ERROR_ILLEGAL_VALUE = Components.results.NS_ERROR_ILLEGAL_VALUE;
     1.8 +
     1.9 +var BIS, BOS, _Pipe, COS, FIS, _SS, CIS;
    1.10 +
    1.11 +var dataDir;
    1.12 +
    1.13 +function run_test()
    1.14 +{
    1.15 +  BIS = Components.Constructor("@mozilla.org/binaryinputstream;1",
    1.16 +                               "nsIBinaryInputStream",
    1.17 +                               "setInputStream");
    1.18 +  BOS = Components.Constructor("@mozilla.org/binaryoutputstream;1",
    1.19 +                               "nsIBinaryOutputStream",
    1.20 +                               "setOutputStream");
    1.21 +  _Pipe = Components.Constructor("@mozilla.org/pipe;1",
    1.22 +                                 "nsIPipe",
    1.23 +                                 "init");
    1.24 +  COS = Components.Constructor("@mozilla.org/intl/converter-output-stream;1",
    1.25 +                               "nsIConverterOutputStream",
    1.26 +                               "init");
    1.27 +  FIS = Components.Constructor("@mozilla.org/network/file-input-stream;1",
    1.28 +                               "nsIFileInputStream",
    1.29 +                               "init");
    1.30 +  _SS = Components.Constructor("@mozilla.org/storagestream;1",
    1.31 +                               "nsIStorageStream",
    1.32 +                               "init");
    1.33 +  CIS = Components.Constructor("@mozilla.org/intl/converter-input-stream;1",
    1.34 +                               "nsIConverterInputStream",
    1.35 +                               "init");
    1.36 +
    1.37 +  dataDir = do_get_file("data/");
    1.38 +
    1.39 +  test_utf8_1();
    1.40 +  test_utf16_1();
    1.41 +  test_utf16_2();
    1.42 +  test_utf16_3();
    1.43 +  test_cross_conversion();
    1.44 +}
    1.45 +
    1.46 +const UNICODE_STRINGS =
    1.47 +  [
    1.48 +    '\u00BD + \u00BE == \u00BD\u00B2 + \u00BC + \u00BE',
    1.49 +
    1.50 +    'AZaz09 \u007F ' +               // U+000000 to U+00007F
    1.51 +    '\u0080 \u0398 \u03BB \u0725 ' + // U+000080 to U+0007FF
    1.52 +    '\u0964 \u0F5F \u20AC \uFFFB'    // U+000800 to U+00FFFF
    1.53 +
    1.54 +    // there would be strings containing non-BMP code points here, but
    1.55 +    // unfortunately JS strings are UCS-2 (and worse yet are treated as
    1.56 +    // 16-bit values by the spec), so we have to do gymnastics to work
    1.57 +    // with non-BMP -- manual surrogate decoding doesn't work because
    1.58 +    // String.prototype.charCodeAt() ignores surrogate pairs and only
    1.59 +    // returns 16-bit values
    1.60 +  ];
    1.61 +
    1.62 +// test conversion equality -- keys are names of files containing equivalent
    1.63 +// Unicode data, values are the encoding of the file in the format expected by
    1.64 +// nsIConverter(In|Out)putStream.init
    1.65 +const UNICODE_FILES =
    1.66 +  {
    1.67 +    "unicode-conversion.utf8.txt":            "UTF-8",
    1.68 +    "unicode-conversion.utf16.txt":           "UTF-16",
    1.69 +    "unicode-conversion.utf16le.txt":         "UTF-16LE",
    1.70 +    "unicode-conversion.utf16be.txt":         "UTF-16BE"
    1.71 +  };
    1.72 +
    1.73 +function test_utf8_1()
    1.74 +{
    1.75 +  for (var i = 0; i < UNICODE_STRINGS.length; i++)
    1.76 +  {
    1.77 +    var pipe = Pipe();
    1.78 +    var conv = new COS(pipe.outputStream, "UTF-8", 1024, 0x0);
    1.79 +    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    1.80 +    conv.close();
    1.81 +
    1.82 +    if (!equal(new UTF8(pipe.inputStream),
    1.83 +               stringToCodePoints(UNICODE_STRINGS[i])))
    1.84 +      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
    1.85 +  }
    1.86 +}
    1.87 +
    1.88 +function test_utf16_1()
    1.89 +{
    1.90 +  for (var i = 0; i < UNICODE_STRINGS.length; i++)
    1.91 +  {
    1.92 +    var pipe = Pipe();
    1.93 +    var conv = new COS(pipe.outputStream, "UTF-16", 1024, 0x0);
    1.94 +    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
    1.95 +    conv.close();
    1.96 +
    1.97 +    if (!equal(new UTF16(pipe.inputStream),
    1.98 +               stringToCodePoints(UNICODE_STRINGS[i])))
    1.99 +      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
   1.100 +  }
   1.101 +}
   1.102 +
   1.103 +function test_utf16_2()
   1.104 +{
   1.105 +  for (var i = 0; i < UNICODE_STRINGS.length; i++)
   1.106 +  {
   1.107 +    var pipe = Pipe();
   1.108 +    var conv = new COS(pipe.outputStream, "UTF-16LE", 1024, 0x0);
   1.109 +    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
   1.110 +    conv.close();
   1.111 +
   1.112 +    if (!equal(new UTF16(pipe.inputStream, false),
   1.113 +               stringToCodePoints(UNICODE_STRINGS[i])))
   1.114 +      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
   1.115 +  }
   1.116 +}
   1.117 +
   1.118 +function test_utf16_3()
   1.119 +{
   1.120 +  for (var i = 0; i < UNICODE_STRINGS.length; i++)
   1.121 +  {
   1.122 +    var pipe = Pipe();
   1.123 +    var conv = new COS(pipe.outputStream, "UTF-16BE", 1024, 0x0);
   1.124 +    do_check_true(conv.writeString(UNICODE_STRINGS[i]));
   1.125 +    conv.close();
   1.126 +
   1.127 +    if (!equal(new UTF16(pipe.inputStream, true),
   1.128 +               stringToCodePoints(UNICODE_STRINGS[i])))
   1.129 +      do_throw("UNICODE_STRINGS[" + i + "] not handled correctly");
   1.130 +  }
   1.131 +}
   1.132 +
   1.133 +
   1.134 +function test_cross_conversion()
   1.135 +{
   1.136 +  for (var fn1 in UNICODE_FILES)
   1.137 +  {
   1.138 +    var fin = getBinaryInputStream(fn1);
   1.139 +    var ss = StorageStream();
   1.140 +
   1.141 +    var bos = new BOS(ss.getOutputStream(0));
   1.142 +    var av;
   1.143 +    while ((av = fin.available()) > 0)
   1.144 +    {
   1.145 +      var data = fin.readByteArray(av);
   1.146 +      bos.writeByteArray(data, data.length);
   1.147 +    }
   1.148 +    fin.close();
   1.149 +    bos.close();
   1.150 +
   1.151 +    for (var fn2 in UNICODE_FILES)
   1.152 +    {
   1.153 +      var fin2 = getUnicharInputStream(fn2, UNICODE_FILES[fn2]);
   1.154 +      var unichar = new CIS(ss.newInputStream(0),
   1.155 +                            UNICODE_FILES[fn1], 8192, 0x0);
   1.156 +
   1.157 +      if (!equalUnicharStreams(unichar, fin2))
   1.158 +        do_throw("unequal streams: " +
   1.159 +                 UNICODE_FILES[fn1] + ", " +
   1.160 +                 UNICODE_FILES[fn2]);
   1.161 +    }
   1.162 +  }
   1.163 +}
   1.164 +
   1.165 +
   1.166 +// utility functions
   1.167 +
   1.168 +function StorageStream()
   1.169 +{
   1.170 +  return new _SS(8192, Math.pow(2, 32) - 1, null);
   1.171 +}
   1.172 +
   1.173 +function getUnicharInputStream(filename, encoding)
   1.174 +{
   1.175 +  var file = dataDir.clone();
   1.176 +  file.append(filename);
   1.177 +
   1.178 +  const PR_RDONLY = 0x1;
   1.179 +  var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   1.180 +  return new CIS(fis, encoding, 8192, 0x0);
   1.181 +}
   1.182 +
   1.183 +function getBinaryInputStream(filename, encoding)
   1.184 +{
   1.185 +  var file = dataDir.clone();
   1.186 +  file.append(filename);
   1.187 +
   1.188 +  const PR_RDONLY = 0x1;
   1.189 +  var fis = new FIS(file, PR_RDONLY, 0644, Ci.nsIFileInputStream.CLOSE_ON_EOF);
   1.190 +  return new BIS(fis);
   1.191 +}
   1.192 +
   1.193 +function equal(stream, codePoints)
   1.194 +{
   1.195 +  var sz, currIndex = 0;
   1.196 +  while (true)
   1.197 +  {
   1.198 +    var unit = stream.readUnit();
   1.199 +    if (unit < 0)
   1.200 +      return currIndex == codePoints.length;
   1.201 +    if (unit !== codePoints[currIndex++])
   1.202 +      return false;
   1.203 +  }
   1.204 +
   1.205 +  do_throw("not reached");
   1.206 +  return false;
   1.207 +}
   1.208 +
   1.209 +function equalUnicharStreams(s1, s2)
   1.210 +{
   1.211 +  var r1, r2;
   1.212 +  var str1 = {}, str2 = {};
   1.213 +  while (true)
   1.214 +  {
   1.215 +    r1 = s1.readString(1024, str1);
   1.216 +    r2 = s2.readString(1024, str2);
   1.217 +
   1.218 +    if (r1 != r2 || str1.value != str2.value)
   1.219 +    {
   1.220 +      print("r1: " + r1 + ", r2: " + r2);
   1.221 +      print(str1.value.length);
   1.222 +      print(str2.value.length);
   1.223 +      return false;
   1.224 +    }
   1.225 +    if (r1 == 0 && r2 == 0)
   1.226 +      return true;
   1.227 +  }
   1.228 +
   1.229 +  // not reached
   1.230 +  return false;
   1.231 +}
   1.232 +
   1.233 +function stringToCodePoints(str)
   1.234 +{
   1.235 +  return str.split('').map(function(v){ return v.charCodeAt(0); });
   1.236 +}
   1.237 +
   1.238 +function lowbits(n)
   1.239 +{
   1.240 +  return Math.pow(2, n) - 1;
   1.241 +}
   1.242 +
   1.243 +function Pipe()
   1.244 +{
   1.245 +  return new _Pipe(false, false, 1024, 10, null);
   1.246 +}
   1.247 +
   1.248 +
   1.249 +// complex charset readers
   1.250 +
   1.251 +/**
   1.252 + * Wraps a UTF-8 stream to allow access to the Unicode code points in it.
   1.253 + *
   1.254 + * @param stream
   1.255 + *   the stream to wrap
   1.256 + */
   1.257 +function UTF8(stream)
   1.258 +{
   1.259 +  this._stream = new BIS(stream);
   1.260 +}
   1.261 +UTF8.prototype =
   1.262 +  {
   1.263 +    // returns numeric code point at front of stream encoded in UTF-8, -1 if at
   1.264 +    // end of stream, or throws if valid (and properly encoded!) code point not
   1.265 +    // found
   1.266 +    readUnit: function()
   1.267 +    {
   1.268 +      var str = this._stream;
   1.269 +
   1.270 +      var c, c2, c3, c4, rv;
   1.271 +
   1.272 +      // if at end of stream, must distinguish failure to read any bytes
   1.273 +      // (correct behavior) from failure to read some byte after the first
   1.274 +      // in the character
   1.275 +      try
   1.276 +      {
   1.277 +        c = str.read8();
   1.278 +      }
   1.279 +      catch (e)
   1.280 +      {
   1.281 +        return -1;
   1.282 +      }
   1.283 +
   1.284 +      if (c < 0x80)
   1.285 +        return c;
   1.286 +
   1.287 +      if (c < 0xC0) // c < 11000000
   1.288 +      {
   1.289 +        // byte doesn't have enough leading ones (must be at least two)
   1.290 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.291 +      }
   1.292 +
   1.293 +
   1.294 +      c2 = str.read8();
   1.295 +      if (c2 >= 0xC0 || c2 < 0x80)
   1.296 +        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   1.297 +
   1.298 +      if (c < 0xE0) // c < 11100000
   1.299 +      {
   1.300 +        // two-byte between U+000080 and U+0007FF
   1.301 +        rv = ((lowbits(5) & c) << 6) +
   1.302 +              (lowbits(6) & c2);
   1.303 +        // no upper bounds-check needed, by previous lines
   1.304 +        if (rv >= 0x80)
   1.305 +          return rv;
   1.306 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.307 +      }
   1.308 +
   1.309 +
   1.310 +      c3 = str.read8();
   1.311 +      if (c3 >= 0xC0 || c3 < 0x80)
   1.312 +        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   1.313 +
   1.314 +      if (c < 0xF0) // c < 11110000
   1.315 +      {
   1.316 +        // three-byte between U+000800 and U+00FFFF
   1.317 +        rv = ((lowbits(4) & c)  << 12) +
   1.318 +             ((lowbits(6) & c2) <<  6) +
   1.319 +              (lowbits(6) & c3);
   1.320 +        // no upper bounds-check needed, by previous lines
   1.321 +        if (rv >= 0xE000 ||
   1.322 +            (rv >= 0x800 && rv <= 0xD7FF))
   1.323 +          return rv;
   1.324 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.325 +      }
   1.326 +
   1.327 +
   1.328 +      c4 = str.read8();
   1.329 +      if (c4 >= 0xC0 || c4 < 0x80)
   1.330 +        throw NS_ERROR_ILLEGAL_VALUE; // not 10xxxxxx
   1.331 +
   1.332 +      if (c < 0xF8) // c < 11111000
   1.333 +      {
   1.334 +        // four-byte between U+010000 and U+10FFFF
   1.335 +        rv = ((lowbits(3) & c)  << 18) +
   1.336 +             ((lowbits(6) & c2) << 12) +
   1.337 +             ((lowbits(6) & c3) <<  6) +
   1.338 +              (lowbits(6) & c4);
   1.339 +        // need an upper bounds-check since 0x10FFFF isn't (2**n - 1)
   1.340 +        if (rv >= 0x10000 && rv <= 0x10FFFF)
   1.341 +          return rv;
   1.342 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.343 +      }
   1.344 +
   1.345 +      // 11111000 or greater -- no UTF-8 mapping
   1.346 +      throw NS_ERROR_ILLEGAL_VALUE;
   1.347 +    }
   1.348 +  };
   1.349 +
   1.350 +/**
   1.351 + * Wraps a UTF-16 stream to allow access to the Unicode code points in it.
   1.352 + *
   1.353 + * @param stream
   1.354 + *   the stream to wrap
   1.355 + * @param bigEndian
   1.356 + *   true for UTF-16BE, false for UTF-16LE, not present at all for UTF-16 with
   1.357 + *   a byte-order mark
   1.358 + */
   1.359 +function UTF16(stream, bigEndian)
   1.360 +{
   1.361 +  this._stream = new BIS(stream);
   1.362 +  if (arguments.length > 1)
   1.363 +  {
   1.364 +    this._bigEndian = bigEndian;
   1.365 +  }
   1.366 +  else
   1.367 +  {
   1.368 +    var bom = this._stream.read16();
   1.369 +    if (bom == 0xFEFF)
   1.370 +      this._bigEndian = true;
   1.371 +    else if (bom == 0xFFFE)
   1.372 +      this._bigEndian = false;
   1.373 +    else
   1.374 +      do_throw("missing BOM: " + bom.toString(16).toUpperCase());
   1.375 +  }
   1.376 +}
   1.377 +UTF16.prototype =
   1.378 +  {
   1.379 +    // returns numeric code point at front of stream encoded in UTF-16,
   1.380 +    // -1 if at end of stream, or throws if UTF-16 code point not found
   1.381 +    readUnit: function()
   1.382 +    {
   1.383 +      var str = this._stream;
   1.384 +
   1.385 +      // if at end of stream, must distinguish failure to read any bytes
   1.386 +      // (correct behavior) from failure to read some byte after the first
   1.387 +      // in the character
   1.388 +      try
   1.389 +      {
   1.390 +        var b1 = str.read8();
   1.391 +      }
   1.392 +      catch (e)
   1.393 +      {
   1.394 +        return -1;
   1.395 +      }
   1.396 +
   1.397 +      var b2 = str.read8();
   1.398 +
   1.399 +      var w1 = this._bigEndian
   1.400 +             ? (b1 << 8) + b2
   1.401 +             : (b2 << 8) + b1;
   1.402 +
   1.403 +      if (w1 > 0xDBFF && w1 < 0xE000)
   1.404 +      {
   1.405 +        // second surrogate, but expecting none or first
   1.406 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.407 +      }
   1.408 +
   1.409 +      if (w1 > 0xD7FF && w1 < 0xDC00)
   1.410 +      {
   1.411 +        // non-BMP, use surrogate pair
   1.412 +        b1 = str.read8();
   1.413 +        b2 = str.read8();
   1.414 +        var w2 = this._bigEndian
   1.415 +               ? (b1 << 8) + b2
   1.416 +               : (b2 << 8) + b1;
   1.417 +        if (w2 < 0xDC00 || w2 > 0xDFFF)
   1.418 +          throw NS_ERROR_ILLEGAL_VALUE;
   1.419 +
   1.420 +        var rv = 0x100000 +
   1.421 +                 ((lowbits(10) & w2) << 10) +
   1.422 +                  (lowbits(10) & w1);
   1.423 +        if (rv <= 0x10FFFF)
   1.424 +          return rv;
   1.425 +        throw NS_ERROR_ILLEGAL_VALUE;
   1.426 +      }
   1.427 +
   1.428 +      // non-surrogate
   1.429 +      return w1;
   1.430 +    }
   1.431 +  };

mercurial