dom/encoding/test/unit/test_utf.js

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/dom/encoding/test/unit/test_utf.js	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,166 @@
     1.4 +// NOTE: Requires testharness.js
     1.5 +// http://www.w3.org/2008/webapps/wiki/Harness
     1.6 +
     1.7 +// Extension to testharness.js API which avoids logging enormous strings
     1.8 +// on a coding failure.
     1.9 +function assert_string_equals(actual, expected, description) {
    1.10 +  // short circuit success case
    1.11 +  if (actual === expected) {
    1.12 +    assert_true(true, description + ": <actual> === <expected>");
    1.13 +    return;
    1.14 +  }
    1.15 +
    1.16 +  // length check
    1.17 +  assert_equals(actual.length, expected.length,
    1.18 +                description + ": string lengths")
    1.19 +
    1.20 +  var i, a, b;
    1.21 +  for (i = 0; i < actual.length; i++) {
    1.22 +    a = actual.charCodeAt(i);
    1.23 +    b = expected.charCodeAt(i);
    1.24 +    if (a !== b)
    1.25 +      assert_true(false,
    1.26 +                  description +
    1.27 +                  ": code unit " + i.toString() + " unequal: " +
    1.28 +                  cpname(a) + " != " + cpname(b)); // doesn't return
    1.29 +  }
    1.30 +
    1.31 +  // It should be impossible to get here, because the initial
    1.32 +  // comparison failed, so either the length comparison or the
    1.33 +  // codeunit-by-codeunit comparison should also fail.
    1.34 +  assert_true(false, description + ": failed to detect string difference");
    1.35 +}
    1.36 +
    1.37 +// Inspired by:
    1.38 +// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
    1.39 +function encode_utf8(string) {
    1.40 +  var utf8 = unescape(encodeURIComponent(string));
    1.41 +  var octets = new Uint8Array(utf8.length), i;
    1.42 +  for (i = 0; i < utf8.length; i += 1) {
    1.43 +    octets[i] = utf8.charCodeAt(i);
    1.44 +  }
    1.45 +  return octets;
    1.46 +}
    1.47 +
    1.48 +function decode_utf8(octets) {
    1.49 +  var utf8 = String.fromCharCode.apply(null, octets);
    1.50 +  return decodeURIComponent(escape(utf8));
    1.51 +}
    1.52 +
    1.53 +// Helpers for test_utf_roundtrip.
    1.54 +function cpname(n) {
    1.55 +  if (n+0 !== n)
    1.56 +    return n.toString();
    1.57 +  var w = (n <= 0xFFFF) ? 4 : 6;
    1.58 +  return 'U+' + ('000000' + n.toString(16).toUpperCase()).slice(-w);
    1.59 +}
    1.60 +
    1.61 +function genblock(from, len) {
    1.62 +  var i, j, point, offset;
    1.63 +  var size, block;
    1.64 +
    1.65 +  // determine size required:
    1.66 +  //    1 unit   for each point from U+000000 through U+00D7FF
    1.67 +  //    0 units                      U+00D800 through U+00DFFF
    1.68 +  //    1 unit                       U+00E000 through U+00FFFF
    1.69 +  //    2 units                      U+010000 through U+10FFFF
    1.70 +  function overlap(min1, max1, min2, max2) {
    1.71 +    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
    1.72 +  }
    1.73 +  size = (overlap(from, from+len, 0x000000, 0x00D800) +
    1.74 +          overlap(from, from+len, 0x00E000, 0x010000) +
    1.75 +          overlap(from, from+len, 0x010000, 0x110000)*2);
    1.76 +
    1.77 +  block = new Uint16Array(size);
    1.78 +  for (i = 0, j = 0; i < len; i++) {
    1.79 +    point = from + i;
    1.80 +    if (0xD800 <= point && point <= 0xDFFF)
    1.81 +      continue;
    1.82 +    else if (point <= 0xFFFF)
    1.83 +      block[j++] = point;
    1.84 +    else {
    1.85 +      offset = point - 0x10000;
    1.86 +      block[j++] = 0xD800 + (offset >> 10);
    1.87 +      block[j++] = 0xDC00 + (offset & 0x3FF);
    1.88 +    }
    1.89 +  }
    1.90 +  return String.fromCharCode.apply(null, block);
    1.91 +}
    1.92 +
    1.93 +function test_utf_roundtrip () {
    1.94 +  var MIN_CODEPOINT = 0;
    1.95 +  var MAX_CODEPOINT = 0x10FFFF;
    1.96 +  var BLOCK_SIZE = 0x1000;
    1.97 +
    1.98 +  var block, block_tag, i, j, encoded, decoded, exp_encoded, exp_decoded;
    1.99 +
   1.100 +  var TE_U16LE = new TextEncoder("UTF-16LE");
   1.101 +  var TD_U16LE = new TextDecoder("UTF-16LE");
   1.102 +
   1.103 +  var TE_U16BE = new TextEncoder("UTF-16BE");
   1.104 +  var TD_U16BE = new TextDecoder("UTF-16BE");
   1.105 +
   1.106 +  var TE_U8    = new TextEncoder("UTF-8");
   1.107 +  var TD_U8    = new TextDecoder("UTF-8");
   1.108 +
   1.109 +  for (i = MIN_CODEPOINT; i < MAX_CODEPOINT; i += BLOCK_SIZE) {
   1.110 +    block_tag = cpname(i) + " - " + cpname(i + BLOCK_SIZE - 1);
   1.111 +    block = genblock(i, BLOCK_SIZE);
   1.112 +
   1.113 +    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
   1.114 +    encoded = TE_U16LE.encode(block);
   1.115 +    decoded = TD_U16LE.decode(encoded);
   1.116 +    assert_string_equals(block, decoded, "UTF-16LE round trip " + block_tag);
   1.117 +
   1.118 +    encoded = TE_U16BE.encode(block);
   1.119 +    decoded = TD_U16BE.decode(encoded);
   1.120 +    assert_string_equals(block, decoded, "UTF-16BE round trip " + block_tag);
   1.121 +
   1.122 +    encoded = TE_U8.encode(block);
   1.123 +    decoded = TD_U8.decode(encoded);
   1.124 +    assert_string_equals(block, decoded, "UTF-8 round trip " + block_tag);
   1.125 +
   1.126 +    // test TextEncoder(UTF-8) against the older idiom
   1.127 +    exp_encoded = encode_utf8(block);
   1.128 +    assert_array_equals(encoded, exp_encoded,
   1.129 +                        "UTF-8 reference encoding " + block_tag);
   1.130 +
   1.131 +    exp_decoded = decode_utf8(exp_encoded);
   1.132 +    assert_string_equals(decoded, exp_decoded,
   1.133 +                         "UTF-8 reference decoding " + block_tag);
   1.134 +  }
   1.135 +}
   1.136 +
   1.137 +function test_utf_samples () {
   1.138 +  // z, cent, CJK water, G-Clef, Private-use character
   1.139 +  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";
   1.140 +  var cases = [
   1.141 +    { encoding: "utf-8",
   1.142 +      expected: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD] },
   1.143 +    { encoding: "utf-16le",
   1.144 +      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
   1.145 +    { encoding: "utf-16",
   1.146 +      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
   1.147 +    { encoding: "utf-16be",
   1.148 +      expected: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD] }
   1.149 +  ];
   1.150 +
   1.151 +  cases.forEach(
   1.152 +    function(t) {
   1.153 +      var encoded = new TextEncoder(t.encoding).encode(sample);
   1.154 +      assert_array_equals(encoded, t.expected,
   1.155 +                          "expected equal encodings - " + t.encoding);
   1.156 +
   1.157 +      var decoded = new TextDecoder(t.encoding)
   1.158 +                        .decode(new Uint8Array(t.expected));
   1.159 +      assert_equals(decoded, sample,
   1.160 +                    "expected equal decodings - " + t.encoding);
   1.161 +    });
   1.162 +}
   1.163 +
   1.164 +test(test_utf_samples,
   1.165 +     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample");
   1.166 +
   1.167 +test(test_utf_roundtrip,
   1.168 +     "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and "+
   1.169 +     "agreement with encode/decodeURIComponent");

mercurial