dom/encoding/test/unit/test_utf.js

Fri, 16 Jan 2015 18:13:44 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Fri, 16 Jan 2015 18:13:44 +0100
branch
TOR_BUG_9701
changeset 14
925c144e1f1f
permissions
-rw-r--r--

Integrate suggestion from review to improve consistency with existing code.

// NOTE: Requires testharness.js; see
// http://www.w3.org/2008/webapps/wiki/Harness
michael@0 3
// Extension to the testharness.js API: compares two strings without ever
// placing the strings themselves in an assertion message, so that a mismatch
// between enormous strings does not flood the log.
//
//   actual, expected - the strings to compare
//   description      - prefix used for every assertion message
function assert_string_equals(actual, expected, description) {
  var index, actualUnit, expectedUnit;

  // Fast path: identical strings are recorded as a single passing assertion.
  if (actual === expected) {
    assert_true(true, description + ": <actual> === <expected>");
    return;
  }

  // The strings differ; a difference in length is reported first.
  assert_equals(actual.length, expected.length,
                description + ": string lengths");

  // Otherwise report the code unit at which the strings diverge.
  index = 0;
  while (index < actual.length) {
    actualUnit = actual.charCodeAt(index);
    expectedUnit = expected.charCodeAt(index);
    if (actualUnit !== expectedUnit) {
      // A failing assert_true is not expected to return.
      assert_true(false,
                  description +
                  ": code unit " + index + " unequal: " +
                  cpname(actualUnit) + " != " + cpname(expectedUnit));
    }
    index++;
  }

  // Reaching this point should be impossible: the strict comparison above
  // failed, so either the length check or the per-code-unit scan should
  // have failed as well.
  assert_true(false, description + ": failed to detect string difference");
}
michael@0 33
// Reference UTF-8 encoder built on the legacy URI-escaping functions.
// Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
//
// Returns a Uint8Array holding the UTF-8 bytes of |string|.
function encode_utf8(string) {
  // encodeURIComponent percent-encodes the UTF-8 bytes of the input, and
  // unescape turns every %XX back into one character whose code is that
  // byte, giving a string with exactly one character per UTF-8 byte.
  var byteString = unescape(encodeURIComponent(string));
  var byteValues = Array.prototype.map.call(byteString, function (ch) {
    return ch.charCodeAt(0);
  });
  return new Uint8Array(byteValues);
}
michael@0 44
// Reference UTF-8 decoder, the counterpart of encode_utf8.
//
//   octets - array-like of byte values (e.g. a Uint8Array)
//
// Returns the decoded string. decodeURIComponent throws a URIError when the
// bytes are not well-formed UTF-8.
function decode_utf8(octets) {
  // String.fromCharCode.apply() passes every element as a separate call
  // argument, and engines cap the argument count, so a single call over a
  // large input throws a RangeError. Convert in bounded chunks instead.
  // Chunk boundaries may fall inside a multi-byte sequence; that is fine
  // because each byte still becomes exactly one character of |utf8|.
  var CHUNK_SIZE = 0x8000;
  var utf8 = "";
  var start;
  for (start = 0; start < octets.length; start += CHUNK_SIZE) {
    utf8 += String.fromCharCode.apply(
      null, Array.prototype.slice.call(octets, start, start + CHUNK_SIZE));
  }
  // escape() percent-encodes each byte-valued character, and
  // decodeURIComponent interprets the resulting %XX run as UTF-8.
  return decodeURIComponent(escape(utf8));
}
michael@0 49
// Helpers for test_utf_roundtrip (cpname is also used by assert_string_equals).
// Formats a numeric code point (or code unit) for assertion messages as
// "U+XXXX": upper-case hex, zero-padded to four digits up to 0xFFFF and to
// six digits above that. Any value for which n + 0 !== n (i.e. anything that
// is not an ordinary number) is returned through its own toString().
function cpname(n) {
  if (n + 0 === n) {
    var digits = n <= 0xFFFF ? 4 : 6;
    var padded = '000000' + n.toString(16).toUpperCase();
    // Keep only the last |digits| characters of the padded hex string.
    return 'U+' + padded.slice(-digits);
  }
  return n.toString();
}
michael@0 57
// Builds a string containing, in ascending order, every code point in
// [from, from + len) except the surrogate range U+D800..U+DFFF. Code points
// above U+FFFF are written as a UTF-16 surrogate pair.
//
//   from - first code point of the block
//   len  - number of code points covered by the block
//
// The size computation only accounts for code points below 0x110000.
function genblock(from, len) {
  var i, j, point, offset;
  var size, block;
  var start, result;
  var CHUNK_SIZE = 0x8000;

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  // overlap() is the length of the intersection of two half-open ranges.
  function overlap(min1, max1, min2, max2) {
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }
  size = (overlap(from, from+len, 0x000000, 0x00D800) +
          overlap(from, from+len, 0x00E000, 0x010000) +
          overlap(from, from+len, 0x010000, 0x110000)*2);

  block = new Uint16Array(size);
  for (i = 0, j = 0; i < len; i++) {
    point = from + i;
    if (0xD800 <= point && point <= 0xDFFF) {
      // Surrogates are not encodable on their own; skip them.
      continue;
    } else if (point <= 0xFFFF) {
      block[j++] = point;
    } else {
      // Split into high (lead) and low (trail) surrogates.
      offset = point - 0x10000;
      block[j++] = 0xD800 + (offset >> 10);
      block[j++] = 0xDC00 + (offset & 0x3FF);
    }
  }

  // String.fromCharCode.apply() passes every code unit as a separate call
  // argument, and engines cap the argument count, so one call over a large
  // block throws a RangeError. Convert in bounded chunks; a surrogate pair
  // split between two chunks is rejoined by the concatenation.
  result = "";
  for (start = 0; start < block.length; start += CHUNK_SIZE) {
    result += String.fromCharCode.apply(
      null, block.subarray(start, start + CHUNK_SIZE));
  }
  return result;
}
michael@0 89
// Walks the whole code point range in blocks of BLOCK_SIZE code points. Each
// block produced by genblock() is encoded and decoded again with
// TextEncoder/TextDecoder objects constructed for "UTF-16LE", "UTF-16BE" and
// "UTF-8", and the result must equal the original block. The UTF-8 results
// are additionally compared with the encode_utf8/decode_utf8 reference
// helpers.
function test_utf_roundtrip () {
  var FIRST_CODEPOINT = 0;
  var LAST_CODEPOINT = 0x10FFFF;
  var BLOCK_SIZE = 0x1000;

  // The codecs are created once, up front, and reused for every block.
  var utf16le = {
    encoder: new TextEncoder("UTF-16LE"),
    decoder: new TextDecoder("UTF-16LE")
  };
  var utf16be = {
    encoder: new TextEncoder("UTF-16BE"),
    decoder: new TextDecoder("UTF-16BE")
  };
  var utf8 = {
    encoder: new TextEncoder("UTF-8"),
    decoder: new TextDecoder("UTF-8")
  };

  // Encodes |text| with |codec|, decodes the bytes again and asserts that
  // the text survived. Returns both intermediate results.
  function roundtrip(codec, label, text, tag) {
    var bytes = codec.encoder.encode(text);
    var textAgain = codec.decoder.decode(bytes);
    assert_string_equals(text, textAgain, label + " round trip " + tag);
    return { encoded: bytes, decoded: textAgain };
  }

  var first, tag, text, viaUtf8, referenceBytes, referenceText;

  for (first = FIRST_CODEPOINT; first < LAST_CODEPOINT; first += BLOCK_SIZE) {
    tag = cpname(first) + " - " + cpname(first + BLOCK_SIZE - 1);
    text = genblock(first, BLOCK_SIZE);

    // test UTF-16LE, UTF-16BE, and UTF-8 encodings against themselves
    roundtrip(utf16le, "UTF-16LE", text, tag);
    roundtrip(utf16be, "UTF-16BE", text, tag);
    viaUtf8 = roundtrip(utf8, "UTF-8", text, tag);

    // test TextEncoder(UTF-8) against the older idiom
    referenceBytes = encode_utf8(text);
    assert_array_equals(viaUtf8.encoded, referenceBytes,
                        "UTF-8 reference encoding " + tag);

    referenceText = decode_utf8(referenceBytes);
    assert_string_equals(viaUtf8.decoded, referenceText,
                         "UTF-8 reference decoding " + tag);
  }
}
michael@0 133
// Encodes one fixed sample string with each listed encoding label and checks
// the bytes against a hand-written expectation, then decodes those expected
// bytes and checks that the sample string comes back.
function test_utf_samples () {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";

  var UTF8_BYTES =
    [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD];
  var UTF16LE_BYTES =
    [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF];
  var UTF16BE_BYTES =
    [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD];

  // The bare "utf-16" label is expected to produce the same bytes as
  // "utf-16le".
  var cases = [
    { encoding: "utf-8",    expected: UTF8_BYTES },
    { encoding: "utf-16le", expected: UTF16LE_BYTES },
    { encoding: "utf-16",   expected: UTF16LE_BYTES },
    { encoding: "utf-16be", expected: UTF16BE_BYTES }
  ];

  var k, current, bytes, text;
  for (k = 0; k < cases.length; k++) {
    current = cases[k];

    bytes = new TextEncoder(current.encoding).encode(sample);
    assert_array_equals(bytes, current.expected,
                        "expected equal encodings - " + current.encoding);

    text = new TextDecoder(current.encoding)
             .decode(new Uint8Array(current.expected));
    assert_equals(text, sample,
                  "expected equal decodings - " + current.encoding);
  }
}
michael@0 160
// Register both test functions through test(), which is expected to come
// from testharness.js. The fixed reference sample is registered first.
test(
  test_utf_samples,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample"
);

// Exhaustive round trip over the whole code point range.
test(
  test_utf_roundtrip,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and " +
    "agreement with encode/decodeURIComponent"
);

mercurial