Fri, 16 Jan 2015 18:13:44 +0100
Integrate suggestion from review to improve consistency with existing code.
michael@0 | 1 | // NOTE: Requires testharness.js |
michael@0 | 2 | // http://www.w3.org/2008/webapps/wiki/Harness |
michael@0 | 3 | |
// Extension to the testharness.js API: compares two strings without ever
// placing the (potentially enormous) strings themselves in a failure message.
//
//   actual, expected - the strings to compare
//   description      - prefix used for every assertion message
//
// Identical strings pass immediately. Otherwise the failure reports either
// the differing lengths or the index and code units of the first mismatch.
function assert_string_equals(actual, expected, description) {
  // Fast path: nothing more to inspect when the strings are identical.
  if (actual === expected) {
    assert_true(true, description + ": <actual> === <expected>");
    return;
  }

  // A length mismatch is reported as two numbers rather than two strings.
  assert_equals(actual.length, expected.length,
                description + ": string lengths");

  // Lengths agree, so scan for the first code unit that differs.
  var index = 0;
  var unitActual;
  var unitExpected;
  while (index < actual.length) {
    unitActual = actual.charCodeAt(index);
    unitExpected = expected.charCodeAt(index);
    if (unitActual !== unitExpected) {
      // As noted in the original code, this assertion does not return.
      assert_true(false,
                  description +
                  ": code unit " + index.toString() + " unequal: " +
                  cpname(unitActual) + " != " + cpname(unitExpected));
    }
    index++;
  }

  // Should be unreachable: the strings compared unequal above, so either the
  // length assertion or the per-unit scan should already have failed.
  assert_true(false, description + ": failed to detect string difference");
}
michael@0 | 33 | |
// Reference UTF-8 encoder based on the classic
// unescape(encodeURIComponent(...)) idiom. Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
//
// Returns a Uint8Array holding one octet of the encoded form per element.
function encode_utf8(string) {
  // Percent-encode as UTF-8, then turn each %XX escape back into a single
  // character, so every char code of byteString is one encoded octet.
  var byteString = unescape(encodeURIComponent(string));
  var count = byteString.length;
  var octets = new Uint8Array(count);
  var pos = 0;
  while (pos < count) {
    octets[pos] = byteString.charCodeAt(pos);
    pos += 1;
  }
  return octets;
}
michael@0 | 44 | |
// Reference UTF-8 decoder: the inverse of the
// unescape(encodeURIComponent(...)) idiom.
//
//   octets - array-like of byte values (e.g. a Uint8Array)
//
// Returns the decoded string. decodeURIComponent throws a URIError when the
// octets are not well-formed UTF-8.
function decode_utf8(octets) {
  // String.fromCharCode.apply passes one argument per octet, and engines cap
  // the number of arguments a call may take. Convert in bounded chunks so
  // large inputs do not fail with a RangeError.
  var CHUNK = 0x2000;
  var pieces = [];
  var start;
  for (start = 0; start < octets.length; start += CHUNK) {
    pieces.push(String.fromCharCode.apply(
      null, Array.prototype.slice.call(octets, start, start + CHUNK)));
  }
  // Each character now holds one octet, so a multi-byte sequence split across
  // two chunks is intact again after the join. escape() percent-encodes the
  // octets and decodeURIComponent interprets them as UTF-8.
  return decodeURIComponent(escape(pieces.join("")));
}
michael@0 | 49 | |
// Helpers for test_utf_roundtrip.

// Formats a code point (or code unit) for use in assertion messages.
//
//   n - the value to name
//
// Non-negative integers are returned as "U+XXXX" (four hex digits up to
// U+FFFF, at least six above that). Anything else - non-numbers, NaN,
// infinities, negative or fractional numbers - is returned as String(n), so
// building a failure message never throws or yields a malformed label.
function cpname(n) {
  if (typeof n !== 'number' || !isFinite(n) || n < 0 || Math.floor(n) !== n)
    return String(n);
  var hex = n.toString(16).toUpperCase();
  var width = (n <= 0xFFFF) ? 4 : 6;
  // Pad on the left only; digits are never truncated.
  while (hex.length < width)
    hex = '0' + hex;
  return 'U+' + hex;
}
michael@0 | 57 | |
// Builds a string containing every code point in [from, from + len), in
// ascending order, as UTF-16 code units.
//
//   from - first code point of the block
//   len  - number of consecutive code points to cover
//
// Surrogate code points (U+D800 through U+DFFF) are skipped, and code points
// above U+FFFF are emitted as surrogate pairs.
function genblock(from, len) {
  var i, j, point, offset;
  var size, block;
  var CHUNK = 0x2000;
  var pieces = [];

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  // overlap() gives the length of the intersection of two half-open ranges.
  function overlap(min1, max1, min2, max2) {
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }
  size = (overlap(from, from + len, 0x000000, 0x00D800) +
          overlap(from, from + len, 0x00E000, 0x010000) +
          overlap(from, from + len, 0x010000, 0x110000) * 2);

  block = new Uint16Array(size);
  for (i = 0, j = 0; i < len; i++) {
    point = from + i;
    if (0xD800 <= point && point <= 0xDFFF) {
      continue;
    } else if (point <= 0xFFFF) {
      block[j++] = point;
    } else {
      // Split a supplementary code point into lead and trail surrogates.
      offset = point - 0x10000;
      block[j++] = 0xD800 + (offset >> 10);
      block[j++] = 0xDC00 + (offset & 0x3FF);
    }
  }

  // String.fromCharCode.apply passes one argument per code unit, and engines
  // cap the number of arguments a call may take. Convert in bounded chunks so
  // a large len does not fail with a RangeError. A surrogate pair split
  // across two chunks is intact again after the join.
  for (i = 0; i < block.length; i += CHUNK) {
    pieces.push(String.fromCharCode.apply(null, block.subarray(i, i + CHUNK)));
  }
  return pieces.join('');
}
michael@0 | 89 | |
// Walks the code point range U+0000 through U+10FFFF in blocks of 0x1000
// points. For each block it checks that:
//   - UTF-16LE, UTF-16BE and UTF-8 encode/decode round-trips reproduce the
//     block, and
//   - the UTF-8 TextEncoder/TextDecoder results agree with the
//     encodeURIComponent/decodeURIComponent reference helpers.
//
// NOTE(review): the current Encoding Standard appears to define TextEncoder
// as UTF-8 only; the UTF-16 round trips rely on an implementation that
// honors the constructor label - confirm against the target engine.
function test_utf_roundtrip () {
  var FIRST_POINT = 0;
  var LAST_POINT = 0x10FFFF;
  var POINTS_PER_BLOCK = 0x1000;

  var utf16leEncoder = new TextEncoder("UTF-16LE");
  var utf16leDecoder = new TextDecoder("UTF-16LE");

  var utf16beEncoder = new TextEncoder("UTF-16BE");
  var utf16beDecoder = new TextDecoder("UTF-16BE");

  var utf8Encoder = new TextEncoder("UTF-8");
  var utf8Decoder = new TextDecoder("UTF-8");

  var start, tag, text, bytes, roundTripped, referenceBytes, referenceText;

  start = FIRST_POINT;
  while (start < LAST_POINT) {
    tag = cpname(start) + " - " + cpname(start + POINTS_PER_BLOCK - 1);
    text = genblock(start, POINTS_PER_BLOCK);

    // Each encoding is checked against itself: encode, decode, compare.
    bytes = utf16leEncoder.encode(text);
    roundTripped = utf16leDecoder.decode(bytes);
    assert_string_equals(text, roundTripped, "UTF-16LE round trip " + tag);

    bytes = utf16beEncoder.encode(text);
    roundTripped = utf16beDecoder.decode(bytes);
    assert_string_equals(text, roundTripped, "UTF-16BE round trip " + tag);

    bytes = utf8Encoder.encode(text);
    roundTripped = utf8Decoder.decode(bytes);
    assert_string_equals(text, roundTripped, "UTF-8 round trip " + tag);

    // bytes / roundTripped still hold the UTF-8 results here; compare them
    // with the older encodeURIComponent-based idiom.
    referenceBytes = encode_utf8(text);
    assert_array_equals(bytes, referenceBytes,
                        "UTF-8 reference encoding " + tag);

    referenceText = decode_utf8(referenceBytes);
    assert_string_equals(roundTripped, referenceText,
                         "UTF-8 reference decoding " + tag);

    start += POINTS_PER_BLOCK;
  }
}
michael@0 | 133 | |
// Encodes and decodes one fixed sample string with each listed encoding
// label and compares the results with hand-written byte sequences.
function test_utf_samples () {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";

  // The "utf-16" entry expects the same bytes as "utf-16le".
  var cases = [
    { encoding: "utf-8",
      expected: [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E, 0xF4, 0x8F, 0xBF, 0xBD] },
    { encoding: "utf-16le",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16",
      expected: [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E, 0xDD, 0xFF, 0xDB, 0xFD, 0xDF] },
    { encoding: "utf-16be",
      expected: [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD, 0x1E, 0xDB, 0xFF, 0xDF, 0xFD] }
  ];

  var k, entry, encoder, decoder, bytes, text;
  for (k = 0; k < cases.length; k++) {
    entry = cases[k];

    // Encoding the sample must produce exactly the expected bytes...
    encoder = new TextEncoder(entry.encoding);
    bytes = encoder.encode(sample);
    assert_array_equals(bytes, entry.expected,
                        "expected equal encodings - " + entry.encoding);

    // ...and decoding the expected bytes must give back the sample.
    decoder = new TextDecoder(entry.encoding);
    text = decoder.decode(new Uint8Array(entry.expected));
    assert_equals(text, sample,
                  "expected equal decodings - " + entry.encoding);
  }
}
michael@0 | 160 | |
// Register both test functions with the harness: the fixed-sample check
// first, then the full code point round trip.
test(
  test_utf_samples,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample"
);

test(
  test_utf_roundtrip,
  "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and " +
  "agreement with encode/decodeURIComponent"
);