|
1 // NOTE: Requires testharness.js |
|
2 // http://www.w3.org/2008/webapps/wiki/Harness |
|
3 |
|
/**
 * Extension to the testharness.js API which avoids logging enormous strings
 * on a coding failure.
 *
 * Instead of reporting both strings in full, a mismatch is reported as the
 * first difference found: differing types, differing lengths, or the index
 * and values of the first differing UTF-16 code unit.
 *
 * @param {string} actual - String produced by the code under test.
 * @param {string} expected - String the test expects.
 * @param {string} description - Prefix used for every assertion message.
 */
function assert_string_equals(actual, expected, description) {
  // Short-circuit the success case.
  if (actual === expected) {
    assert_true(true, description + ": <actual> === <expected>");
    return;
  }

  // Report a non-string argument (e.g. undefined returned by a decoder) as a
  // described assertion failure rather than an anonymous TypeError raised by
  // the property accesses below.
  assert_equals(typeof actual, "string", description + ": type of <actual>");
  assert_equals(typeof expected, "string",
                description + ": type of <expected>");

  // Length check.
  assert_equals(actual.length, expected.length,
                description + ": string lengths");

  var i, a, b;
  for (i = 0; i < actual.length; i++) {
    a = actual.charCodeAt(i);
    b = expected.charCodeAt(i);
    if (a !== b) {
      assert_true(false,
                  description +
                  ": code unit " + i.toString() + " unequal: " +
                  cpname(a) + " != " + cpname(b)); // doesn't return
    }
  }

  // It should be impossible to get here, because the initial
  // comparison failed, so either the type comparison, the length
  // comparison or the codeunit-by-codeunit comparison should also fail.
  assert_true(false, description + ": failed to detect string difference");
}
|
33 |
|
// Inspired by:
// http://ecmanaut.blogspot.com/2006/07/encoding-decoding-utf8-in-javascript.html
/**
 * Reference UTF-8 encoder built on encodeURIComponent.
 *
 * encodeURIComponent emits every UTF-8 byte either as a literal ASCII
 * character or as a "%XX" escape, so the escapes are decoded here directly
 * instead of going through the deprecated unescape() function.
 *
 * @param {string} string - Text to encode (coerced to a string by
 *     encodeURIComponent).
 * @returns {Uint8Array} The UTF-8 octets of the text.
 * @throws {URIError} If the text contains an unpaired surrogate (raised by
 *     encodeURIComponent).
 */
function encode_utf8(string) {
  var escaped = encodeURIComponent(string);

  // A literal "%" in the input is itself escaped as "%25", so every "%" in
  // the escaped form starts a three-character escape that yields one octet.
  var escapeCount = escaped.split("%").length - 1;
  var octets = new Uint8Array(escaped.length - 2 * escapeCount);

  var read = 0, write = 0;
  while (read < escaped.length) {
    if (escaped.charAt(read) === "%") {
      octets[write++] = parseInt(escaped.substring(read + 1, read + 3), 16);
      read += 3;
    } else {
      octets[write++] = escaped.charCodeAt(read);
      read += 1;
    }
  }
  return octets;
}
|
44 |
|
/**
 * Reference UTF-8 decoder built on decodeURIComponent.
 *
 * Each octet is written as a "%XX" escape and the whole sequence is handed to
 * decodeURIComponent. Building the escaped text octet by octet avoids both
 * the deprecated escape() function and a single
 * String.fromCharCode.apply(null, octets) call, which fails with a RangeError
 * once the input exceeds the engine's argument-count limit.
 *
 * @param {Uint8Array|number[]} octets - UTF-8 encoded bytes. null/undefined
 *     is treated as an empty sequence, as it was before.
 * @returns {string} The decoded text.
 * @throws {URIError} If the octets are not well-formed UTF-8 (raised by
 *     decodeURIComponent).
 */
function decode_utf8(octets) {
  var HEX_DIGITS = "0123456789ABCDEF";
  var count = (octets === null || octets === undefined) ? 0 : octets.length;
  var escaped = new Array(count);
  var i, octet;

  for (i = 0; i < count; i += 1) {
    octet = octets[i];
    // Values outside 0..255 yield a malformed escape and therefore a
    // URIError from decodeURIComponent.
    escaped[i] = "%" + HEX_DIGITS.charAt(octet >> 4) +
                 HEX_DIGITS.charAt(octet & 0xF);
  }
  return decodeURIComponent(escaped.join(""));
}
|
49 |
|
50 // Helpers for test_utf_roundtrip. |
|
/**
 * Formats a code point (or code unit) value for use in assertion messages.
 *
 * Non-negative integers are rendered as "U+" followed by upper-case hex,
 * zero-padded to 4 digits up to 0xFFFF and to 6 digits above that. Longer
 * values are never truncated. Anything else (NaN, null, undefined, strings,
 * negative, fractional or infinite numbers) is rendered with String(n) so
 * that building a failure message can never itself throw.
 *
 * @param {*} n - Value to format.
 * @returns {string} Human-readable name for the value.
 */
function cpname(n) {
  var isCodePointLike = typeof n === "number" && isFinite(n) &&
                        n >= 0 && Math.floor(n) === n;
  if (!isCodePointLike) {
    return String(n);
  }

  var width = (n <= 0xFFFF) ? 4 : 6;
  var hex = n.toString(16).toUpperCase();
  while (hex.length < width) {
    hex = "0" + hex;
  }
  return "U+" + hex;
}
|
57 |
|
/**
 * Builds a string containing every Unicode scalar value in the half-open
 * range [from, from + len), in ascending order.
 *
 * Surrogate code points (U+D800 through U+DFFF) are skipped, and the range is
 * clamped to [0, 0x110000), so values outside the Unicode code space
 * contribute nothing. Supplementary code points are emitted as UTF-16
 * surrogate pairs.
 *
 * @param {number} from - First code point of the range.
 * @param {number} len - Number of consecutive code points in the range.
 * @returns {string} The generated string (empty if the clamped range is
 *     empty).
 */
function genblock(from, len) {
  // Number of code units passed to one String.fromCharCode call. Passing the
  // whole block at once throws a RangeError for large blocks because engines
  // limit the number of arguments accepted by Function.prototype.apply.
  var CHUNK_UNITS = 0x8000;

  var start = Math.max(from, 0);
  var end = Math.min(from + len, 0x110000);

  // Length of the intersection of [min1, max1) and [min2, max2).
  function overlap(min1, max1, min2, max2) {
    return Math.max(0, Math.min(max1, max2) - Math.max(min1, min2));
  }

  // determine size required:
  //    1 unit   for each point from U+000000 through U+00D7FF
  //    0 units                      U+00D800 through U+00DFFF
  //    1 unit                       U+00E000 through U+00FFFF
  //    2 units                      U+010000 through U+10FFFF
  var size = (overlap(start, end, 0x000000, 0x00D800) +
              overlap(start, end, 0x00E000, 0x010000) +
              overlap(start, end, 0x010000, 0x110000) * 2);

  var block = new Uint16Array(size);
  var j = 0, point, offset;
  for (point = start; point < end; point++) {
    if (0xD800 <= point && point <= 0xDFFF) {
      continue;
    } else if (point <= 0xFFFF) {
      block[j++] = point;
    } else {
      // Split into a high (lead) and low (trail) surrogate.
      offset = point - 0x10000;
      block[j++] = 0xD800 + (offset >> 10);
      block[j++] = 0xDC00 + (offset & 0x3FF);
    }
  }

  // Concatenating strings preserves code units, so a surrogate pair that
  // straddles a chunk boundary is still reassembled correctly.
  var pieces = [], k;
  for (k = 0; k < block.length; k += CHUNK_UNITS) {
    pieces.push(String.fromCharCode.apply(
        null, block.subarray(k, k + CHUNK_UNITS)));
  }
  return pieces.join("");
}
|
89 |
|
/**
 * Sweeps the code point range 0 through 0x10FFFF in blocks of 0x1000 code
 * points. For each block it:
 *   - encodes and decodes the block with UTF-16LE, UTF-16BE and UTF-8 and
 *     asserts that each result equals the original block;
 *   - asserts that the UTF-8 bytes equal those produced by encode_utf8;
 *   - asserts that the UTF-8 decoded text equals decode_utf8 applied to the
 *     reference bytes.
 *
 * NOTE(review): relies on TextEncoder honouring the encoding label passed to
 * its constructor - confirm against the implementation under test.
 */
function test_utf_roundtrip () {
  var FIRST_CODEPOINT = 0;
  var LAST_CODEPOINT = 0x10FFFF;
  var CODEPOINTS_PER_BLOCK = 0x1000;

  // Encoders and decoders are created once, up front, in this order:
  // UTF-16LE, UTF-16BE, UTF-8.
  var utf16Codecs = [
    { label: "UTF-16LE",
      encoder: new TextEncoder("UTF-16LE"),
      decoder: new TextDecoder("UTF-16LE") },
    { label: "UTF-16BE",
      encoder: new TextEncoder("UTF-16BE"),
      decoder: new TextDecoder("UTF-16BE") }
  ];
  var utf8Encoder = new TextEncoder("UTF-8");
  var utf8Decoder = new TextDecoder("UTF-8");

  var start, tag, text, c, codec;
  var utf8Bytes, utf8Text, referenceBytes, referenceText;

  for (start = FIRST_CODEPOINT;
       start < LAST_CODEPOINT;
       start += CODEPOINTS_PER_BLOCK) {
    tag = cpname(start) + " - " + cpname(start + CODEPOINTS_PER_BLOCK - 1);
    text = genblock(start, CODEPOINTS_PER_BLOCK);

    // Each UTF-16 flavour must round-trip against itself.
    for (c = 0; c < utf16Codecs.length; c += 1) {
      codec = utf16Codecs[c];
      assert_string_equals(text,
                           codec.decoder.decode(codec.encoder.encode(text)),
                           codec.label + " round trip " + tag);
    }

    // UTF-8 must round-trip too; its results are kept for the reference
    // comparisons below.
    utf8Bytes = utf8Encoder.encode(text);
    utf8Text = utf8Decoder.decode(utf8Bytes);
    assert_string_equals(text, utf8Text, "UTF-8 round trip " + tag);

    // Compare TextEncoder(UTF-8) against the older URI-component idiom.
    referenceBytes = encode_utf8(text);
    assert_array_equals(utf8Bytes, referenceBytes,
                        "UTF-8 reference encoding " + tag);

    referenceText = decode_utf8(referenceBytes);
    assert_string_equals(utf8Text, referenceText,
                         "UTF-8 reference decoding " + tag);
  }
}
|
133 |
|
/**
 * Checks a fixed sample string against hand-written byte sequences for the
 * "utf-8", "utf-16le", "utf-16" and "utf-16be" labels. For each label the
 * sample is encoded and compared with the expected bytes, then the expected
 * bytes are decoded and compared with the sample.
 *
 * NOTE(review): relies on TextEncoder honouring the encoding label passed to
 * its constructor - confirm against the implementation under test.
 */
function test_utf_samples () {
  // z, cent, CJK water, G-Clef, Private-use character
  var sample = "z\xA2\u6C34\uD834\uDD1E\uDBFF\uDFFD";

  var utf8Bytes = [0x7A, 0xC2, 0xA2, 0xE6, 0xB0, 0xB4, 0xF0, 0x9D, 0x84, 0x9E,
                   0xF4, 0x8F, 0xBF, 0xBD];
  // The "utf-16" label is expected to produce the same bytes as "utf-16le".
  var utf16leBytes = [0x7A, 0x00, 0xA2, 0x00, 0x34, 0x6C, 0x34, 0xD8, 0x1E,
                      0xDD, 0xFF, 0xDB, 0xFD, 0xDF];
  var utf16beBytes = [0x00, 0x7A, 0x00, 0xA2, 0x6C, 0x34, 0xD8, 0x34, 0xDD,
                      0x1E, 0xDB, 0xFF, 0xDF, 0xFD];

  // [encoding label, expected bytes], checked in this order.
  var table = [
    ["utf-8", utf8Bytes],
    ["utf-16le", utf16leBytes],
    ["utf-16", utf16leBytes],
    ["utf-16be", utf16beBytes]
  ];

  var row, label, bytes, encoded, decoded;
  for (row = 0; row < table.length; row += 1) {
    label = table[row][0];
    bytes = table[row][1];

    encoded = new TextEncoder(label).encode(sample);
    assert_array_equals(encoded, bytes,
                        "expected equal encodings - " + label);

    decoded = new TextDecoder(label).decode(new Uint8Array(bytes));
    assert_equals(decoded, sample,
                  "expected equal decodings - " + label);
  }
}
|
160 |
|
// Register the tests through test() (presumably the testharness.js entry
// point this file requires). The fixed reference sample is registered first,
// then the exhaustive round-trip sweep.
[
  { run: test_utf_samples,
    name: "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - reference sample" },
  { run: test_utf_roundtrip,
    name: "UTF-8, UTF-16LE, UTF-16BE - Encode/Decode - full roundtrip and " +
          "agreement with encode/decodeURIComponent" }
].forEach(function (entry) {
  test(entry.run, entry.name);
});