|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
|
2 * vim: set ts=8 sts=4 et sw=4 tw=99: |
|
3 * This Source Code Form is subject to the terms of the Mozilla Public |
|
4 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
6 |
|
7 #include "js/CharacterEncoding.h" |
|
8 |
|
9 #include "jscntxt.h" |
|
10 #include "jsprf.h" |
|
11 |
|
12 using namespace JS; |
|
13 |
|
14 Latin1CharsZ |
|
15 JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) |
|
16 { |
|
17 JS_ASSERT(cx); |
|
18 size_t len = tbchars.length(); |
|
19 unsigned char *latin1 = cx->pod_malloc<unsigned char>(len + 1); |
|
20 if (!latin1) |
|
21 return Latin1CharsZ(); |
|
22 for (size_t i = 0; i < len; ++i) |
|
23 latin1[i] = static_cast<unsigned char>(tbchars[i]); |
|
24 latin1[len] = '\0'; |
|
25 return Latin1CharsZ(latin1, len); |
|
26 } |
|
27 |
|
28 static size_t |
|
29 GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars) |
|
30 { |
|
31 size_t nbytes; |
|
32 const jschar *end; |
|
33 unsigned c, c2; |
|
34 |
|
35 nbytes = nchars; |
|
36 for (end = chars + nchars; chars != end; chars++) { |
|
37 c = *chars; |
|
38 if (c < 0x80) |
|
39 continue; |
|
40 if (0xD800 <= c && c <= 0xDFFF) { |
|
41 /* nbytes sets 1 length since this is surrogate pair. */ |
|
42 if (c >= 0xDC00 || (chars + 1) == end) { |
|
43 nbytes += 2; /* Bad Surrogate */ |
|
44 continue; |
|
45 } |
|
46 c2 = chars[1]; |
|
47 if (c2 < 0xDC00 || c2 > 0xDFFF) { |
|
48 nbytes += 2; /* Bad Surrogate */ |
|
49 continue; |
|
50 } |
|
51 c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; |
|
52 nbytes--; |
|
53 chars++; |
|
54 } |
|
55 c >>= 11; |
|
56 nbytes++; |
|
57 while (c) { |
|
58 c >>= 5; |
|
59 nbytes++; |
|
60 } |
|
61 } |
|
62 return nbytes; |
|
63 } |
|
64 |
|
/*
 * Append the three-byte UTF-8 encoding of U+FFFD (EF BF BD) at |*dst|.
 *
 * |*dstlenp| is the space left in the buffer. When fewer than three bytes
 * remain, nothing is touched and false is returned. Otherwise |*dst| is
 * advanced by three, |*dstlenp| is reduced by three, and true is returned.
 */
static bool
PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp)
{
    static const unsigned char replacement[] = { 0xEF, 0xBF, 0xBD };
    const size_t replacementLen = sizeof(replacement);

    if (*dstlenp < replacementLen)
        return false;

    char *out = *dst;
    for (size_t k = 0; k < replacementLen; k++)
        out[k] = static_cast<char>(replacement[k]);

    *dst = out + replacementLen;
    *dstlenp -= replacementLen;
    return true;
}
|
75 |
|
76 /* |
|
77 * Write up to |*dstlenp| bytes into |dst|. Writes the number of bytes used |
|
78 * into |*dstlenp| on success. Returns false on failure. |
|
79 */ |
|
80 static bool |
|
81 DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen, |
|
82 char *dst, size_t *dstlenp) |
|
83 { |
|
84 size_t dstlen = *dstlenp; |
|
85 size_t origDstlen = dstlen; |
|
86 |
|
87 while (srclen) { |
|
88 uint32_t v; |
|
89 jschar c = *src++; |
|
90 srclen--; |
|
91 if (c >= 0xDC00 && c <= 0xDFFF) { |
|
92 if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) |
|
93 goto bufferTooSmall; |
|
94 continue; |
|
95 } else if (c < 0xD800 || c > 0xDBFF) { |
|
96 v = c; |
|
97 } else { |
|
98 if (srclen < 1) { |
|
99 if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) |
|
100 goto bufferTooSmall; |
|
101 continue; |
|
102 } |
|
103 jschar c2 = *src; |
|
104 if ((c2 < 0xDC00) || (c2 > 0xDFFF)) { |
|
105 if (!PutUTF8ReplacementCharacter(&dst, &dstlen)) |
|
106 goto bufferTooSmall; |
|
107 continue; |
|
108 } |
|
109 src++; |
|
110 srclen--; |
|
111 v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000; |
|
112 } |
|
113 size_t utf8Len; |
|
114 if (v < 0x0080) { |
|
115 /* no encoding necessary - performance hack */ |
|
116 if (dstlen == 0) |
|
117 goto bufferTooSmall; |
|
118 *dst++ = (char) v; |
|
119 utf8Len = 1; |
|
120 } else { |
|
121 uint8_t utf8buf[4]; |
|
122 utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v); |
|
123 if (utf8Len > dstlen) |
|
124 goto bufferTooSmall; |
|
125 for (size_t i = 0; i < utf8Len; i++) |
|
126 *dst++ = (char) utf8buf[i]; |
|
127 } |
|
128 dstlen -= utf8Len; |
|
129 } |
|
130 *dstlenp = (origDstlen - dstlen); |
|
131 return true; |
|
132 |
|
133 bufferTooSmall: |
|
134 *dstlenp = (origDstlen - dstlen); |
|
135 if (cx->isJSContext()) |
|
136 JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr, |
|
137 JSMSG_BUFFER_TOO_SMALL); |
|
138 return false; |
|
139 } |
|
140 |
|
141 |
|
142 UTF8CharsZ |
|
143 JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars) |
|
144 { |
|
145 JS_ASSERT(cx); |
|
146 |
|
147 /* Get required buffer size. */ |
|
148 jschar *str = tbchars.start().get(); |
|
149 size_t len = GetDeflatedUTF8StringLength(str, tbchars.length()); |
|
150 |
|
151 /* Allocate buffer. */ |
|
152 unsigned char *utf8 = cx->pod_malloc<unsigned char>(len + 1); |
|
153 if (!utf8) |
|
154 return UTF8CharsZ(); |
|
155 |
|
156 /* Encode to UTF8. */ |
|
157 DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len); |
|
158 utf8[len] = '\0'; |
|
159 |
|
160 return UTF8CharsZ(utf8, len); |
|
161 } |
|
162 |
|
163 static const uint32_t INVALID_UTF8 = UINT32_MAX; |
|
164 |
|
165 /* |
|
166 * Convert a utf8 character sequence into a UCS-4 character and return that |
|
167 * character. It is assumed that the caller already checked that the sequence |
|
168 * is valid. |
|
169 */ |
|
170 uint32_t |
|
171 JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length) |
|
172 { |
|
173 JS_ASSERT(1 <= utf8Length && utf8Length <= 4); |
|
174 |
|
175 if (utf8Length == 1) { |
|
176 JS_ASSERT(!(*utf8Buffer & 0x80)); |
|
177 return *utf8Buffer; |
|
178 } |
|
179 |
|
180 /* from Unicode 3.1, non-shortest form is illegal */ |
|
181 static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 }; |
|
182 |
|
183 JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) == |
|
184 (0x100 - (1 << (8 - utf8Length)))); |
|
185 uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1); |
|
186 uint32_t minucs4Char = minucs4Table[utf8Length - 2]; |
|
187 while (--utf8Length) { |
|
188 JS_ASSERT((*utf8Buffer & 0xC0) == 0x80); |
|
189 ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F); |
|
190 } |
|
191 |
|
192 if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF))) |
|
193 return INVALID_UTF8; |
|
194 |
|
195 return ucs4Char; |
|
196 } |
|
197 |
|
198 static void |
|
199 ReportInvalidCharacter(JSContext *cx, uint32_t offset) |
|
200 { |
|
201 char buffer[10]; |
|
202 JS_snprintf(buffer, 10, "%d", offset); |
|
203 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, |
|
204 JSMSG_MALFORMED_UTF8_CHAR, buffer); |
|
205 } |
|
206 |
|
/*
 * Report JSMSG_BUFFER_TOO_SMALL on |cx|.
 *
 * |dummy| is unused. It gives this function the same (cx, uint32_t) shape as
 * the other Report* helpers, so the INVALID macro in
 * InflateUTF8StringToBuffer can invoke any of them as |report(cx, arg)|.
 */
static void
ReportBufferTooSmall(JSContext *cx, uint32_t dummy)
{
    JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
}
|
212 |
|
213 static void |
|
214 ReportTooBigCharacter(JSContext *cx, uint32_t v) |
|
215 { |
|
216 char buffer[10]; |
|
217 JS_snprintf(buffer, 10, "0x%x", v + 0x10000); |
|
218 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr, |
|
219 JSMSG_UTF8_CHAR_TOO_LARGE, buffer); |
|
220 } |
|
221 |
|
// Selects what InflateUTF8StringToBuffer does while it walks the UTF-8 input.
enum InflateUTF8Action {
    // Count the output jschars; on malformed input, report an error and fail.
    CountAndReportInvalids,
    // Count the output jschars; malformed input is counted as one output unit
    // and skipped over.
    CountAndIgnoreInvalids,
    // Write the output jschars into |dst|, storing REPLACE_UTF8 for malformed
    // input.
    Copy
};
|
227 |
|
// U+FFFD REPLACEMENT CHARACTER, stored in place of malformed input when copying.
static const uint32_t REPLACE_UTF8 = 0xFFFD;

// Walk the UTF-8 bytes in |src| and either count or write the corresponding
// jschars, depending on |action|:
//
//   CountAndReportInvalids: count only; on malformed input call the matching
//                           Report* helper and return false (|*dstlenp| is
//                           not written in that case).
//   CountAndIgnoreInvalids: count only; malformed input counts as one unit.
//   Copy:                   write into |dst|; malformed input is written as
//                           REPLACE_UTF8.
//
// |dst| is only dereferenced when |action| is Copy, and no capacity check is
// made on it: |*dstlenp| is purely an output, set to the number of jschars
// counted or written. |*isAsciip| is set to true unless some byte of |src|
// has its high bit set.
//
// If making changes to this algorithm, make sure to also update
// LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
template <InflateUTF8Action action>
static bool
InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp,
                          bool *isAsciip)
{
    *isAsciip = true;

    // Walk |src| once, counting (and, for Copy, writing) the jschars of the
    // inflated string.
    // |i| is the index into |src|, and |j| is the index into |dst|.
    size_t srclen = src.length();
    uint32_t j = 0;
    for (uint32_t i = 0; i < srclen; i++, j++) {
        uint32_t v = uint32_t(src[i]);
        if (!(v & 0x80)) {
            // ASCII code unit.  Simple copy.
            if (action == Copy)
                dst[j] = jschar(v);

        } else {
            // Non-ASCII code unit.  Determine its length in bytes (n) by
            // counting the leading one bits of the first byte.
            *isAsciip = false;
            uint32_t n = 1;
            while (v & (0x80 >> n))
                n++;

            // INVALID(report, arg, n2) handles a malformed sequence starting
            // at |i|. In CountAndReportInvalids mode it calls |report(cx, arg)|
            // and returns false. Otherwise it emits one replacement unit
            // (written only in Copy mode), sets |n| to |n2| -- the number of
            // source bytes to skip -- and jumps to the common tail below.
        #define INVALID(report, arg, n2)                                \
            do {                                                        \
                if (action == CountAndReportInvalids) {                 \
                    report(cx, arg);                                    \
                    return false;                                       \
                } else {                                                \
                    if (action == Copy)                                 \
                        dst[j] = jschar(REPLACE_UTF8);                  \
                    else                                                \
                        JS_ASSERT(action == CountAndIgnoreInvalids);    \
                    n = n2;                                             \
                    goto invalidMultiByteCodeUnit;                      \
                }                                                       \
            } while (0)

            // Check the leading byte.
            if (n < 2 || n > 4)
                INVALID(ReportInvalidCharacter, i, 1);

            // Check that |src| is large enough to hold an n-byte code unit.
            if (i + n > srclen)
                INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);

            // Check the second byte.  From Unicode Standard v6.2, Table 3-7
            // Well-Formed UTF-8 Byte Sequences.
            if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) ||  // E0 A0~BF
                (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) ||  // ED 80~9F
                (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) ||  // F0 90~BF
                (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80))    // F4 80~8F
            {
                INVALID(ReportInvalidCharacter, i, 1);
            }

            // Check the continuation bytes.  On failure, skip only the |m|
            // bytes that precede the offending one.
            for (uint32_t m = 1; m < n; m++)
                if ((src[i + m] & 0xC0) != 0x80)
                    INVALID(ReportInvalidCharacter, i, m);

            // Determine the code unit's length in jschars and act accordingly.
            v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n);
            if (v < 0x10000) {
                // The n-byte UTF8 code unit will fit in a single jschar.
                if (action == Copy)
                    dst[j] = jschar(v);

            } else {
                v -= 0x10000;
                if (v <= 0xFFFFF) {
                    // The n-byte UTF8 code unit will fit in two jschars.
                    if (action == Copy)
                        dst[j] = jschar((v >> 10) + 0xD800);
                    j++;
                    if (action == Copy)
                        dst[j] = jschar((v & 0x3FF) + 0xDC00);

                } else {
                    // The n-byte UTF8 code unit won't fit in two jschars.
                    // An INVALID_UTF8 result from Utf8ToOneUcs4Char also
                    // ends up here, since it is larger than any valid value.
                    INVALID(ReportTooBigCharacter, v, 1);
                }
            }

          invalidMultiByteCodeUnit:
            // Move i to the last byte of the multi-byte code unit;  the loop
            // header will do the final i++ to move to the start of the next
            // code unit.
            i += n - 1;
        }
    }

    *dstlenp = j;

    return true;
}
|
330 |
|
331 typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip); |
|
332 |
|
333 static TwoByteCharsZ |
|
334 InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen) |
|
335 { |
|
336 *outlen = 0; |
|
337 |
|
338 bool isAscii; |
|
339 if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii)) |
|
340 return TwoByteCharsZ(); |
|
341 |
|
342 jschar *dst = cx->pod_malloc<jschar>(*outlen + 1); // +1 for NUL |
|
343 if (!dst) |
|
344 return TwoByteCharsZ(); |
|
345 |
|
346 if (isAscii) { |
|
347 size_t srclen = src.length(); |
|
348 JS_ASSERT(*outlen == srclen); |
|
349 for (uint32_t i = 0; i < srclen; i++) |
|
350 dst[i] = jschar(src[i]); |
|
351 |
|
352 } else { |
|
353 JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii)); |
|
354 } |
|
355 |
|
356 dst[*outlen] = 0; // NUL char |
|
357 |
|
358 return TwoByteCharsZ(dst, *outlen); |
|
359 } |
|
360 |
|
361 TwoByteCharsZ |
|
362 JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) |
|
363 { |
|
364 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndReportInvalids>, |
|
365 outlen); |
|
366 } |
|
367 |
|
368 TwoByteCharsZ |
|
369 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen) |
|
370 { |
|
371 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndIgnoreInvalids>, |
|
372 outlen); |
|
373 } |
|
374 |