js/src/vm/CharacterEncoding.cpp

branch
TOR_BUG_3246
changeset 7
129ffea94266
equal deleted inserted replaced
-1:000000000000 0:394f3f833a54
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2 * vim: set ts=8 sts=4 et sw=4 tw=99:
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7 #include "js/CharacterEncoding.h"
8
9 #include "jscntxt.h"
10 #include "jsprf.h"
11
12 using namespace JS;
13
14 Latin1CharsZ
15 JS::LossyTwoByteCharsToNewLatin1CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
16 {
17 JS_ASSERT(cx);
18 size_t len = tbchars.length();
19 unsigned char *latin1 = cx->pod_malloc<unsigned char>(len + 1);
20 if (!latin1)
21 return Latin1CharsZ();
22 for (size_t i = 0; i < len; ++i)
23 latin1[i] = static_cast<unsigned char>(tbchars[i]);
24 latin1[len] = '\0';
25 return Latin1CharsZ(latin1, len);
26 }
27
28 static size_t
29 GetDeflatedUTF8StringLength(const jschar *chars, size_t nchars)
30 {
31 size_t nbytes;
32 const jschar *end;
33 unsigned c, c2;
34
35 nbytes = nchars;
36 for (end = chars + nchars; chars != end; chars++) {
37 c = *chars;
38 if (c < 0x80)
39 continue;
40 if (0xD800 <= c && c <= 0xDFFF) {
41 /* nbytes sets 1 length since this is surrogate pair. */
42 if (c >= 0xDC00 || (chars + 1) == end) {
43 nbytes += 2; /* Bad Surrogate */
44 continue;
45 }
46 c2 = chars[1];
47 if (c2 < 0xDC00 || c2 > 0xDFFF) {
48 nbytes += 2; /* Bad Surrogate */
49 continue;
50 }
51 c = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
52 nbytes--;
53 chars++;
54 }
55 c >>= 11;
56 nbytes++;
57 while (c) {
58 c >>= 5;
59 nbytes++;
60 }
61 }
62 return nbytes;
63 }
64
/*
 * Append the three bytes EF BF BD (the UTF-8 form of U+FFFD) at |*dst|,
 * advancing |*dst| by three and reducing |*dstlenp| by three. Returns false,
 * leaving both untouched, when fewer than three bytes of room remain.
 */
static bool
PutUTF8ReplacementCharacter(char **dst, size_t *dstlenp)
{
    static const unsigned char replacement[] = { 0xEF, 0xBF, 0xBD };
    const size_t replacementLen = sizeof(replacement);

    if (*dstlenp < replacementLen)
        return false;

    char *out = *dst;
    for (size_t k = 0; k < replacementLen; k++)
        out[k] = (char) replacement[k];

    *dst = out + replacementLen;
    *dstlenp -= replacementLen;
    return true;
}
75
76 /*
77 * Write up to |*dstlenp| bytes into |dst|. Writes the number of bytes used
78 * into |*dstlenp| on success. Returns false on failure.
79 */
80 static bool
81 DeflateStringToUTF8Buffer(js::ThreadSafeContext *cx, const jschar *src, size_t srclen,
82 char *dst, size_t *dstlenp)
83 {
84 size_t dstlen = *dstlenp;
85 size_t origDstlen = dstlen;
86
87 while (srclen) {
88 uint32_t v;
89 jschar c = *src++;
90 srclen--;
91 if (c >= 0xDC00 && c <= 0xDFFF) {
92 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
93 goto bufferTooSmall;
94 continue;
95 } else if (c < 0xD800 || c > 0xDBFF) {
96 v = c;
97 } else {
98 if (srclen < 1) {
99 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
100 goto bufferTooSmall;
101 continue;
102 }
103 jschar c2 = *src;
104 if ((c2 < 0xDC00) || (c2 > 0xDFFF)) {
105 if (!PutUTF8ReplacementCharacter(&dst, &dstlen))
106 goto bufferTooSmall;
107 continue;
108 }
109 src++;
110 srclen--;
111 v = ((c - 0xD800) << 10) + (c2 - 0xDC00) + 0x10000;
112 }
113 size_t utf8Len;
114 if (v < 0x0080) {
115 /* no encoding necessary - performance hack */
116 if (dstlen == 0)
117 goto bufferTooSmall;
118 *dst++ = (char) v;
119 utf8Len = 1;
120 } else {
121 uint8_t utf8buf[4];
122 utf8Len = js_OneUcs4ToUtf8Char(utf8buf, v);
123 if (utf8Len > dstlen)
124 goto bufferTooSmall;
125 for (size_t i = 0; i < utf8Len; i++)
126 *dst++ = (char) utf8buf[i];
127 }
128 dstlen -= utf8Len;
129 }
130 *dstlenp = (origDstlen - dstlen);
131 return true;
132
133 bufferTooSmall:
134 *dstlenp = (origDstlen - dstlen);
135 if (cx->isJSContext())
136 JS_ReportErrorNumber(cx->asJSContext(), js_GetErrorMessage, nullptr,
137 JSMSG_BUFFER_TOO_SMALL);
138 return false;
139 }
140
141
142 UTF8CharsZ
143 JS::TwoByteCharsToNewUTF8CharsZ(js::ThreadSafeContext *cx, TwoByteChars tbchars)
144 {
145 JS_ASSERT(cx);
146
147 /* Get required buffer size. */
148 jschar *str = tbchars.start().get();
149 size_t len = GetDeflatedUTF8StringLength(str, tbchars.length());
150
151 /* Allocate buffer. */
152 unsigned char *utf8 = cx->pod_malloc<unsigned char>(len + 1);
153 if (!utf8)
154 return UTF8CharsZ();
155
156 /* Encode to UTF8. */
157 DeflateStringToUTF8Buffer(cx, str, tbchars.length(), (char *)utf8, &len);
158 utf8[len] = '\0';
159
160 return UTF8CharsZ(utf8, len);
161 }
162
163 static const uint32_t INVALID_UTF8 = UINT32_MAX;
164
165 /*
166 * Convert a utf8 character sequence into a UCS-4 character and return that
167 * character. It is assumed that the caller already checked that the sequence
168 * is valid.
169 */
170 uint32_t
171 JS::Utf8ToOneUcs4Char(const uint8_t *utf8Buffer, int utf8Length)
172 {
173 JS_ASSERT(1 <= utf8Length && utf8Length <= 4);
174
175 if (utf8Length == 1) {
176 JS_ASSERT(!(*utf8Buffer & 0x80));
177 return *utf8Buffer;
178 }
179
180 /* from Unicode 3.1, non-shortest form is illegal */
181 static const uint32_t minucs4Table[] = { 0x80, 0x800, 0x10000 };
182
183 JS_ASSERT((*utf8Buffer & (0x100 - (1 << (7 - utf8Length)))) ==
184 (0x100 - (1 << (8 - utf8Length))));
185 uint32_t ucs4Char = *utf8Buffer++ & ((1 << (7 - utf8Length)) - 1);
186 uint32_t minucs4Char = minucs4Table[utf8Length - 2];
187 while (--utf8Length) {
188 JS_ASSERT((*utf8Buffer & 0xC0) == 0x80);
189 ucs4Char = (ucs4Char << 6) | (*utf8Buffer++ & 0x3F);
190 }
191
192 if (MOZ_UNLIKELY(ucs4Char < minucs4Char || (ucs4Char >= 0xD800 && ucs4Char <= 0xDFFF)))
193 return INVALID_UTF8;
194
195 return ucs4Char;
196 }
197
198 static void
199 ReportInvalidCharacter(JSContext *cx, uint32_t offset)
200 {
201 char buffer[10];
202 JS_snprintf(buffer, 10, "%d", offset);
203 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
204 JSMSG_MALFORMED_UTF8_CHAR, buffer);
205 }
206
207 static void
208 ReportBufferTooSmall(JSContext *cx, uint32_t dummy)
209 {
210 JS_ReportErrorNumber(cx, js_GetErrorMessage, nullptr, JSMSG_BUFFER_TOO_SMALL);
211 }
212
213 static void
214 ReportTooBigCharacter(JSContext *cx, uint32_t v)
215 {
216 char buffer[10];
217 JS_snprintf(buffer, 10, "0x%x", v + 0x10000);
218 JS_ReportErrorFlagsAndNumber(cx, JSREPORT_ERROR, js_GetErrorMessage, nullptr,
219 JSMSG_UTF8_CHAR_TOO_LARGE, buffer);
220 }
221
// Selects what InflateUTF8StringToBuffer does as it walks its input.
enum InflateUTF8Action {
    // Count output units only; on invalid input, report an error and fail.
    CountAndReportInvalids,

    // Count output units only; invalid input is counted and skipped over.
    CountAndIgnoreInvalids,

    // Write output units to the destination; invalid input becomes REPLACE_UTF8.
    Copy
};

// U+FFFD REPLACEMENT CHARACTER, written in place of invalid input when copying.
static const uint32_t REPLACE_UTF8 = 0xFFFD;
229
230 // If making changes to this algorithm, make sure to also update
231 // LossyConvertUTF8toUTF16() in dom/wifi/WifiUtils.cpp
232 template <InflateUTF8Action action>
233 static bool
234 InflateUTF8StringToBuffer(JSContext *cx, const UTF8Chars src, jschar *dst, size_t *dstlenp,
235 bool *isAsciip)
236 {
237 *isAsciip = true;
238
239 // First, count how many jschars need to be in the inflated string.
240 // |i| is the index into |src|, and |j| is the the index into |dst|.
241 size_t srclen = src.length();
242 uint32_t j = 0;
243 for (uint32_t i = 0; i < srclen; i++, j++) {
244 uint32_t v = uint32_t(src[i]);
245 if (!(v & 0x80)) {
246 // ASCII code unit. Simple copy.
247 if (action == Copy)
248 dst[j] = jschar(v);
249
250 } else {
251 // Non-ASCII code unit. Determine its length in bytes (n).
252 *isAsciip = false;
253 uint32_t n = 1;
254 while (v & (0x80 >> n))
255 n++;
256
257 #define INVALID(report, arg, n2) \
258 do { \
259 if (action == CountAndReportInvalids) { \
260 report(cx, arg); \
261 return false; \
262 } else { \
263 if (action == Copy) \
264 dst[j] = jschar(REPLACE_UTF8); \
265 else \
266 JS_ASSERT(action == CountAndIgnoreInvalids); \
267 n = n2; \
268 goto invalidMultiByteCodeUnit; \
269 } \
270 } while (0)
271
272 // Check the leading byte.
273 if (n < 2 || n > 4)
274 INVALID(ReportInvalidCharacter, i, 1);
275
276 // Check that |src| is large enough to hold an n-byte code unit.
277 if (i + n > srclen)
278 INVALID(ReportBufferTooSmall, /* dummy = */ 0, 1);
279
280 // Check the second byte. From Unicode Standard v6.2, Table 3-7
281 // Well-Formed UTF-8 Byte Sequences.
282 if ((v == 0xE0 && ((uint8_t)src[i + 1] & 0xE0) != 0xA0) || // E0 A0~BF
283 (v == 0xED && ((uint8_t)src[i + 1] & 0xE0) != 0x80) || // ED 80~9F
284 (v == 0xF0 && ((uint8_t)src[i + 1] & 0xF0) == 0x80) || // F0 90~BF
285 (v == 0xF4 && ((uint8_t)src[i + 1] & 0xF0) != 0x80)) // F4 80~8F
286 {
287 INVALID(ReportInvalidCharacter, i, 1);
288 }
289
290 // Check the continuation bytes.
291 for (uint32_t m = 1; m < n; m++)
292 if ((src[i + m] & 0xC0) != 0x80)
293 INVALID(ReportInvalidCharacter, i, m);
294
295 // Determine the code unit's length in jschars and act accordingly.
296 v = Utf8ToOneUcs4Char((uint8_t *)&src[i], n);
297 if (v < 0x10000) {
298 // The n-byte UTF8 code unit will fit in a single jschar.
299 if (action == Copy)
300 dst[j] = jschar(v);
301
302 } else {
303 v -= 0x10000;
304 if (v <= 0xFFFFF) {
305 // The n-byte UTF8 code unit will fit in two jschars.
306 if (action == Copy)
307 dst[j] = jschar((v >> 10) + 0xD800);
308 j++;
309 if (action == Copy)
310 dst[j] = jschar((v & 0x3FF) + 0xDC00);
311
312 } else {
313 // The n-byte UTF8 code unit won't fit in two jschars.
314 INVALID(ReportTooBigCharacter, v, 1);
315 }
316 }
317
318 invalidMultiByteCodeUnit:
319 // Move i to the last byte of the multi-byte code unit; the loop
320 // header will do the final i++ to move to the start of the next
321 // code unit.
322 i += n - 1;
323 }
324 }
325
326 *dstlenp = j;
327
328 return true;
329 }
330
331 typedef bool (*CountAction)(JSContext *, const UTF8Chars, jschar *, size_t *, bool *isAsciip);
332
333 static TwoByteCharsZ
334 InflateUTF8StringHelper(JSContext *cx, const UTF8Chars src, CountAction countAction, size_t *outlen)
335 {
336 *outlen = 0;
337
338 bool isAscii;
339 if (!countAction(cx, src, /* dst = */ nullptr, outlen, &isAscii))
340 return TwoByteCharsZ();
341
342 jschar *dst = cx->pod_malloc<jschar>(*outlen + 1); // +1 for NUL
343 if (!dst)
344 return TwoByteCharsZ();
345
346 if (isAscii) {
347 size_t srclen = src.length();
348 JS_ASSERT(*outlen == srclen);
349 for (uint32_t i = 0; i < srclen; i++)
350 dst[i] = jschar(src[i]);
351
352 } else {
353 JS_ALWAYS_TRUE(InflateUTF8StringToBuffer<Copy>(cx, src, dst, outlen, &isAscii));
354 }
355
356 dst[*outlen] = 0; // NUL char
357
358 return TwoByteCharsZ(dst, *outlen);
359 }
360
361 TwoByteCharsZ
362 JS::UTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
363 {
364 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndReportInvalids>,
365 outlen);
366 }
367
368 TwoByteCharsZ
369 JS::LossyUTF8CharsToNewTwoByteCharsZ(JSContext *cx, const UTF8Chars utf8, size_t *outlen)
370 {
371 return InflateUTF8StringHelper(cx, utf8, InflateUTF8StringToBuffer<CountAndIgnoreInvalids>,
372 outlen);
373 }
374

mercurial