|
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* vim: set ts=2 et sw=2 tw=80: */ |
|
3 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
4 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
6 |
|
7 #include "nsUCSupport.h" |
|
8 #include "nsUTF8ToUnicode.h" |
|
9 #include "mozilla/SSE.h" |
|
10 #include "nsCharTraits.h" |
|
11 #include <algorithm> |
|
12 |
|
13 #define UNICODE_BYTE_ORDER_MARK 0xFEFF |
|
14 |
|
15 static char16_t* EmitSurrogatePair(uint32_t ucs4, char16_t* aDest) |
|
16 { |
|
17 NS_ASSERTION(ucs4 > 0xFFFF, "Should be a supplementary character"); |
|
18 ucs4 -= 0x00010000; |
|
19 *aDest++ = 0xD800 | (0x000003FF & (ucs4 >> 10)); |
|
20 *aDest++ = 0xDC00 | (0x000003FF & ucs4); |
|
21 return aDest; |
|
22 } |
|
23 |
|
24 //---------------------------------------------------------------------- |
|
25 // Class nsUTF8ToUnicode [implementation] |
|
26 |
|
27 nsUTF8ToUnicode::nsUTF8ToUnicode() |
|
28 : nsBasicDecoderSupport() |
|
29 { |
|
30 Reset(); |
|
31 } |
|
32 |
|
33 //---------------------------------------------------------------------- |
|
34 // Subclassing of nsTableDecoderSupport class [implementation] |
|
35 |
|
36 /** |
|
37 * Normally the maximum length of the output of the UTF8 decoder in UTF16 |
|
38 * code units is the same as the length of the input in UTF8 code units, |
|
39 * since 1-byte, 2-byte and 3-byte UTF-8 sequences decode to a single |
|
40 * UTF-16 character, and 4-byte UTF-8 sequences decode to a surrogate pair. |
|
41 * |
|
42 * However, there is an edge case where the output can be longer than the |
|
43 * input: if the previous buffer ended with an incomplete multi-byte |
|
44 * sequence and this buffer does not begin with a valid continuation |
|
45 * byte, we will return NS_ERROR_ILLEGAL_INPUT and the caller may insert a |
|
46 * replacement character in the output buffer which corresponds to no |
|
47 * character in the input buffer. So in the worst case the destination |
|
48 * will need to be one code unit longer than the source. |
|
49 * See bug 301797. |
|
50 */ |
|
51 NS_IMETHODIMP nsUTF8ToUnicode::GetMaxLength(const char * aSrc, |
|
52 int32_t aSrcLength, |
|
53 int32_t * aDestLength) |
|
54 { |
|
55 *aDestLength = aSrcLength + 1; |
|
56 return NS_OK; |
|
57 } |
|
58 |
|
59 |
|
60 //---------------------------------------------------------------------- |
|
61 // Subclassing of nsBasicDecoderSupport class [implementation] |
|
62 |
|
63 NS_IMETHODIMP nsUTF8ToUnicode::Reset() |
|
64 { |
|
65 |
|
66 mUcs4 = 0; // cached Unicode character |
|
67 mState = 0; // cached expected number of octets after the current octet |
|
68 // until the beginning of the next UTF8 character sequence |
|
69 mBytes = 1; // cached expected number of octets in the current sequence |
|
70 mFirst = true; |
|
71 |
|
72 return NS_OK; |
|
73 |
|
74 } |
|
75 |
|
76 //---------------------------------------------------------------------- |
|
77 // Subclassing of nsBasicDecoderSupport class [implementation] |
|
78 |
|
79 // Fast ASCII -> UTF16 inner loop implementations |
|
80 // |
|
81 // Convert_ascii_run will update src and dst to the new values, and |
|
82 // len must be the maximum number of ASCII chars that it would be valid |
|
83 // to take from src and place into dst. (That is, the minimum of the |
|
84 // number of bytes left in src and the number of unichars available in |
|
85 // dst.) |
|
86 |
|
87 #if defined(__arm__) || defined(_M_ARM) |
|
88 |
|
89 // on ARM, do extra work to avoid byte/halfword reads/writes by |
|
90 // reading/writing a word at a time for as long as we can |
|
91 static inline void |
|
92 Convert_ascii_run (const char *&src, |
|
93 char16_t *&dst, |
|
94 int32_t len) |
|
95 { |
|
96 const uint32_t *src32; |
|
97 uint32_t *dst32; |
|
98 |
|
99 // with some alignments, we'd never actually break out of the slow loop, so |
|
100 // check and do the faster slow loop |
|
101 if ((((NS_PTR_TO_UINT32(dst) & 3) == 0) && ((NS_PTR_TO_UINT32(src) & 1) == 0)) || |
|
102 (((NS_PTR_TO_UINT32(dst) & 3) == 2) && ((NS_PTR_TO_UINT32(src) & 1) == 1))) |
|
103 { |
|
104 while (((NS_PTR_TO_UINT32(src) & 3) || |
|
105 (NS_PTR_TO_UINT32(dst) & 3)) && |
|
106 len > 0) |
|
107 { |
|
108 if (*src & 0x80U) |
|
109 return; |
|
110 *dst++ = (char16_t) *src++; |
|
111 len--; |
|
112 } |
|
113 } else { |
|
114 goto finish; |
|
115 } |
|
116 |
|
117 // then go 4 bytes at a time |
|
118 src32 = (const uint32_t*) src; |
|
119 dst32 = (uint32_t*) dst; |
|
120 |
|
121 while (len > 4) { |
|
122 uint32_t in = *src32++; |
|
123 |
|
124 if (in & 0x80808080U) { |
|
125 src32--; |
|
126 break; |
|
127 } |
|
128 |
|
129 *dst32++ = ((in & 0x000000ff) >> 0) | ((in & 0x0000ff00) << 8); |
|
130 *dst32++ = ((in & 0x00ff0000) >> 16) | ((in & 0xff000000) >> 8); |
|
131 |
|
132 len -= 4; |
|
133 } |
|
134 |
|
135 src = (const char *) src32; |
|
136 dst = (char16_t *) dst32; |
|
137 |
|
138 finish: |
|
139 while (len-- > 0 && (*src & 0x80U) == 0) { |
|
140 *dst++ = (char16_t) *src++; |
|
141 } |
|
142 } |
|
143 |
|
144 #else |
|
145 |
|
#ifdef MOZILLA_MAY_SUPPORT_SSE2
// SSE2 implementation; only declared here, the definition lives elsewhere.
namespace mozilla {
namespace SSE2 {

void Convert_ascii_run(const char *&src, char16_t *&dst, int32_t len);

} // namespace SSE2
} // namespace mozilla
#endif

// Generic implementation.
//
// Copies leading ASCII octets from src to dst (one char16_t per octet),
// stopping at the first octet with the high bit set or after len octets.
// src and dst are advanced past everything that was copied. When SSE2 support
// may be compiled in and is reported at run time, the whole job is handed to
// the SSE2 implementation instead.
static inline void
Convert_ascii_run(const char *&src, char16_t *&dst, int32_t len)
{
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  if (mozilla::supports_sse2()) {
    mozilla::SSE2::Convert_ascii_run(src, dst, len);
    return;
  }
#endif

  for (int32_t remaining = len; remaining > 0; --remaining) {
    if ((*src & 0x80U) != 0) {
      break;
    }
    *dst++ = static_cast<char16_t>(*src++);
  }
}
|
172 |
|
173 #endif |
|
174 |
|
/**
 * Decodes UTF-8 octets from aSrc into UTF-16 code units in aDest.
 *
 * The decoder state (mUcs4, mState, mBytes, mFirst) is read at entry and
 * written back at exit, so a multi-octet sequence may be split across two
 * calls. A U+FEFF decoded while mFirst is still true is not written to the
 * output.
 *
 * @param aSrc        source octets.
 * @param aSrcLength  in: number of octets available at aSrc. out: distance
 *                    from aSrc to the stop position. Two of the error paths
 *                    step the position back by one octet before stopping, so
 *                    in signal mode this can be -1 when the first octet of
 *                    the buffer fails to continue a sequence started in an
 *                    earlier call.
 * @param aDest       destination buffer.
 * @param aDestLength in: capacity of aDest in code units. out: number of code
 *                    units written.
 * @return NS_OK when all input was consumed and no sequence is pending;
 *         NS_OK_UDEC_MOREOUTPUT when the output buffer filled up first;
 *         NS_OK_UDEC_MOREINPUT when the input ended inside a multi-octet
 *         sequence; NS_ERROR_ILLEGAL_INPUT on malformed input when
 *         mErrBehavior == kOnError_Signal (otherwise UCS2_REPLACEMENT_CHAR is
 *         written and decoding continues).
 */
175 NS_IMETHODIMP nsUTF8ToUnicode::Convert(const char * aSrc, |
|
176 int32_t * aSrcLength, |
|
177 char16_t * aDest, |
|
178 int32_t * aDestLength) |
|
179 { |
|
180 uint32_t aSrcLen = (uint32_t) (*aSrcLength); |
|
181 uint32_t aDestLen = (uint32_t) (*aDestLength); |
|
182 |
|
183 const char *in, *inend; |
|
184 inend = aSrc + aSrcLen; |
|
185 |
|
186 char16_t *out, *outend; |
|
187 outend = aDest + aDestLen; |
|
188 |
|
189 nsresult res = NS_OK; // conversion result |
|
190 |
|
191 out = aDest; |
|
// mState == 0xFF is the marker stored further down when a decoded
// supplementary character did not fit into the output buffer. The names used
// in this block are the members: the shadowing locals are declared later.
192 if (mState == 0xFF) { |
|
193 // Emit supplementary character left over from previous iteration. It is |
|
194 // caller's responsibility to keep a sufficient buffer. |
|
195 if (aDestLen < 2) { |
|
196 *aSrcLength = *aDestLength = 0; |
|
197 return NS_OK_UDEC_MOREOUTPUT; |
|
198 } |
|
199 out = EmitSurrogatePair(mUcs4, out); |
|
200 mUcs4 = 0; |
|
201 mState = 0; |
|
202 mBytes = 1; |
|
203 mFirst = false; |
|
204 } |
|
205 |
|
206 // alias these locally for speed |
|
// From here on the unqualified names refer to these locals, which shadow the
// members; they are copied back to the members just before returning.
207 int32_t mUcs4 = this->mUcs4; |
|
208 uint8_t mState = this->mState; |
|
209 uint8_t mBytes = this->mBytes; |
|
210 bool mFirst = this->mFirst; |
|
211 |
|
212 // Set mFirst to false now so we don't have to every time through the ASCII |
|
213 // branch within the loop. |
|
214 if (mFirst && aSrcLen && (0 == (0x80 & (*aSrc)))) |
|
215 mFirst = false; |
|
216 |
|
217 for (in = aSrc; ((in < inend) && (out < outend)); ++in) { |
|
218 uint8_t c = *in; |
|
219 if (0 == mState) { |
|
220 // When mState is zero we expect either a US-ASCII character or a |
|
221 // multi-octet sequence. |
|
222 if (c < 0x80) { // 00..7F |
|
// Convert the whole ASCII run at once, bounded by the space left in both
// buffers; Convert_ascii_run advances in and out itself.
223 int32_t max_loops = std::min(inend - in, outend - out); |
|
224 Convert_ascii_run(in, out, max_loops); |
|
225 --in; // match the rest of the cases |
|
226 mBytes = 1; |
|
227 } else if (c < 0xC2) { // 80..C1 |
|
228 // Continuation octet without a lead octet (80..BF), or lead octet of an overlong 2 octet sequence (C0/C1) |
|
229 if (mErrBehavior == kOnError_Signal) { |
|
230 res = NS_ERROR_ILLEGAL_INPUT; |
|
231 break; |
|
232 } |
|
233 *out++ = UCS2_REPLACEMENT_CHAR; |
|
234 mFirst = false; |
|
235 } else if (c < 0xE0) { // C2..DF |
|
236 // First octet of 2 octet sequence |
|
237 mUcs4 = c; |
|
238 mUcs4 = (mUcs4 & 0x1F) << 6; |
|
239 mState = 1; |
|
240 mBytes = 2; |
|
241 } else if (c < 0xF0) { // E0..EF |
|
242 // First octet of 3 octet sequence |
|
243 mUcs4 = c; |
|
244 mUcs4 = (mUcs4 & 0x0F) << 12; |
|
245 mState = 2; |
|
246 mBytes = 3; |
|
247 } else if (c < 0xF5) { // F0..F4 |
|
248 // First octet of 4 octet sequence |
|
249 mUcs4 = c; |
|
250 mUcs4 = (mUcs4 & 0x07) << 18; |
|
251 mState = 3; |
|
252 mBytes = 4; |
|
253 } else { // F5..FF |
|
254 /* Current octet is neither in the US-ASCII range nor a legal first |
|
255 * octet of a multi-octet sequence. |
|
256 */ |
|
257 if (mErrBehavior == kOnError_Signal) { |
|
258 /* Return an error condition. Caller is responsible for flushing and |
|
259 * refilling the buffer and resetting state. |
|
260 */ |
|
261 res = NS_ERROR_ILLEGAL_INPUT; |
|
262 break; |
|
263 } |
|
264 *out++ = UCS2_REPLACEMENT_CHAR; |
|
265 mFirst = false; |
|
266 } |
|
267 } else { |
|
268 // When mState is non-zero, we expect a continuation of the multi-octet |
|
269 // sequence |
|
270 if (0x80 == (0xC0 & c)) { |
|
271 if (mState > 1) { |
|
272 // If we are here, all possibilities are: |
|
273 // mState == 2 && mBytes == 3 || |
|
274 // mState == 2 && mBytes == 4 || |
|
275 // mState == 3 && mBytes == 4 |
|
// mUcs4 still holds only the bits of the lead octet here, so comparing it
// against 0, 0xD000 and 0x100000 identifies the lead octets E0/F0, ED and F4.
276 if ((mBytes == 3 && ((!mUcs4 && c < 0xA0) || // E0 80..9F |
|
277 (mUcs4 == 0xD000 && c > 0x9F))) || // ED A0..BF |
|
278 (mState == 3 && ((!mUcs4 && c < 0x90) || // F0 80..8F |
|
279 (mUcs4 == 0x100000 && c > 0x8F)))) {// F4 90..BF |
|
280 // illegal sequences or sequences converted into illegal ranges. |
|
// Step back one octet: in replacement mode the loop's ++in then re-examines
// this octet with mState == 0; in signal mode the reported source length
// stops one octet before it.
281 in--; |
|
282 if (mErrBehavior == kOnError_Signal) { |
|
283 res = NS_ERROR_ILLEGAL_INPUT; |
|
284 break; |
|
285 } |
|
286 *out++ = UCS2_REPLACEMENT_CHAR; |
|
287 mState = 0; |
|
288 mFirst = false; |
|
289 continue; |
|
290 } |
|
291 } |
|
292 |
|
293 // Legal continuation. |
|
// Each continuation octet contributes 6 bits; mState counts the octets still
// expected, so the current one lands (mState - 1) * 6 bits up.
294 uint32_t shift = (mState - 1) * 6; |
|
295 uint32_t tmp = c; |
|
296 tmp = (tmp & 0x0000003FL) << shift; |
|
297 mUcs4 |= tmp; |
|
298 |
|
299 if (0 == --mState) { |
|
300 /* End of the multi-octet sequence. mUcs4 now contains the final |
|
301 * Unicode codepoint to be output |
|
302 */ |
|
303 |
|
304 if (mUcs4 > 0xFFFF) { |
|
305 // mUcs4 is in the range 0x10000 - 0x10FFFF. Output a UTF-16 pair |
|
306 if (out + 2 > outend) { |
|
307 // insufficient space left in the buffer. Keep mUcs4 for the |
|
308 // next iteration. |
|
// 0xFF is the marker tested at the top of this function on the next call.
// The ++in counts the current octet as consumed, since break skips the
// loop's own increment.
309 mState = 0xFF; |
|
310 ++in; |
|
311 res = NS_OK_UDEC_MOREOUTPUT; |
|
312 break; |
|
313 } |
|
314 out = EmitSurrogatePair(mUcs4, out); |
|
315 } else if (UNICODE_BYTE_ORDER_MARK != mUcs4 || !mFirst) { |
|
316 // Don't output the BOM only if it is the first character |
|
317 *out++ = mUcs4; |
|
318 } |
|
319 //initialize UTF8 cache |
|
320 mUcs4 = 0; |
|
321 mState = 0; |
|
322 mBytes = 1; |
|
323 mFirst = false; |
|
324 } |
|
325 } else { |
|
326 /* (((0xC0 & c) != 0x80) && (mState != 0)) |
|
327 * |
|
328 * Incomplete multi-octet sequence. Unconsume this |
|
329 * octet and return an error condition. Caller is responsible |
|
330 * for flushing and refilling the buffer and resetting state. |
|
331 */ |
|
332 in--; |
|
333 if (mErrBehavior == kOnError_Signal) { |
|
334 res = NS_ERROR_ILLEGAL_INPUT; |
|
335 break; |
|
336 } |
|
337 *out++ = UCS2_REPLACEMENT_CHAR; |
|
338 mState = 0; |
|
339 mFirst = false; |
|
340 } |
|
341 } |
|
342 } |
|
343 |
|
344 // output not finished, output buffer too short |
|
345 if ((NS_OK == res) && (in < inend) && (out >= outend)) |
|
346 res = NS_OK_UDEC_MOREOUTPUT; |
|
347 |
|
348 // last UCS4 is incomplete, make sure the caller |
|
349 // returns with properly aligned continuation of the buffer |
|
350 if ((NS_OK == res) && (mState != 0)) |
|
351 res = NS_OK_UDEC_MOREINPUT; |
|
352 |
|
353 *aSrcLength = in - aSrc; |
|
354 *aDestLength = out - aDest; |
|
355 |
|
// Write the local working copies back to the members for the next call.
356 this->mUcs4 = mUcs4; |
|
357 this->mState = mState; |
|
358 this->mBytes = mBytes; |
|
359 this->mFirst = mFirst; |
|
360 |
|
361 return(res); |
|
362 } |