Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #include "nsUTF16ToUnicode.h"
7 #include "nsCharTraits.h"
8 #include "mozilla/Endian.h"
// Decoder states kept in |mState| between calls.  The two low bits are
// combinable flags; STATE_SECOND_BYTE is deliberately the union of both, so
// a test against STATE_HALF_CODE_POINT also matches STATE_SECOND_BYTE.
enum {
  // No partial code unit is pending.
  STATE_NORMAL             = 0,
  // The first byte of a 16-bit code unit is buffered in |mOddByte|.
  STATE_HALF_CODE_POINT    = 1 << 0,
  // Set by Reset(): nothing has been decoded yet.  Entering the shared
  // converter in this state skips the first two source bytes.
  STATE_FIRST_CALL         = 1 << 1,
  // Entering the shared converter in this state skips one source byte.
  STATE_SECOND_BYTE        = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
  // A complete surrogate pair is stashed in |mOddHighSurrogate| /
  // |mOddLowSurrogate| because the output buffer could not hold both halves.
  STATE_ODD_SURROGATE_PAIR = 1 << 2
};
18 nsresult
19 nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc,
20 int32_t * aSrcLength,
21 char16_t * aDest,
22 int32_t * aDestLength,
23 bool aSwapBytes)
24 {
25 const char* src = aSrc;
26 const char* srcEnd = aSrc + *aSrcLength;
27 char16_t* dest = aDest;
28 char16_t* destEnd = aDest + *aDestLength;
29 char16_t oddHighSurrogate;
31 switch(mState) {
32 case STATE_FIRST_CALL:
33 NS_ASSERTION(*aSrcLength > 1, "buffer too short");
34 src+=2;
35 mState = STATE_NORMAL;
36 break;
38 case STATE_SECOND_BYTE:
39 NS_ASSERTION(*aSrcLength > 0, "buffer too short");
40 src++;
41 mState = STATE_NORMAL;
42 break;
44 case STATE_ODD_SURROGATE_PAIR:
45 if (*aDestLength < 2)
46 goto error;
47 else {
48 *dest++ = mOddHighSurrogate;
49 *dest++ = mOddLowSurrogate;
50 mOddHighSurrogate = mOddLowSurrogate = 0;
51 mState = STATE_NORMAL;
52 }
53 break;
55 case STATE_NORMAL:
56 case STATE_HALF_CODE_POINT:
57 default:
58 break;
59 }
61 oddHighSurrogate = mOddHighSurrogate;
63 if (src == srcEnd) {
64 *aDestLength = dest - aDest;
65 return (mState != STATE_NORMAL || oddHighSurrogate) ?
66 NS_OK_UDEC_MOREINPUT : NS_OK;
67 }
69 const char* srcEvenEnd;
71 char16_t u;
72 if (mState == STATE_HALF_CODE_POINT) {
73 if (dest == destEnd)
74 goto error;
76 // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the
77 // previous run while the 2nd byte has to come from |*src|.
78 mState = STATE_NORMAL;
79 #if MOZ_BIG_ENDIAN
80 u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte.
81 #else
82 u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte.
83 #endif
84 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
85 goto have_codepoint;
86 } else {
87 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop
88 }
90 while (src != srcEvenEnd) {
91 if (dest == destEnd)
92 goto error;
94 #if !defined(__sparc__) && !defined(__arm__)
95 u = *(const char16_t*)src;
96 #else
97 memcpy(&u, src, 2);
98 #endif
99 src += 2;
101 have_codepoint:
102 if (aSwapBytes)
103 u = u << 8 | u >> 8;
105 if (!IS_SURROGATE(u)) {
106 if (oddHighSurrogate) {
107 if (mErrBehavior == kOnError_Signal) {
108 goto error2;
109 }
110 *dest++ = UCS2_REPLACEMENT_CHAR;
111 if (dest == destEnd)
112 goto error;
113 oddHighSurrogate = 0;
114 }
115 *dest++ = u;
116 } else if (NS_IS_HIGH_SURROGATE(u)) {
117 if (oddHighSurrogate) {
118 if (mErrBehavior == kOnError_Signal) {
119 goto error2;
120 }
121 *dest++ = UCS2_REPLACEMENT_CHAR;
122 if (dest == destEnd)
123 goto error;
124 }
125 oddHighSurrogate = u;
126 }
127 else /* if (NS_IS_LOW_SURROGATE(u)) */ {
128 if (oddHighSurrogate && *aDestLength > 1) {
129 if (dest + 1 >= destEnd) {
130 mOddLowSurrogate = u;
131 mOddHighSurrogate = oddHighSurrogate;
132 mState = STATE_ODD_SURROGATE_PAIR;
133 goto error;
134 }
135 *dest++ = oddHighSurrogate;
136 *dest++ = u;
137 } else {
138 if (mErrBehavior == kOnError_Signal) {
139 goto error2;
140 }
141 *dest++ = UCS2_REPLACEMENT_CHAR;
142 }
143 oddHighSurrogate = 0;
144 }
145 }
146 if (src != srcEnd) {
147 // store the lead byte of a 16-bit unit for the next run.
148 mOddByte = *src++;
149 mState = STATE_HALF_CODE_POINT;
150 }
152 mOddHighSurrogate = oddHighSurrogate;
154 *aDestLength = dest - aDest;
155 *aSrcLength = src - aSrc;
156 return (mState != STATE_NORMAL || oddHighSurrogate) ?
157 NS_OK_UDEC_MOREINPUT : NS_OK;
159 error:
160 *aDestLength = dest - aDest;
161 *aSrcLength = src - aSrc;
162 return NS_OK_UDEC_MOREOUTPUT;
164 error2:
165 *aDestLength = dest - aDest;
166 *aSrcLength = --src - aSrc;
167 return NS_ERROR_ILLEGAL_INPUT;
168 }
170 NS_IMETHODIMP
171 nsUTF16ToUnicodeBase::Reset()
172 {
173 mState = STATE_FIRST_CALL;
174 mOddByte = 0;
175 mOddHighSurrogate = 0;
176 mOddLowSurrogate = 0;
177 return NS_OK;
178 }
180 NS_IMETHODIMP
181 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength,
182 int32_t * aDestLength)
183 {
184 // the left-over data of the previous run have to be taken into account.
185 *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2;
186 if (mOddHighSurrogate)
187 (*aDestLength)++;
188 if (mOddLowSurrogate)
189 (*aDestLength)++;
190 return NS_OK;
191 }
194 NS_IMETHODIMP
195 nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
196 char16_t * aDest, int32_t * aDestLength)
197 {
198 switch (mState) {
199 case STATE_FIRST_CALL:
200 if (*aSrcLength < 2) {
201 if (*aSrcLength < 1) {
202 *aDestLength = 0;
203 return NS_OK;
204 }
205 if (uint8_t(*aSrc) != 0xFE) {
206 mState = STATE_NORMAL;
207 break;
208 }
209 *aDestLength = 0;
210 mState = STATE_SECOND_BYTE;
211 return NS_OK_UDEC_MOREINPUT;
212 }
213 #if MOZ_LITTLE_ENDIAN
214 // on LE machines, BE BOM is 0xFFFE
215 if (0xFFFE != *((char16_t*)aSrc)) {
216 mState = STATE_NORMAL;
217 }
218 #else
219 if (0xFEFF != *((char16_t*)aSrc)) {
220 mState = STATE_NORMAL;
221 }
222 #endif
223 break;
225 case STATE_SECOND_BYTE:
226 if (*aSrcLength < 1) {
227 *aDestLength = 0;
228 return NS_OK_UDEC_MOREINPUT;
229 }
230 if (uint8_t(*aSrc) != 0xFF) {
231 mOddByte = 0xFE;
232 mState = STATE_HALF_CODE_POINT;
233 }
234 break;
235 }
237 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
238 bool(MOZ_LITTLE_ENDIAN));
239 }
241 NS_IMETHODIMP
242 nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
243 char16_t * aDest, int32_t * aDestLength)
244 {
245 switch (mState) {
246 case STATE_FIRST_CALL:
247 if (*aSrcLength < 2) {
248 if (*aSrcLength < 1) {
249 *aDestLength = 0;
250 return NS_OK;
251 }
252 if (uint8_t(*aSrc) != 0xFF) {
253 mState = STATE_NORMAL;
254 break;
255 }
256 *aDestLength = 0;
257 mState = STATE_SECOND_BYTE;
258 return NS_OK_UDEC_MOREINPUT;
259 }
260 #if MOZ_BIG_ENDIAN
261 // on BE machines, LE BOM is 0xFFFE
262 if (0xFFFE != *((char16_t*)aSrc)) {
263 mState = STATE_NORMAL;
264 }
265 #else
266 if (0xFEFF != *((char16_t*)aSrc)) {
267 mState = STATE_NORMAL;
268 }
269 #endif
270 break;
272 case STATE_SECOND_BYTE:
273 if (*aSrcLength < 1) {
274 *aDestLength = 0;
275 return NS_OK_UDEC_MOREINPUT;
276 }
277 if (uint8_t(*aSrc) != 0xFE) {
278 mOddByte = 0xFF;
279 mState = STATE_HALF_CODE_POINT;
280 }
281 break;
282 }
284 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
285 bool(MOZ_BIG_ENDIAN));
286 }
288 NS_IMETHODIMP
289 nsUTF16ToUnicode::Reset()
290 {
291 mEndian = kUnknown;
292 mFoundBOM = false;
293 return nsUTF16ToUnicodeBase::Reset();
294 }
296 NS_IMETHODIMP
297 nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength,
298 char16_t * aDest, int32_t * aDestLength)
299 {
300 if(STATE_FIRST_CALL == mState && *aSrcLength < 2)
301 {
302 nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT;
303 *aSrcLength=0;
304 *aDestLength=0;
305 return res;
306 }
307 if(STATE_FIRST_CALL == mState) // first time called
308 {
309 // check if BOM (0xFEFF) is at the beginning, remove it if found, and
310 // set mEndian accordingly.
311 if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) {
312 mEndian = kLittleEndian;
313 mFoundBOM = true;
314 }
315 else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) {
316 mEndian = kBigEndian;
317 mFoundBOM = true;
318 }
319 // BOM is not found, but we can use a simple heuristic to determine
320 // the endianness. Assume the first character is [U+0001, U+00FF].
321 // Not always valid, but it's very likely to hold for html/xml/css.
322 else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00)
323 mState = STATE_NORMAL;
324 mEndian = kBigEndian;
325 }
326 else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00)
327 mState = STATE_NORMAL;
328 mEndian = kLittleEndian;
329 }
330 else { // Neither BOM nor 'plausible' byte patterns at the beginning.
331 // Just assume it's BE (following Unicode standard)
332 // and let the garbage show up in the browser. (security concern?)
333 // (bug 246194)
334 mState = STATE_NORMAL;
335 mEndian = kBigEndian;
336 }
337 }
339 nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength,
340 #if MOZ_BIG_ENDIAN
341 (mEndian == kLittleEndian)
342 #else
343 (mEndian == kBigEndian)
344 #endif
345 );
347 // If BOM is not found and we're to return NS_OK, signal that BOM
348 // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode|
349 return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv;
350 }