|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "nsUTF16ToUnicode.h" |
|
7 #include "nsCharTraits.h" |
|
8 #include "mozilla/Endian.h" |
|
9 |
|
// Decoder states kept in |mState| between calls.
enum {
  // No partial code unit is carried over from the previous call.
  STATE_NORMAL             = 0,
  // The first byte of a 16-bit code unit is waiting in |mOddByte|.
  STATE_HALF_CODE_POINT    = 1 << 0,
  // Set by Reset(): the next input still has to be checked for a BOM.
  STATE_FIRST_CALL         = 1 << 1,
  // Only the first byte of a possible BOM has been seen so far.
  STATE_SECOND_BYTE        = STATE_FIRST_CALL | STATE_HALF_CODE_POINT,
  // A decoded surrogate pair is stashed in |mOddHighSurrogate| and
  // |mOddLowSurrogate| because the output buffer had no room for it.
  STATE_ODD_SURROGATE_PAIR = 1 << 2
};
|
17 |
|
18 nsresult |
|
19 nsUTF16ToUnicodeBase::UTF16ConvertToUnicode(const char * aSrc, |
|
20 int32_t * aSrcLength, |
|
21 char16_t * aDest, |
|
22 int32_t * aDestLength, |
|
23 bool aSwapBytes) |
|
24 { |
|
25 const char* src = aSrc; |
|
26 const char* srcEnd = aSrc + *aSrcLength; |
|
27 char16_t* dest = aDest; |
|
28 char16_t* destEnd = aDest + *aDestLength; |
|
29 char16_t oddHighSurrogate; |
|
30 |
|
31 switch(mState) { |
|
32 case STATE_FIRST_CALL: |
|
33 NS_ASSERTION(*aSrcLength > 1, "buffer too short"); |
|
34 src+=2; |
|
35 mState = STATE_NORMAL; |
|
36 break; |
|
37 |
|
38 case STATE_SECOND_BYTE: |
|
39 NS_ASSERTION(*aSrcLength > 0, "buffer too short"); |
|
40 src++; |
|
41 mState = STATE_NORMAL; |
|
42 break; |
|
43 |
|
44 case STATE_ODD_SURROGATE_PAIR: |
|
45 if (*aDestLength < 2) |
|
46 goto error; |
|
47 else { |
|
48 *dest++ = mOddHighSurrogate; |
|
49 *dest++ = mOddLowSurrogate; |
|
50 mOddHighSurrogate = mOddLowSurrogate = 0; |
|
51 mState = STATE_NORMAL; |
|
52 } |
|
53 break; |
|
54 |
|
55 case STATE_NORMAL: |
|
56 case STATE_HALF_CODE_POINT: |
|
57 default: |
|
58 break; |
|
59 } |
|
60 |
|
61 oddHighSurrogate = mOddHighSurrogate; |
|
62 |
|
63 if (src == srcEnd) { |
|
64 *aDestLength = dest - aDest; |
|
65 return (mState != STATE_NORMAL || oddHighSurrogate) ? |
|
66 NS_OK_UDEC_MOREINPUT : NS_OK; |
|
67 } |
|
68 |
|
69 const char* srcEvenEnd; |
|
70 |
|
71 char16_t u; |
|
72 if (mState == STATE_HALF_CODE_POINT) { |
|
73 if (dest == destEnd) |
|
74 goto error; |
|
75 |
|
76 // the 1st byte of a 16-bit code unit was stored in |mOddByte| in the |
|
77 // previous run while the 2nd byte has to come from |*src|. |
|
78 mState = STATE_NORMAL; |
|
79 #if MOZ_BIG_ENDIAN |
|
80 u = (mOddByte << 8) | uint8_t(*src++); // safe, we know we have at least one byte. |
|
81 #else |
|
82 u = (*src++ << 8) | mOddByte; // safe, we know we have at least one byte. |
|
83 #endif |
|
84 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop |
|
85 goto have_codepoint; |
|
86 } else { |
|
87 srcEvenEnd = src + ((srcEnd - src) & ~1); // handle even number of bytes in main loop |
|
88 } |
|
89 |
|
90 while (src != srcEvenEnd) { |
|
91 if (dest == destEnd) |
|
92 goto error; |
|
93 |
|
94 #if !defined(__sparc__) && !defined(__arm__) |
|
95 u = *(const char16_t*)src; |
|
96 #else |
|
97 memcpy(&u, src, 2); |
|
98 #endif |
|
99 src += 2; |
|
100 |
|
101 have_codepoint: |
|
102 if (aSwapBytes) |
|
103 u = u << 8 | u >> 8; |
|
104 |
|
105 if (!IS_SURROGATE(u)) { |
|
106 if (oddHighSurrogate) { |
|
107 if (mErrBehavior == kOnError_Signal) { |
|
108 goto error2; |
|
109 } |
|
110 *dest++ = UCS2_REPLACEMENT_CHAR; |
|
111 if (dest == destEnd) |
|
112 goto error; |
|
113 oddHighSurrogate = 0; |
|
114 } |
|
115 *dest++ = u; |
|
116 } else if (NS_IS_HIGH_SURROGATE(u)) { |
|
117 if (oddHighSurrogate) { |
|
118 if (mErrBehavior == kOnError_Signal) { |
|
119 goto error2; |
|
120 } |
|
121 *dest++ = UCS2_REPLACEMENT_CHAR; |
|
122 if (dest == destEnd) |
|
123 goto error; |
|
124 } |
|
125 oddHighSurrogate = u; |
|
126 } |
|
127 else /* if (NS_IS_LOW_SURROGATE(u)) */ { |
|
128 if (oddHighSurrogate && *aDestLength > 1) { |
|
129 if (dest + 1 >= destEnd) { |
|
130 mOddLowSurrogate = u; |
|
131 mOddHighSurrogate = oddHighSurrogate; |
|
132 mState = STATE_ODD_SURROGATE_PAIR; |
|
133 goto error; |
|
134 } |
|
135 *dest++ = oddHighSurrogate; |
|
136 *dest++ = u; |
|
137 } else { |
|
138 if (mErrBehavior == kOnError_Signal) { |
|
139 goto error2; |
|
140 } |
|
141 *dest++ = UCS2_REPLACEMENT_CHAR; |
|
142 } |
|
143 oddHighSurrogate = 0; |
|
144 } |
|
145 } |
|
146 if (src != srcEnd) { |
|
147 // store the lead byte of a 16-bit unit for the next run. |
|
148 mOddByte = *src++; |
|
149 mState = STATE_HALF_CODE_POINT; |
|
150 } |
|
151 |
|
152 mOddHighSurrogate = oddHighSurrogate; |
|
153 |
|
154 *aDestLength = dest - aDest; |
|
155 *aSrcLength = src - aSrc; |
|
156 return (mState != STATE_NORMAL || oddHighSurrogate) ? |
|
157 NS_OK_UDEC_MOREINPUT : NS_OK; |
|
158 |
|
159 error: |
|
160 *aDestLength = dest - aDest; |
|
161 *aSrcLength = src - aSrc; |
|
162 return NS_OK_UDEC_MOREOUTPUT; |
|
163 |
|
164 error2: |
|
165 *aDestLength = dest - aDest; |
|
166 *aSrcLength = --src - aSrc; |
|
167 return NS_ERROR_ILLEGAL_INPUT; |
|
168 } |
|
169 |
|
170 NS_IMETHODIMP |
|
171 nsUTF16ToUnicodeBase::Reset() |
|
172 { |
|
173 mState = STATE_FIRST_CALL; |
|
174 mOddByte = 0; |
|
175 mOddHighSurrogate = 0; |
|
176 mOddLowSurrogate = 0; |
|
177 return NS_OK; |
|
178 } |
|
179 |
|
180 NS_IMETHODIMP |
|
181 nsUTF16ToUnicodeBase::GetMaxLength(const char * aSrc, int32_t aSrcLength, |
|
182 int32_t * aDestLength) |
|
183 { |
|
184 // the left-over data of the previous run have to be taken into account. |
|
185 *aDestLength = (aSrcLength + ((STATE_HALF_CODE_POINT & mState) ? 1 : 0)) / 2; |
|
186 if (mOddHighSurrogate) |
|
187 (*aDestLength)++; |
|
188 if (mOddLowSurrogate) |
|
189 (*aDestLength)++; |
|
190 return NS_OK; |
|
191 } |
|
192 |
|
193 |
|
194 NS_IMETHODIMP |
|
195 nsUTF16BEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, |
|
196 char16_t * aDest, int32_t * aDestLength) |
|
197 { |
|
198 switch (mState) { |
|
199 case STATE_FIRST_CALL: |
|
200 if (*aSrcLength < 2) { |
|
201 if (*aSrcLength < 1) { |
|
202 *aDestLength = 0; |
|
203 return NS_OK; |
|
204 } |
|
205 if (uint8_t(*aSrc) != 0xFE) { |
|
206 mState = STATE_NORMAL; |
|
207 break; |
|
208 } |
|
209 *aDestLength = 0; |
|
210 mState = STATE_SECOND_BYTE; |
|
211 return NS_OK_UDEC_MOREINPUT; |
|
212 } |
|
213 #if MOZ_LITTLE_ENDIAN |
|
214 // on LE machines, BE BOM is 0xFFFE |
|
215 if (0xFFFE != *((char16_t*)aSrc)) { |
|
216 mState = STATE_NORMAL; |
|
217 } |
|
218 #else |
|
219 if (0xFEFF != *((char16_t*)aSrc)) { |
|
220 mState = STATE_NORMAL; |
|
221 } |
|
222 #endif |
|
223 break; |
|
224 |
|
225 case STATE_SECOND_BYTE: |
|
226 if (*aSrcLength < 1) { |
|
227 *aDestLength = 0; |
|
228 return NS_OK_UDEC_MOREINPUT; |
|
229 } |
|
230 if (uint8_t(*aSrc) != 0xFF) { |
|
231 mOddByte = 0xFE; |
|
232 mState = STATE_HALF_CODE_POINT; |
|
233 } |
|
234 break; |
|
235 } |
|
236 |
|
237 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, |
|
238 bool(MOZ_LITTLE_ENDIAN)); |
|
239 } |
|
240 |
|
241 NS_IMETHODIMP |
|
242 nsUTF16LEToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, |
|
243 char16_t * aDest, int32_t * aDestLength) |
|
244 { |
|
245 switch (mState) { |
|
246 case STATE_FIRST_CALL: |
|
247 if (*aSrcLength < 2) { |
|
248 if (*aSrcLength < 1) { |
|
249 *aDestLength = 0; |
|
250 return NS_OK; |
|
251 } |
|
252 if (uint8_t(*aSrc) != 0xFF) { |
|
253 mState = STATE_NORMAL; |
|
254 break; |
|
255 } |
|
256 *aDestLength = 0; |
|
257 mState = STATE_SECOND_BYTE; |
|
258 return NS_OK_UDEC_MOREINPUT; |
|
259 } |
|
260 #if MOZ_BIG_ENDIAN |
|
261 // on BE machines, LE BOM is 0xFFFE |
|
262 if (0xFFFE != *((char16_t*)aSrc)) { |
|
263 mState = STATE_NORMAL; |
|
264 } |
|
265 #else |
|
266 if (0xFEFF != *((char16_t*)aSrc)) { |
|
267 mState = STATE_NORMAL; |
|
268 } |
|
269 #endif |
|
270 break; |
|
271 |
|
272 case STATE_SECOND_BYTE: |
|
273 if (*aSrcLength < 1) { |
|
274 *aDestLength = 0; |
|
275 return NS_OK_UDEC_MOREINPUT; |
|
276 } |
|
277 if (uint8_t(*aSrc) != 0xFE) { |
|
278 mOddByte = 0xFF; |
|
279 mState = STATE_HALF_CODE_POINT; |
|
280 } |
|
281 break; |
|
282 } |
|
283 |
|
284 return UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, |
|
285 bool(MOZ_BIG_ENDIAN)); |
|
286 } |
|
287 |
|
288 NS_IMETHODIMP |
|
289 nsUTF16ToUnicode::Reset() |
|
290 { |
|
291 mEndian = kUnknown; |
|
292 mFoundBOM = false; |
|
293 return nsUTF16ToUnicodeBase::Reset(); |
|
294 } |
|
295 |
|
296 NS_IMETHODIMP |
|
297 nsUTF16ToUnicode::Convert(const char * aSrc, int32_t * aSrcLength, |
|
298 char16_t * aDest, int32_t * aDestLength) |
|
299 { |
|
300 if(STATE_FIRST_CALL == mState && *aSrcLength < 2) |
|
301 { |
|
302 nsresult res = (*aSrcLength == 0) ? NS_OK : NS_ERROR_ILLEGAL_INPUT; |
|
303 *aSrcLength=0; |
|
304 *aDestLength=0; |
|
305 return res; |
|
306 } |
|
307 if(STATE_FIRST_CALL == mState) // first time called |
|
308 { |
|
309 // check if BOM (0xFEFF) is at the beginning, remove it if found, and |
|
310 // set mEndian accordingly. |
|
311 if(0xFF == uint8_t(aSrc[0]) && 0xFE == uint8_t(aSrc[1])) { |
|
312 mEndian = kLittleEndian; |
|
313 mFoundBOM = true; |
|
314 } |
|
315 else if(0xFE == uint8_t(aSrc[0]) && 0xFF == uint8_t(aSrc[1])) { |
|
316 mEndian = kBigEndian; |
|
317 mFoundBOM = true; |
|
318 } |
|
319 // BOM is not found, but we can use a simple heuristic to determine |
|
320 // the endianness. Assume the first character is [U+0001, U+00FF]. |
|
321 // Not always valid, but it's very likely to hold for html/xml/css. |
|
322 else if(!aSrc[0] && aSrc[1]) { // 0x00 0xhh (hh != 00) |
|
323 mState = STATE_NORMAL; |
|
324 mEndian = kBigEndian; |
|
325 } |
|
326 else if(aSrc[0] && !aSrc[1]) { // 0xhh 0x00 (hh != 00) |
|
327 mState = STATE_NORMAL; |
|
328 mEndian = kLittleEndian; |
|
329 } |
|
330 else { // Neither BOM nor 'plausible' byte patterns at the beginning. |
|
331 // Just assume it's BE (following Unicode standard) |
|
332 // and let the garbage show up in the browser. (security concern?) |
|
333 // (bug 246194) |
|
334 mState = STATE_NORMAL; |
|
335 mEndian = kBigEndian; |
|
336 } |
|
337 } |
|
338 |
|
339 nsresult rv = UTF16ConvertToUnicode(aSrc, aSrcLength, aDest, aDestLength, |
|
340 #if MOZ_BIG_ENDIAN |
|
341 (mEndian == kLittleEndian) |
|
342 #else |
|
343 (mEndian == kBigEndian) |
|
344 #endif |
|
345 ); |
|
346 |
|
347 // If BOM is not found and we're to return NS_OK, signal that BOM |
|
348 // is not found. Otherwise, return |rv| from |UTF16ConvertToUnicode| |
|
349 return (rv == NS_OK && !mFoundBOM) ? NS_OK_UDEC_NOBOMFOUND : rv; |
|
350 } |