|
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
|
2 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 #ifndef nsUTF8Utils_h_ |
|
6 #define nsUTF8Utils_h_ |
|
7 |
|
8 // This file may be used in two ways: if MOZILLA_INTERNAL_API is defined, this |
|
9 // file will provide signatures for the Mozilla abstract string types. It will |
|
10 // use XPCOM assertion/debugging macros, etc. |
|
11 |
|
12 #include "nscore.h" |
|
13 #include "mozilla/SSE.h" |
|
14 |
|
15 #include "nsCharTraits.h" |
|
16 |
|
class UTF8traits
{
public:
  // Each predicate classifies a single UTF-8 code unit by looking only at
  // its high-order bits.

  // 0xxxxxxx: a complete one-byte (ASCII) character.
  static bool isASCII(char c) { return HasPrefix(c, 0x80, 0x00); }
  // 10xxxxxx: a trailing byte inside a multi-byte sequence.
  static bool isInSeq(char c) { return HasPrefix(c, 0xC0, 0x80); }
  // 110xxxxx: lead byte of a two-byte sequence.
  static bool is2byte(char c) { return HasPrefix(c, 0xE0, 0xC0); }
  // 1110xxxx: lead byte of a three-byte sequence.
  static bool is3byte(char c) { return HasPrefix(c, 0xF0, 0xE0); }
  // 11110xxx: lead byte of a four-byte sequence.
  static bool is4byte(char c) { return HasPrefix(c, 0xF8, 0xF0); }
  // 111110xx: lead byte of a five-byte sequence.
  static bool is5byte(char c) { return HasPrefix(c, 0xFC, 0xF8); }
  // 1111110x: lead byte of a six-byte sequence.
  static bool is6byte(char c) { return HasPrefix(c, 0xFE, 0xFC); }

private:
  // True when the bits of |c| selected by |mask| are exactly |pattern|.
  static bool HasPrefix(char c, unsigned mask, unsigned pattern)
  {
    return (static_cast<unsigned char>(c) & mask) == pattern;
  }
};
|
28 |
|
/**
 * Extract the next UCS-4 character from the buffer and return it. On
 * success the pointer passed in is advanced to the start of the next
 * character in the buffer; on failure it is left unchanged and 0 is
 * returned. The err parameter is set to true if an error occurred and to
 * false otherwise. An overlong sequence is not reported as an error; the
 * character it encodes is replaced by UCS2_REPLACEMENT_CHAR.
 */
|
36 |
|
37 class UTF8CharEnumerator |
|
38 { |
|
39 public: |
|
40 static uint32_t NextChar(const char **buffer, const char *end, |
|
41 bool *err) |
|
42 { |
|
43 NS_ASSERTION(buffer && *buffer, "null buffer!"); |
|
44 |
|
45 const char *p = *buffer; |
|
46 *err = false; |
|
47 |
|
48 if (p >= end) |
|
49 { |
|
50 *err = true; |
|
51 |
|
52 return 0; |
|
53 } |
|
54 |
|
55 char c = *p++; |
|
56 |
|
57 if ( UTF8traits::isASCII(c) ) |
|
58 { |
|
59 *buffer = p; |
|
60 return c; |
|
61 } |
|
62 |
|
63 uint32_t ucs4; |
|
64 uint32_t minUcs4; |
|
65 int32_t state = 0; |
|
66 |
|
67 if (!CalcState(c, ucs4, minUcs4, state)) { |
|
68 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); |
|
69 *err = true; |
|
70 |
|
71 return 0; |
|
72 } |
|
73 |
|
74 while ( state-- ) |
|
75 { |
|
76 if (p == end) |
|
77 { |
|
78 *err = true; |
|
79 |
|
80 return 0; |
|
81 } |
|
82 |
|
83 c = *p++; |
|
84 |
|
85 if (!AddByte(c, state, ucs4)) |
|
86 { |
|
87 *err = true; |
|
88 |
|
89 return 0; |
|
90 } |
|
91 } |
|
92 |
|
93 if ( ucs4 < minUcs4 ) |
|
94 { |
|
95 // Overlong sequence |
|
96 ucs4 = UCS2_REPLACEMENT_CHAR; |
|
97 } |
|
98 else if ( ucs4 >= 0xD800 && |
|
99 (ucs4 <= 0xDFFF || ucs4 >= UCS_END)) |
|
100 { |
|
101 // Surrogates and code points outside the Unicode range. |
|
102 ucs4 = UCS2_REPLACEMENT_CHAR; |
|
103 } |
|
104 |
|
105 *buffer = p; |
|
106 return ucs4; |
|
107 } |
|
108 |
|
109 private: |
|
110 static bool CalcState(char c, uint32_t& ucs4, uint32_t& minUcs4, |
|
111 int32_t& state) |
|
112 { |
|
113 if ( UTF8traits::is2byte(c) ) |
|
114 { |
|
115 ucs4 = (uint32_t(c) << 6) & 0x000007C0L; |
|
116 state = 1; |
|
117 minUcs4 = 0x00000080; |
|
118 } |
|
119 else if ( UTF8traits::is3byte(c) ) |
|
120 { |
|
121 ucs4 = (uint32_t(c) << 12) & 0x0000F000L; |
|
122 state = 2; |
|
123 minUcs4 = 0x00000800; |
|
124 } |
|
125 else if ( UTF8traits::is4byte(c) ) |
|
126 { |
|
127 ucs4 = (uint32_t(c) << 18) & 0x001F0000L; |
|
128 state = 3; |
|
129 minUcs4 = 0x00010000; |
|
130 } |
|
131 else if ( UTF8traits::is5byte(c) ) |
|
132 { |
|
133 ucs4 = (uint32_t(c) << 24) & 0x03000000L; |
|
134 state = 4; |
|
135 minUcs4 = 0x00200000; |
|
136 } |
|
137 else if ( UTF8traits::is6byte(c) ) |
|
138 { |
|
139 ucs4 = (uint32_t(c) << 30) & 0x40000000L; |
|
140 state = 5; |
|
141 minUcs4 = 0x04000000; |
|
142 } |
|
143 else |
|
144 { |
|
145 return false; |
|
146 } |
|
147 |
|
148 return true; |
|
149 } |
|
150 |
|
151 static bool AddByte(char c, int32_t state, uint32_t& ucs4) |
|
152 { |
|
153 if ( UTF8traits::isInSeq(c) ) |
|
154 { |
|
155 int32_t shift = state * 6; |
|
156 ucs4 |= (uint32_t(c) & 0x3F) << shift; |
|
157 return true; |
|
158 } |
|
159 |
|
160 return false; |
|
161 } |
|
162 }; |
|
163 |
|
164 |
|
165 /** |
|
166 * Extract the next UCS-4 character from the buffer and return it. The |
|
167 * pointer passed in is advanced to the start of the next character in the |
|
168 * buffer. If non-null, the err parameter is filled in if an error occurs. |
|
169 */ |
|
170 |
|
171 |
|
172 class UTF16CharEnumerator |
|
173 { |
|
174 public: |
|
175 static uint32_t NextChar(const char16_t **buffer, const char16_t *end, |
|
176 bool *err = nullptr) |
|
177 { |
|
178 NS_ASSERTION(buffer && *buffer, "null buffer!"); |
|
179 |
|
180 const char16_t *p = *buffer; |
|
181 |
|
182 if (p >= end) |
|
183 { |
|
184 NS_ERROR("No input to work with"); |
|
185 if (err) |
|
186 *err = true; |
|
187 |
|
188 return 0; |
|
189 } |
|
190 |
|
191 char16_t c = *p++; |
|
192 |
|
193 if (!IS_SURROGATE(c)) // U+0000 - U+D7FF,U+E000 - U+FFFF |
|
194 { |
|
195 if (err) |
|
196 *err = false; |
|
197 *buffer = p; |
|
198 return c; |
|
199 } |
|
200 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF |
|
201 { |
|
202 if (p == end) |
|
203 { |
|
204 // Found a high surrogate the end of the buffer. Flag this |
|
205 // as an error and return the Unicode replacement |
|
206 // character 0xFFFD. |
|
207 |
|
208 NS_WARNING("Unexpected end of buffer after high surrogate"); |
|
209 |
|
210 if (err) |
|
211 *err = true; |
|
212 *buffer = p; |
|
213 return 0xFFFD; |
|
214 } |
|
215 |
|
216 // D800- DBFF - High Surrogate |
|
217 char16_t h = c; |
|
218 |
|
219 c = *p++; |
|
220 |
|
221 if (NS_IS_LOW_SURROGATE(c)) |
|
222 { |
|
223 // DC00- DFFF - Low Surrogate |
|
224 // N = (H - D800) *400 + 10000 + (L - DC00) |
|
225 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); |
|
226 if (err) |
|
227 *err = false; |
|
228 *buffer = p; |
|
229 return ucs4; |
|
230 } |
|
231 else |
|
232 { |
|
233 // Found a high surrogate followed by something other than |
|
234 // a low surrogate. Flag this as an error and return the |
|
235 // Unicode replacement character 0xFFFD. Note that the |
|
236 // pointer to the next character points to the second 16-bit |
|
237 // value, not beyond it, as per Unicode 5.0.0 Chapter 3 C10, |
|
238 // only the first code unit of an illegal sequence must be |
|
239 // treated as an illegally terminated code unit sequence |
|
240 // (also Chapter 3 D91, "isolated [not paired and ill-formed] |
|
241 // UTF-16 code units in the range D800..DFFF are ill-formed"). |
|
242 NS_WARNING("got a High Surrogate but no low surrogate"); |
|
243 |
|
244 if (err) |
|
245 *err = true; |
|
246 *buffer = p - 1; |
|
247 return 0xFFFD; |
|
248 } |
|
249 } |
|
250 else // U+DC00 - U+DFFF |
|
251 { |
|
252 // DC00- DFFF - Low Surrogate |
|
253 |
|
254 // Found a low surrogate w/o a preceding high surrogate. Flag |
|
255 // this as an error and return the Unicode replacement |
|
256 // character 0xFFFD. |
|
257 |
|
258 NS_WARNING("got a low Surrogate but no high surrogate"); |
|
259 if (err) |
|
260 *err = true; |
|
261 *buffer = p; |
|
262 return 0xFFFD; |
|
263 } |
|
264 |
|
265 if (err) |
|
266 *err = true; |
|
267 return 0; |
|
268 } |
|
269 }; |
|
270 |
|
271 |
|
272 /** |
|
273 * A character sink (see |copy_string| in nsAlgorithm.h) for converting |
|
274 * UTF-8 to UTF-16 |
|
275 */ |
|
276 class ConvertUTF8toUTF16 |
|
277 { |
|
278 public: |
|
279 typedef char value_type; |
|
280 typedef char16_t buffer_type; |
|
281 |
|
282 ConvertUTF8toUTF16( buffer_type* aBuffer ) |
|
283 : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(false) {} |
|
284 |
|
285 size_t Length() const { return mBuffer - mStart; } |
|
286 |
|
287 bool ErrorEncountered() const { return mErrorEncountered; } |
|
288 |
|
289 void write( const value_type* start, uint32_t N ) |
|
290 { |
|
291 if ( mErrorEncountered ) |
|
292 return; |
|
293 |
|
294 // algorithm assumes utf8 units won't |
|
295 // be spread across fragments |
|
296 const value_type* p = start; |
|
297 const value_type* end = start + N; |
|
298 buffer_type* out = mBuffer; |
|
299 for ( ; p != end /* && *p */; ) |
|
300 { |
|
301 bool err; |
|
302 uint32_t ucs4 = UTF8CharEnumerator::NextChar(&p, end, &err); |
|
303 |
|
304 if ( err ) |
|
305 { |
|
306 mErrorEncountered = true; |
|
307 mBuffer = out; |
|
308 return; |
|
309 } |
|
310 |
|
311 if ( ucs4 >= PLANE1_BASE ) |
|
312 { |
|
313 *out++ = (buffer_type)H_SURROGATE(ucs4); |
|
314 *out++ = (buffer_type)L_SURROGATE(ucs4); |
|
315 } |
|
316 else |
|
317 { |
|
318 *out++ = ucs4; |
|
319 } |
|
320 } |
|
321 mBuffer = out; |
|
322 } |
|
323 |
|
324 void write_terminator() |
|
325 { |
|
326 *mBuffer = buffer_type(0); |
|
327 } |
|
328 |
|
329 private: |
|
330 buffer_type* const mStart; |
|
331 buffer_type* mBuffer; |
|
332 bool mErrorEncountered; |
|
333 }; |
|
334 |
|
335 /** |
|
336 * A character sink (see |copy_string| in nsAlgorithm.h) for computing |
|
337 * the length of the UTF-16 string equivalent to a UTF-8 string. |
|
338 */ |
|
339 class CalculateUTF8Length |
|
340 { |
|
341 public: |
|
342 typedef char value_type; |
|
343 |
|
344 CalculateUTF8Length() : mLength(0), mErrorEncountered(false) { } |
|
345 |
|
346 size_t Length() const { return mLength; } |
|
347 |
|
348 void write( const value_type* start, uint32_t N ) |
|
349 { |
|
350 // ignore any further requests |
|
351 if ( mErrorEncountered ) |
|
352 return; |
|
353 |
|
354 // algorithm assumes utf8 units won't |
|
355 // be spread across fragments |
|
356 const value_type* p = start; |
|
357 const value_type* end = start + N; |
|
358 for ( ; p < end /* && *p */; ++mLength ) |
|
359 { |
|
360 if ( UTF8traits::isASCII(*p) ) |
|
361 p += 1; |
|
362 else if ( UTF8traits::is2byte(*p) ) |
|
363 p += 2; |
|
364 else if ( UTF8traits::is3byte(*p) ) |
|
365 p += 3; |
|
366 else if ( UTF8traits::is4byte(*p) ) { |
|
367 // Because a UTF-8 sequence of 4 bytes represents a codepoint |
|
368 // greater than 0xFFFF, it will become a surrogate pair in the |
|
369 // UTF-16 string, so add 1 more to mLength. |
|
370 // This doesn't happen with is5byte and is6byte because they |
|
371 // are illegal UTF-8 sequences (greater than 0x10FFFF) so get |
|
372 // converted to a single replacement character. |
|
373 |
|
374 // However, there is one case when a 4 byte UTF-8 sequence will |
|
375 // only generate 2 UTF-16 bytes. If we have a properly encoded |
|
376 // sequence, but with an invalid value (too small or too big), |
|
377 // that will result in a replacement character being written |
|
378 // This replacement character is encoded as just 1 single |
|
379 // UTF-16 character, which is 2 bytes. |
|
380 |
|
381 // The below code therefore only adds 1 to mLength if the UTF8 |
|
382 // data will produce a decoded character which is greater than |
|
383 // or equal to 0x010000 and less than 0x0110000. |
|
384 |
|
385 // A 4byte UTF8 character is encoded as |
|
386 // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
|
387 // Bit 1-3 on the first byte, and bit 5-6 on the second byte, |
|
388 // map to bit 17-21 in the final result. If these bits are |
|
389 // between 0x01 and 0x11, that means that the final result is |
|
390 // between 0x010000 and 0x110000. The below code reads these |
|
391 // bits out and assigns them to c, but shifted up 4 bits to |
|
392 // avoid having to shift twice. |
|
393 |
|
394 // It doesn't matter what to do in the case where p + 4 > end |
|
395 // since no UTF16 characters will be written in that case by |
|
396 // ConvertUTF8toUTF16. Likewise it doesn't matter what we do if |
|
397 // any of the surrogate bits are wrong since no UTF16 |
|
398 // characters will be written in that case either. |
|
399 |
|
400 if (p + 4 <= end) { |
|
401 uint32_t c = ((uint32_t)(p[0] & 0x07)) << 6 | |
|
402 ((uint32_t)(p[1] & 0x30)); |
|
403 if (c >= 0x010 && c < 0x110) |
|
404 ++mLength; |
|
405 } |
|
406 |
|
407 p += 4; |
|
408 } |
|
409 else if ( UTF8traits::is5byte(*p) ) |
|
410 p += 5; |
|
411 else if ( UTF8traits::is6byte(*p) ) |
|
412 p += 6; |
|
413 else // error |
|
414 { |
|
415 ++mLength; // to account for the decrement below |
|
416 break; |
|
417 } |
|
418 } |
|
419 if ( p != end ) |
|
420 { |
|
421 NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings."); |
|
422 --mLength; // The last multi-byte char wasn't complete, discard it. |
|
423 mErrorEncountered = true; |
|
424 } |
|
425 } |
|
426 |
|
427 private: |
|
428 size_t mLength; |
|
429 bool mErrorEncountered; |
|
430 }; |
|
431 |
|
432 /** |
|
433 * A character sink (see |copy_string| in nsAlgorithm.h) for |
|
434 * converting UTF-16 to UTF-8. Treats invalid UTF-16 data as 0xFFFD |
|
435 * (0xEFBFBD in UTF-8). |
|
436 */ |
|
437 class ConvertUTF16toUTF8 |
|
438 { |
|
439 public: |
|
440 typedef char16_t value_type; |
|
441 typedef char buffer_type; |
|
442 |
|
443 // The error handling here is more lenient than that in |
|
444 // |ConvertUTF8toUTF16|, but it's that way for backwards |
|
445 // compatibility. |
|
446 |
|
447 ConvertUTF16toUTF8( buffer_type* aBuffer ) |
|
448 : mStart(aBuffer), mBuffer(aBuffer) {} |
|
449 |
|
450 size_t Size() const { return mBuffer - mStart; } |
|
451 |
|
452 void write( const value_type* start, uint32_t N ) |
|
453 { |
|
454 buffer_type *out = mBuffer; // gcc isn't smart enough to do this! |
|
455 |
|
456 for (const value_type *p = start, *end = start + N; p < end; ++p ) |
|
457 { |
|
458 value_type c = *p; |
|
459 if (! (c & 0xFF80)) // U+0000 - U+007F |
|
460 { |
|
461 *out++ = (char)c; |
|
462 } |
|
463 else if (! (c & 0xF800)) // U+0100 - U+07FF |
|
464 { |
|
465 *out++ = 0xC0 | (char)(c >> 6); |
|
466 *out++ = 0x80 | (char)(0x003F & c); |
|
467 } |
|
468 else if (!IS_SURROGATE(c)) // U+0800 - U+D7FF,U+E000 - U+FFFF |
|
469 { |
|
470 *out++ = 0xE0 | (char)(c >> 12); |
|
471 *out++ = 0x80 | (char)(0x003F & (c >> 6)); |
|
472 *out++ = 0x80 | (char)(0x003F & c ); |
|
473 } |
|
474 else if (NS_IS_HIGH_SURROGATE(c)) // U+D800 - U+DBFF |
|
475 { |
|
476 // D800- DBFF - High Surrogate |
|
477 value_type h = c; |
|
478 |
|
479 ++p; |
|
480 if (p == end) |
|
481 { |
|
482 // Treat broken characters as the Unicode |
|
483 // replacement character 0xFFFD (0xEFBFBD in |
|
484 // UTF-8) |
|
485 *out++ = '\xEF'; |
|
486 *out++ = '\xBF'; |
|
487 *out++ = '\xBD'; |
|
488 |
|
489 NS_WARNING("String ending in half a surrogate pair!"); |
|
490 |
|
491 break; |
|
492 } |
|
493 c = *p; |
|
494 |
|
495 if (NS_IS_LOW_SURROGATE(c)) |
|
496 { |
|
497 // DC00- DFFF - Low Surrogate |
|
498 // N = (H - D800) *400 + 10000 + ( L - DC00 ) |
|
499 uint32_t ucs4 = SURROGATE_TO_UCS4(h, c); |
|
500 |
|
501 // 0001 0000-001F FFFF |
|
502 *out++ = 0xF0 | (char)(ucs4 >> 18); |
|
503 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 12)); |
|
504 *out++ = 0x80 | (char)(0x003F & (ucs4 >> 6)); |
|
505 *out++ = 0x80 | (char)(0x003F & ucs4); |
|
506 } |
|
507 else |
|
508 { |
|
509 // Treat broken characters as the Unicode |
|
510 // replacement character 0xFFFD (0xEFBFBD in |
|
511 // UTF-8) |
|
512 *out++ = '\xEF'; |
|
513 *out++ = '\xBF'; |
|
514 *out++ = '\xBD'; |
|
515 |
|
516 // The pointer to the next character points to the second |
|
517 // 16-bit value, not beyond it, as per Unicode 5.0.0 |
|
518 // Chapter 3 C10, only the first code unit of an illegal |
|
519 // sequence must be treated as an illegally terminated |
|
520 // code unit sequence (also Chapter 3 D91, "isolated [not |
|
521 // paired and ill-formed] UTF-16 code units in the range |
|
522 // D800..DFFF are ill-formed"). |
|
523 p--; |
|
524 |
|
525 NS_WARNING("got a High Surrogate but no low surrogate"); |
|
526 } |
|
527 } |
|
528 else // U+DC00 - U+DFFF |
|
529 { |
|
530 // Treat broken characters as the Unicode replacement |
|
531 // character 0xFFFD (0xEFBFBD in UTF-8) |
|
532 *out++ = '\xEF'; |
|
533 *out++ = '\xBF'; |
|
534 *out++ = '\xBD'; |
|
535 |
|
536 // DC00- DFFF - Low Surrogate |
|
537 NS_WARNING("got a low Surrogate but no high surrogate"); |
|
538 } |
|
539 } |
|
540 |
|
541 mBuffer = out; |
|
542 } |
|
543 |
|
544 void write_terminator() |
|
545 { |
|
546 *mBuffer = buffer_type(0); |
|
547 } |
|
548 |
|
549 private: |
|
550 buffer_type* const mStart; |
|
551 buffer_type* mBuffer; |
|
552 }; |
|
553 |
|
554 /** |
|
555 * A character sink (see |copy_string| in nsAlgorithm.h) for computing |
|
556 * the number of bytes a UTF-16 would occupy in UTF-8. Treats invalid |
|
557 * UTF-16 data as 0xFFFD (0xEFBFBD in UTF-8). |
|
558 */ |
|
559 class CalculateUTF8Size |
|
560 { |
|
561 public: |
|
562 typedef char16_t value_type; |
|
563 |
|
564 CalculateUTF8Size() |
|
565 : mSize(0) { } |
|
566 |
|
567 size_t Size() const { return mSize; } |
|
568 |
|
569 void write( const value_type* start, uint32_t N ) |
|
570 { |
|
571 // Assume UCS2 surrogate pairs won't be spread across fragments. |
|
572 for (const value_type *p = start, *end = start + N; p < end; ++p ) |
|
573 { |
|
574 value_type c = *p; |
|
575 if (! (c & 0xFF80)) // U+0000 - U+007F |
|
576 mSize += 1; |
|
577 else if (! (c & 0xF800)) // U+0100 - U+07FF |
|
578 mSize += 2; |
|
579 else if (0xD800 != (0xF800 & c)) // U+0800 - U+D7FF,U+E000 - U+FFFF |
|
580 mSize += 3; |
|
581 else if (0xD800 == (0xFC00 & c)) // U+D800 - U+DBFF |
|
582 { |
|
583 ++p; |
|
584 if (p == end) |
|
585 { |
|
586 // Treat broken characters as the Unicode |
|
587 // replacement character 0xFFFD (0xEFBFBD in |
|
588 // UTF-8) |
|
589 mSize += 3; |
|
590 |
|
591 NS_WARNING("String ending in half a surrogate pair!"); |
|
592 |
|
593 break; |
|
594 } |
|
595 c = *p; |
|
596 |
|
597 if (0xDC00 == (0xFC00 & c)) |
|
598 mSize += 4; |
|
599 else |
|
600 { |
|
601 // Treat broken characters as the Unicode |
|
602 // replacement character 0xFFFD (0xEFBFBD in |
|
603 // UTF-8) |
|
604 mSize += 3; |
|
605 |
|
606 // The next code unit is the second 16-bit value, not |
|
607 // the one beyond it, as per Unicode 5.0.0 Chapter 3 C10, |
|
608 // only the first code unit of an illegal sequence must |
|
609 // be treated as an illegally terminated code unit |
|
610 // sequence (also Chapter 3 D91, "isolated [not paired and |
|
611 // ill-formed] UTF-16 code units in the range D800..DFFF |
|
612 // are ill-formed"). |
|
613 p--; |
|
614 |
|
615 NS_WARNING("got a high Surrogate but no low surrogate"); |
|
616 } |
|
617 } |
|
618 else // U+DC00 - U+DFFF |
|
619 { |
|
620 // Treat broken characters as the Unicode replacement |
|
621 // character 0xFFFD (0xEFBFBD in UTF-8) |
|
622 mSize += 3; |
|
623 |
|
624 NS_WARNING("got a low Surrogate but no high surrogate"); |
|
625 } |
|
626 } |
|
627 } |
|
628 |
|
629 private: |
|
630 size_t mSize; |
|
631 }; |
|
632 |
|
633 #ifdef MOZILLA_INTERNAL_API |
|
634 /** |
|
635 * A character sink that performs a |reinterpret_cast|-style conversion |
|
636 * from char to char16_t. |
|
637 */ |
|
class LossyConvertEncoding8to16
{
public:
  typedef char value_type;
  typedef char input_type;
  typedef char16_t output_type;

  // |aDestination| is where the widened characters are written. Nothing
  // in this class checks how much room it has.
  LossyConvertEncoding8to16( char16_t* aDestination ) :
    mDestination(aDestination) { }

  // Appends the |aSourceLength| bytes at |aSource| to the output, each
  // one zero-extended (taken as an unsigned char) to a char16_t. When the
  // build may support SSE2 and the CPU reports it at run time, the work
  // is handed to write_sse2() instead.
  void
  write( const char* aSource, uint32_t aSourceLength )
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    const char* done_writing = aSource + aSourceLength;
    while ( aSource < done_writing )
      *mDestination++ = (char16_t)(unsigned char)(*aSource++);
  }

  // Declared only where write() can call it, which matches how
  // LossyConvertEncoding16to8 declares its own write_sse2().
#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char* aSource, uint32_t aSourceLength );
#endif

  // Stores a zero code unit at the current end of the output without
  // advancing past it.
  void
  write_terminator()
  {
    *mDestination = (char16_t)(0);
  }

private:
  char16_t* mDestination;
};
|
676 |
|
677 /** |
|
678 * A character sink that performs a |reinterpret_cast|-style conversion |
|
679 * from char16_t to char. |
|
680 */ |
|
class LossyConvertEncoding16to8
{
public:
  typedef char16_t value_type;
  typedef char16_t input_type;
  typedef char output_type;

  // |aDestination| is where the narrowed characters are written. Nothing
  // in this class checks how much room it has.
  LossyConvertEncoding16to8( char* aDestination ) : mDestination(aDestination) { }

  // Appends the |aSourceLength| code units at |aSource| to the output,
  // each one converted to a char with a plain cast. When the build may
  // support SSE2 and the CPU reports it at run time, the work is handed
  // to write_sse2() instead.
  void
  write( const char16_t* aSource, uint32_t aSourceLength)
  {
#ifdef MOZILLA_MAY_SUPPORT_SSE2
    if (mozilla::supports_sse2())
    {
      write_sse2(aSource, aSourceLength);
      return;
    }
#endif
    for ( uint32_t index = 0; index < aSourceLength; ++index )
    {
      *mDestination = (char)(aSource[index]);
      ++mDestination;
    }
  }

#ifdef MOZILLA_MAY_SUPPORT_SSE2
  void
  write_sse2( const char16_t* aSource, uint32_t aSourceLength );
#endif

  // Stores a zero byte at the current end of the output without advancing
  // past it.
  void
  write_terminator()
  {
    *mDestination = '\0';
  }

private:
  char *mDestination;
};
|
719 #endif // MOZILLA_INTERNAL_API |
|
720 |
|
721 #endif /* !defined(nsUTF8Utils_h_) */ |