|
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
|
4 |
|
// This file should only be compiled if you're on x86 or x86_64. Additionally,
// you'll need to compile this file with -msse2 if you're using gcc.
|
7 |
|
8 #include <emmintrin.h> |
|
9 #include "nscore.h" |
|
10 |
|
11 namespace mozilla { |
|
12 namespace SSE2 { |
|
13 |
|
14 void |
|
15 Convert_ascii_run(const char *&src, |
|
16 char16_t *&dst, |
|
17 int32_t len) |
|
18 { |
|
19 if (len > 15) { |
|
20 __m128i in, out1, out2; |
|
21 __m128d *outp1, *outp2; |
|
22 __m128i zeroes; |
|
23 uint32_t offset; |
|
24 |
|
25 // align input to 16 bytes |
|
26 while ((NS_PTR_TO_UINT32(src) & 15) && len > 0) { |
|
27 if (*src & 0x80U) |
|
28 return; |
|
29 *dst++ = (char16_t) *src++; |
|
30 len--; |
|
31 } |
|
32 |
|
33 zeroes = _mm_setzero_si128(); |
|
34 |
|
35 offset = NS_PTR_TO_UINT32(dst) & 15; |
|
36 |
|
37 // Note: all these inner loops have to break, not return; we need |
|
38 // to let the single-char loop below catch any leftover |
|
39 // byte-at-a-time ASCII chars, since this function must consume |
|
40 // all available ASCII chars before it returns |
|
41 |
|
42 if (offset == 0) { |
|
43 while (len > 15) { |
|
44 in = _mm_load_si128((__m128i *) src); |
|
45 if (_mm_movemask_epi8(in)) |
|
46 break; |
|
47 out1 = _mm_unpacklo_epi8(in, zeroes); |
|
48 out2 = _mm_unpackhi_epi8(in, zeroes); |
|
49 _mm_stream_si128((__m128i *) dst, out1); |
|
50 _mm_stream_si128((__m128i *) (dst + 8), out2); |
|
51 dst += 16; |
|
52 src += 16; |
|
53 len -= 16; |
|
54 } |
|
55 } else if (offset == 8) { |
|
56 outp1 = (__m128d *) &out1; |
|
57 outp2 = (__m128d *) &out2; |
|
58 while (len > 15) { |
|
59 in = _mm_load_si128((__m128i *) src); |
|
60 if (_mm_movemask_epi8(in)) |
|
61 break; |
|
62 out1 = _mm_unpacklo_epi8(in, zeroes); |
|
63 out2 = _mm_unpackhi_epi8(in, zeroes); |
|
64 _mm_storel_epi64((__m128i *) dst, out1); |
|
65 _mm_storel_epi64((__m128i *) (dst + 8), out2); |
|
66 _mm_storeh_pd((double *) (dst + 4), *outp1); |
|
67 _mm_storeh_pd((double *) (dst + 12), *outp2); |
|
68 src += 16; |
|
69 dst += 16; |
|
70 len -= 16; |
|
71 } |
|
72 } else { |
|
73 while (len > 15) { |
|
74 in = _mm_load_si128((__m128i *) src); |
|
75 if (_mm_movemask_epi8(in)) |
|
76 break; |
|
77 out1 = _mm_unpacklo_epi8(in, zeroes); |
|
78 out2 = _mm_unpackhi_epi8(in, zeroes); |
|
79 _mm_storeu_si128((__m128i *) dst, out1); |
|
80 _mm_storeu_si128((__m128i *) (dst + 8), out2); |
|
81 src += 16; |
|
82 dst += 16; |
|
83 len -= 16; |
|
84 } |
|
85 } |
|
86 } |
|
87 |
|
88 // finish off a byte at a time |
|
89 |
|
90 while (len-- > 0 && (*src & 0x80U) == 0) { |
|
91 *dst++ = (char16_t) *src++; |
|
92 } |
|
93 } |
|
94 |
|
95 } // namespace SSE2 |
|
96 } // namespace mozilla |