|
1 /* This Source Code Form is subject to the terms of the Mozilla Public |
|
2 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
4 |
|
5 #include "nscore.h" |
|
6 #include "nsAlgorithm.h" |
|
7 #include <emmintrin.h> |
|
8 #include <nsUTF8Utils.h> |
|
9 |
|
10 void |
|
11 LossyConvertEncoding16to8::write_sse2(const char16_t* aSource, |
|
12 uint32_t aSourceLength) |
|
13 { |
|
14 char* dest = mDestination; |
|
15 |
|
16 // Align source to a 16-byte boundary. |
|
17 uint32_t i = 0; |
|
18 uint32_t alignLen = |
|
19 XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t)); |
|
20 for (; i < alignLen; i++) { |
|
21 dest[i] = static_cast<unsigned char>(aSource[i]); |
|
22 } |
|
23 |
|
24 // Walk 64 bytes (four XMM registers) at a time. |
|
25 __m128i vectmask = _mm_set1_epi16(0x00ff); |
|
26 for (; aSourceLength - i > 31; i += 32) { |
|
27 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); |
|
28 source1 = _mm_and_si128(source1, vectmask); |
|
29 |
|
30 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8)); |
|
31 source2 = _mm_and_si128(source2, vectmask); |
|
32 |
|
33 __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); |
|
34 source3 = _mm_and_si128(source3, vectmask); |
|
35 |
|
36 __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24)); |
|
37 source4 = _mm_and_si128(source4, vectmask); |
|
38 |
|
39 |
|
40 // Pack the source data. SSE2 views this as a saturating uint16_t to |
|
41 // uint8_t conversion, but since we masked off the high-order byte of every |
|
42 // uint16_t, we're really just grabbing the low-order bytes of source1 and |
|
43 // source2. |
|
44 __m128i packed1 = _mm_packus_epi16(source1, source2); |
|
45 __m128i packed2 = _mm_packus_epi16(source3, source4); |
|
46 |
|
47 // This store needs to be unaligned since there's no guarantee that the |
|
48 // alignment we did above for the source will align the destination. |
|
49 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1); |
|
50 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2); |
|
51 } |
|
52 |
|
53 // Finish up the rest. |
|
54 for (; i < aSourceLength; i++) { |
|
55 dest[i] = static_cast<unsigned char>(aSource[i]); |
|
56 } |
|
57 |
|
58 mDestination += i; |
|
59 } |
|
60 |
|
61 void |
|
62 LossyConvertEncoding8to16::write_sse2(const char* aSource, |
|
63 uint32_t aSourceLength) |
|
64 { |
|
65 char16_t *dest = mDestination; |
|
66 |
|
67 // Align source to a 16-byte boundary. We choose to align source rather than |
|
68 // dest because we'd rather have our loads than our stores be fast. You have |
|
69 // to wait for a load to complete, but you can keep on moving after issuing a |
|
70 // store. |
|
71 uint32_t i = 0; |
|
72 uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf)); |
|
73 for (; i < alignLen; i++) { |
|
74 dest[i] = static_cast<unsigned char>(aSource[i]); |
|
75 } |
|
76 |
|
77 // Walk 32 bytes (two XMM registers) at a time. |
|
78 for (; aSourceLength - i > 31; i += 32) { |
|
79 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i)); |
|
80 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16)); |
|
81 |
|
82 // Interleave 0s in with the bytes of source to create lo and hi. |
|
83 __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128()); |
|
84 __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128()); |
|
85 __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128()); |
|
86 __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128()); |
|
87 |
|
88 // store lo and hi into dest. |
|
89 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1); |
|
90 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1); |
|
91 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2); |
|
92 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2); |
|
93 } |
|
94 |
|
95 // Finish up whatever's left. |
|
96 for (; i < aSourceLength; i++) { |
|
97 dest[i] = static_cast<unsigned char>(aSource[i]); |
|
98 } |
|
99 |
|
100 mDestination += i; |
|
101 } |