xpcom/string/src/nsUTF8UtilsSSE2.cpp

branch
TOR_BUG_9701
changeset 15
b8a032363ba2
equal deleted inserted replaced
-1:000000000000 0:7f97bf88272e
1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 #include "nscore.h"
6 #include "nsAlgorithm.h"
7 #include <emmintrin.h>
8 #include <nsUTF8Utils.h>
9
10 void
11 LossyConvertEncoding16to8::write_sse2(const char16_t* aSource,
12 uint32_t aSourceLength)
13 {
14 char* dest = mDestination;
15
16 // Align source to a 16-byte boundary.
17 uint32_t i = 0;
18 uint32_t alignLen =
19 XPCOM_MIN<uint32_t>(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf) / sizeof(char16_t));
20 for (; i < alignLen; i++) {
21 dest[i] = static_cast<unsigned char>(aSource[i]);
22 }
23
24 // Walk 64 bytes (four XMM registers) at a time.
25 __m128i vectmask = _mm_set1_epi16(0x00ff);
26 for (; aSourceLength - i > 31; i += 32) {
27 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
28 source1 = _mm_and_si128(source1, vectmask);
29
30 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 8));
31 source2 = _mm_and_si128(source2, vectmask);
32
33 __m128i source3 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
34 source3 = _mm_and_si128(source3, vectmask);
35
36 __m128i source4 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 24));
37 source4 = _mm_and_si128(source4, vectmask);
38
39
40 // Pack the source data. SSE2 views this as a saturating uint16_t to
41 // uint8_t conversion, but since we masked off the high-order byte of every
42 // uint16_t, we're really just grabbing the low-order bytes of source1 and
43 // source2.
44 __m128i packed1 = _mm_packus_epi16(source1, source2);
45 __m128i packed2 = _mm_packus_epi16(source3, source4);
46
47 // This store needs to be unaligned since there's no guarantee that the
48 // alignment we did above for the source will align the destination.
49 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), packed1);
50 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), packed2);
51 }
52
53 // Finish up the rest.
54 for (; i < aSourceLength; i++) {
55 dest[i] = static_cast<unsigned char>(aSource[i]);
56 }
57
58 mDestination += i;
59 }
60
61 void
62 LossyConvertEncoding8to16::write_sse2(const char* aSource,
63 uint32_t aSourceLength)
64 {
65 char16_t *dest = mDestination;
66
67 // Align source to a 16-byte boundary. We choose to align source rather than
68 // dest because we'd rather have our loads than our stores be fast. You have
69 // to wait for a load to complete, but you can keep on moving after issuing a
70 // store.
71 uint32_t i = 0;
72 uint32_t alignLen = XPCOM_MIN(aSourceLength, uint32_t(-NS_PTR_TO_INT32(aSource) & 0xf));
73 for (; i < alignLen; i++) {
74 dest[i] = static_cast<unsigned char>(aSource[i]);
75 }
76
77 // Walk 32 bytes (two XMM registers) at a time.
78 for (; aSourceLength - i > 31; i += 32) {
79 __m128i source1 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i));
80 __m128i source2 = _mm_load_si128(reinterpret_cast<const __m128i*>(aSource + i + 16));
81
82 // Interleave 0s in with the bytes of source to create lo and hi.
83 __m128i lo1 = _mm_unpacklo_epi8(source1, _mm_setzero_si128());
84 __m128i hi1 = _mm_unpackhi_epi8(source1, _mm_setzero_si128());
85 __m128i lo2 = _mm_unpacklo_epi8(source2, _mm_setzero_si128());
86 __m128i hi2 = _mm_unpackhi_epi8(source2, _mm_setzero_si128());
87
88 // store lo and hi into dest.
89 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i), lo1);
90 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 8), hi1);
91 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 16), lo2);
92 _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + i + 24), hi2);
93 }
94
95 // Finish up whatever's left.
96 for (; i < aSourceLength; i++) {
97 dest[i] = static_cast<unsigned char>(aSource[i]);
98 }
99
100 mDestination += i;
101 }

mercurial