|
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- |
|
2 * This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #ifndef _MOZILLA_GFX_SIMD_H_ |
|
7 #define _MOZILLA_GFX_SIMD_H_ |
|
8 |
|
9 /** |
|
10 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it |
|
11 * if they want access to the SSE2 functions. |
|
12 */ |
|
13 |
|
14 #ifdef SIMD_COMPILE_SSE2 |
|
15 #include <emmintrin.h> |
|
16 #endif |
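// For example, a consumer that wants the SSE2 backend would typically do
// something like the following (illustrative sketch; the exact include name
// and macro placement depend on the including file):
//
//   #define SIMD_COMPILE_SSE2
//   #include "SIMD.h"   // this header
//
// and then instantiate the templates below with __m128i / __m128 instead of
// the Scalar* types.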
|
17 |
|
18 namespace mozilla { |
|
19 namespace gfx { |
|
20 |
|
21 namespace simd { |
|
22 |
|
23 template<typename u8x16_t> |
|
24 u8x16_t Load8(const uint8_t* aSource); |
|
25 |
|
26 template<typename u8x16_t> |
|
27 u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
|
28 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p); |
|
29 |
|
30 template<typename u8x16_t> |
|
31 u8x16_t FromZero8(); |
|
32 |
|
33 template<typename i16x8_t> |
|
34 i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h); |
|
35 |
|
36 template<typename u16x8_t> |
|
37 u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h); |
|
38 |
|
39 template<typename i16x8_t> |
|
40 i16x8_t FromI16(int16_t a); |
|
41 |
|
42 template<typename u16x8_t> |
|
43 u16x8_t FromU16(uint16_t a); |
|
44 |
|
45 template<typename i32x4_t> |
|
46 i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d); |
|
47 |
|
48 template<typename i32x4_t> |
|
49 i32x4_t From32(int32_t a); |
|
50 |
|
51 template<typename f32x4_t> |
|
52 f32x4_t FromF32(float a, float b, float c, float d); |
|
53 |
|
54 template<typename f32x4_t> |
|
55 f32x4_t FromF32(float a); |
|
56 |
|
57 // All SIMD backends overload these functions for their SIMD types: |
|
58 |
|
59 #if 0 |
|
60 |
|
61 // Store 16 bytes to a 16-byte aligned address |
|
62 void Store8(uint8_t* aTarget, u8x16_t aM); |
|
63 |
|
64 // Fixed shifts |
|
65 template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM); |
|
66 template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM); |
|
67 |
|
68 i16x8_t Add16(i16x8_t aM1, i16x8_t aM2); |
|
69 i32x4_t Add32(i32x4_t aM1, i32x4_t aM2); |
|
70 i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2); |
|
71 i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2); |
|
72 u8x16_t Min8(u8x16_t aM1, u8x16_t aM2); |
|
73 u8x16_t Max8(u8x16_t aM1, u8x16_t aM2); |
|
74 i32x4_t Min32(i32x4_t aM1, i32x4_t aM2); |
|
75 i32x4_t Max32(i32x4_t aM1, i32x4_t aM2); |
|
76 |
|
77 // Truncating i16 -> i16 multiplication |
|
78 i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2); |
|
79 |
|
80 // Long multiplication i16 -> i32 |
|
81 // aFactorsA1B1 = (a1[4] b1[4]) |
|
82 // aFactorsA2B2 = (a2[4] b2[4]) |
|
83 // aProductA = a1 * a2, aProductB = b1 * b2 |
|
84 void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2, |
|
85 i32x4_t& aProductA, i32x4_t& aProductB); |
|
86 |
|
87 // Long multiplication + pairwise addition i16 -> i32 |
|
88 // See the scalar implementation for specifics. |
|
89 i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB); |
|
90 i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB); |
|
91 |
|
92 // Set all four 32-bit components to the value of the component at aIndex. |
|
93 template<int8_t aIndex> |
|
94 i32x4_t Splat32(i32x4_t aM); |
|
95 |
|
96 // Interpret the input as four 32-bit values, apply Splat32<aIndex> on them, |
|
97 // re-interpret the result as sixteen 8-bit values. |
|
98 template<int8_t aIndex> |
|
99 u8x16_t Splat32On8(u8x16_t aM); |
|
100 |
|
101 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM); |
|
102 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM); |
|
103 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM); |
|
104 |
|
105 u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2); |
|
106 u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2); |
|
107 i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2); |
|
108 i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2); |
|
109 i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2); |
|
110 |
|
111 i16x8_t UnpackLo8x8ToI16x8(u8x16_t m); |
|
112 i16x8_t UnpackHi8x8ToI16x8(u8x16_t m); |
|
113 u16x8_t UnpackLo8x8ToU16x8(u8x16_t m); |
|
114 u16x8_t UnpackHi8x8ToU16x8(u8x16_t m); |
|
115 |
|
116 i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2); |
|
117 u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2); |
|
118 u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4); |
|
119 |
|
120 i32x4_t FastDivideBy255(i32x4_t m); |
|
121 i16x8_t FastDivideBy255_16(i16x8_t m); |
|
122 |
|
123 #endif |
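// A minimal sketch of how code is written against this generic surface,
// assuming the function and type names declared above (the helper below is
// illustrative and not part of the original header; the row pointers must be
// 16-byte aligned for Load8):
//
//   template<typename u8x16_t, typename u16x8_t>
//   u8x16_t AverageRows(const uint8_t* aRow1, const uint8_t* aRow2)
//   {
//     u8x16_t row1 = Load8<u8x16_t>(aRow1);
//     u8x16_t row2 = Load8<u8x16_t>(aRow2);
//     u16x8_t lo = Add16(UnpackLo8x8ToU16x8(row1), UnpackLo8x8ToU16x8(row2));
//     u16x8_t hi = Add16(UnpackHi8x8ToU16x8(row1), UnpackHi8x8ToU16x8(row2));
//     return PackAndSaturate16To8(ShiftRight16<1>(lo), ShiftRight16<1>(hi));
//   }
//
// Instantiated with the Scalar* types this uses the scalar backend below;
// instantiated with __m128i it uses the SSE2 backend.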
|
124 |
|
125 // Scalar |
|
126 |
|
127 struct Scalaru8x16_t { |
|
128 uint8_t u8[16]; |
|
129 }; |
|
130 |
|
131 union Scalari16x8_t { |
|
132 int16_t i16[8]; |
|
133 uint16_t u16[8]; |
|
134 }; |
|
135 |
|
136 typedef Scalari16x8_t Scalaru16x8_t; |
|
137 |
|
138 struct Scalari32x4_t { |
|
139 int32_t i32[4]; |
|
140 }; |
|
141 |
|
142 struct Scalarf32x4_t { |
|
143 float f32[4]; |
|
144 }; |
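// The scalar "registers" are plain arrays with one element per lane, so for
// example (illustrative only; the operations are defined further down):
//
//   Scalaru16x8_t a = FromU16<Scalaru16x8_t>(1, 2, 3, 4, 5, 6, 7, 8);
//   Scalaru16x8_t b = FromU16<Scalaru16x8_t>(10);   // all eight lanes = 10
//   Scalaru16x8_t s = Add16(a, b);                  // s.u16[0] == 11, s.u16[7] == 18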
|
145 |
|
146 template<> |
|
147 inline Scalaru8x16_t |
|
148 Load8<Scalaru8x16_t>(const uint8_t* aSource) |
|
149 { |
|
150 return *(Scalaru8x16_t*)aSource; |
|
151 } |
|
152 |
|
153 inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) |
|
154 { |
|
155 *(Scalaru8x16_t*)aTarget = aM; |
|
156 } |
|
157 |
|
158 template<> |
|
159 inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
|
160 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
|
161 { |
|
162 Scalaru8x16_t _m; |
|
163 _m.u8[0] = a; |
|
164 _m.u8[1] = b; |
|
165 _m.u8[2] = c; |
|
166 _m.u8[3] = d; |
|
167 _m.u8[4] = e; |
|
168 _m.u8[5] = f; |
|
169 _m.u8[6] = g; |
|
170 _m.u8[7] = h; |
|
171 _m.u8[8+0] = i; |
|
172 _m.u8[8+1] = j; |
|
173 _m.u8[8+2] = k; |
|
174 _m.u8[8+3] = l; |
|
175 _m.u8[8+4] = m; |
|
176 _m.u8[8+5] = n; |
|
177 _m.u8[8+6] = o; |
|
178 _m.u8[8+7] = p; |
|
179 return _m; |
|
180 } |
|
181 |
|
182 template<> |
|
183 inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() |
|
184 { |
|
185 return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); |
|
186 } |
|
187 |
|
188 template<> |
|
189 inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h) |
|
190 { |
|
191 Scalari16x8_t m; |
|
192 m.i16[0] = a; |
|
193 m.i16[1] = b; |
|
194 m.i16[2] = c; |
|
195 m.i16[3] = d; |
|
196 m.i16[4] = e; |
|
197 m.i16[5] = f; |
|
198 m.i16[6] = g; |
|
199 m.i16[7] = h; |
|
200 return m; |
|
201 } |
|
202 |
|
203 template<> |
|
204 inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) |
|
205 { |
|
206 Scalaru16x8_t m; |
|
207 m.u16[0] = a; |
|
208 m.u16[1] = b; |
|
209 m.u16[2] = c; |
|
210 m.u16[3] = d; |
|
211 m.u16[4] = e; |
|
212 m.u16[5] = f; |
|
213 m.u16[6] = g; |
|
214 m.u16[7] = h; |
|
215 return m; |
|
216 } |
|
217 |
|
218 template<> |
|
219 inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) |
|
220 { |
|
221 return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a); |
|
222 } |
|
223 |
|
224 template<> |
|
225 inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) |
|
226 { |
|
227 return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a); |
|
228 } |
|
229 |
|
230 template<> |
|
231 inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d) |
|
232 { |
|
233 Scalari32x4_t m; |
|
234 m.i32[0] = a; |
|
235 m.i32[1] = b; |
|
236 m.i32[2] = c; |
|
237 m.i32[3] = d; |
|
238 return m; |
|
239 } |
|
240 |
|
241 template<> |
|
242 inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d) |
|
243 { |
|
244 Scalarf32x4_t m; |
|
245 m.f32[0] = a; |
|
246 m.f32[1] = b; |
|
247 m.f32[2] = c; |
|
248 m.f32[3] = d; |
|
249 return m; |
|
250 } |
|
251 |
|
252 template<> |
|
253 inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) |
|
254 { |
|
255 return FromF32<Scalarf32x4_t>(a, a, a, a); |
|
256 } |
|
257 |
|
258 template<> |
|
259 inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) |
|
260 { |
|
261 return From32<Scalari32x4_t>(a, a, a, a); |
|
262 } |
|
263 |
|
264 template<int32_t aNumberOfBits> |
|
265 inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) |
|
266 { |
|
267 return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits, |
|
268 uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits, |
|
269 uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits, |
|
270 uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits); |
|
271 } |
|
272 |
|
273 template<int32_t aNumberOfBits> |
|
274 inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) |
|
275 { |
|
276 return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits, |
|
277 aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits); |
|
278 } |
|
279 |
|
280 inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
|
281 { |
|
282 return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1], |
|
283 aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3], |
|
284 aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5], |
|
285 aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]); |
|
286 } |
|
287 |
|
288 inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
|
289 { |
|
290 return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1], |
|
291 aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]); |
|
292 } |
|
293 |
|
294 inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
|
295 { |
|
296 return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1], |
|
297 aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3], |
|
298 aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5], |
|
299 aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]); |
|
300 } |
|
301 |
|
302 inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
|
303 { |
|
304 return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1], |
|
305 aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]); |
|
306 } |
|
307 |
|
308 inline int32_t |
|
309 umin(int32_t a, int32_t b) |
|
310 { |
|
311 return a - ((a - b) & -(a > b)); |
|
312 } |
|
313 |
|
314 inline int32_t |
|
315 umax(int32_t a, int32_t b) |
|
316 { |
|
317 return a - ((a - b) & -(a < b)); |
|
318 } |
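// Both helpers rely on the branchless min/max identity: -(a > b) is all ones
// when a > b and zero otherwise, so a - ((a - b) & -(a > b)) yields b in the
// first case and a in the second. Worked example: umin(7, 3) = 7 - (7 - 3) = 3.
// Despite the "u" in the names, they operate on int32_t lane values.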
|
319 |
|
320 inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) |
|
321 { |
|
322 return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]), |
|
323 umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]), |
|
324 umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]), |
|
325 umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]), |
|
326 umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]), |
|
327 umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]), |
|
328 umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]), |
|
329 umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7])); |
|
330 } |
|
331 |
|
332 inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) |
|
333 { |
|
334 return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]), |
|
335 umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]), |
|
336 umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]), |
|
337 umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]), |
|
338 umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]), |
|
339 umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]), |
|
340 umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]), |
|
341 umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7])); |
|
342 } |
|
343 |
|
344 inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
|
345 { |
|
346 return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]), |
|
347 umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3])); |
|
348 } |
|
349 |
|
350 inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) |
|
351 { |
|
352 return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]), |
|
353 umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3])); |
|
354 } |
|
355 |
|
356 inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) |
|
357 { |
|
358 return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])), |
|
359 uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])), |
|
360 uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])), |
|
361 uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7]))); |
|
362 } |
|
363 |
|
364 inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1, |
|
365 Scalari16x8_t aFactorsA2B2, |
|
366 Scalari32x4_t& aProductA, |
|
367 Scalari32x4_t& aProductB) |
|
368 { |
|
369 aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0], |
|
370 aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1], |
|
371 aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2], |
|
372 aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]); |
|
373 aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4], |
|
374 aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5], |
|
375 aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6], |
|
376 aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]); |
|
377 } |
|
378 |
|
379 inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA, |
|
380 Scalari16x8_t aFactorsB) |
|
381 { |
|
382 return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1], |
|
383 aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3], |
|
384 aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5], |
|
385 aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]); |
|
386 } |
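// This is the scalar counterpart of SSE2's _mm_madd_epi16 (used by the SSE2
// overload below): lane k of the result is
//   aFactorsA.i16[2k] * aFactorsB.i16[2k] + aFactorsA.i16[2k+1] * aFactorsB.i16[2k+1].
// For example, with factors (1, 2, 3, 4, ...) and (5, 6, 7, 8, ...), lane 0 of
// the result is 1*5 + 2*6 == 17 and lane 1 is 3*7 + 4*8 == 53.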
|
387 |
|
388 template<int8_t aIndex> |
|
389 inline void AssertIndex() |
|
390 { |
|
391 static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3, |
|
392 "Invalid splat index"); |
|
393 } |
|
394 |
|
395 template<int8_t aIndex> |
|
396 inline Scalari32x4_t Splat32(Scalari32x4_t aM) |
|
397 { |
|
398 AssertIndex<aIndex>(); |
|
399 return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], |
|
400 aM.i32[aIndex], aM.i32[aIndex]); |
|
401 } |
|
402 |
|
403 template<int8_t i> |
|
404 inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) |
|
405 { |
|
406 AssertIndex<i>(); |
|
407 return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
|
408 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
|
409 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3], |
|
410 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]); |
|
411 } |
|
412 |
|
413 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
414 inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) |
|
415 { |
|
416 AssertIndex<i0>(); |
|
417 AssertIndex<i1>(); |
|
418 AssertIndex<i2>(); |
|
419 AssertIndex<i3>(); |
|
420 Scalari32x4_t m = aM; |
|
421 m.i32[0] = aM.i32[i3]; |
|
422 m.i32[1] = aM.i32[i2]; |
|
423 m.i32[2] = aM.i32[i1]; |
|
424 m.i32[3] = aM.i32[i0]; |
|
425 return m; |
|
426 } |
|
427 |
|
428 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
429 inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) |
|
430 { |
|
431 AssertIndex<i0>(); |
|
432 AssertIndex<i1>(); |
|
433 AssertIndex<i2>(); |
|
434 AssertIndex<i3>(); |
|
435 Scalari16x8_t m = aM; |
|
436 m.i16[0] = aM.i16[i3]; |
|
437 m.i16[1] = aM.i16[i2]; |
|
438 m.i16[2] = aM.i16[i1]; |
|
439 m.i16[3] = aM.i16[i0]; |
|
440 return m; |
|
441 } |
|
442 |
|
443 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
444 inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) |
|
445 { |
|
446 AssertIndex<i0>(); |
|
447 AssertIndex<i1>(); |
|
448 AssertIndex<i2>(); |
|
449 AssertIndex<i3>(); |
|
450 Scalari16x8_t m = aM; |
|
451 m.i16[4 + 0] = aM.i16[4 + i3]; |
|
452 m.i16[4 + 1] = aM.i16[4 + i2]; |
|
453 m.i16[4 + 2] = aM.i16[4 + i1]; |
|
454 m.i16[4 + 3] = aM.i16[4 + i0]; |
|
455 return m; |
|
456 } |
|
457 |
|
458 template<int8_t aIndexLo, int8_t aIndexHi> |
|
459 inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) |
|
460 { |
|
461 AssertIndex<aIndexLo>(); |
|
462 AssertIndex<aIndexHi>(); |
|
463 Scalaru16x8_t m; |
|
464 int16_t chosenValueLo = aM.u16[aIndexLo]; |
|
465 m.u16[0] = chosenValueLo; |
|
466 m.u16[1] = chosenValueLo; |
|
467 m.u16[2] = chosenValueLo; |
|
468 m.u16[3] = chosenValueLo; |
|
469 int16_t chosenValueHi = aM.u16[4 + aIndexHi]; |
|
470 m.u16[4] = chosenValueHi; |
|
471 m.u16[5] = chosenValueHi; |
|
472 m.u16[6] = chosenValueHi; |
|
473 m.u16[7] = chosenValueHi; |
|
474 return m; |
|
475 } |
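// Example: Splat16<1, 2>(m) broadcasts m.u16[1] into lanes 0-3 and m.u16[4 + 2]
// into lanes 4-7, matching the ShuffleLo16/ShuffleHi16-based SSE2 version below.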
|
476 |
|
477 inline Scalaru8x16_t |
|
478 InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) |
|
479 { |
|
480 return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], |
|
481 m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3], |
|
482 m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5], |
|
483 m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]); |
|
484 } |
|
485 |
|
486 inline Scalaru8x16_t |
|
487 InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) |
|
488 { |
|
489 return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1], |
|
490 m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3], |
|
491 m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5], |
|
492 m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]); |
|
493 } |
|
494 |
|
495 inline Scalaru16x8_t |
|
496 InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) |
|
497 { |
|
498 return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1], |
|
499 m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]); |
|
500 } |
|
501 |
|
502 inline Scalaru16x8_t |
|
503 InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) |
|
504 { |
|
505 return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5], |
|
506 m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]); |
|
507 } |
|
508 |
|
509 inline Scalari32x4_t |
|
510 InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) |
|
511 { |
|
512 return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]); |
|
513 } |
|
514 |
|
515 inline Scalari16x8_t |
|
516 UnpackLo8x8ToI16x8(Scalaru8x16_t aM) |
|
517 { |
|
518 Scalari16x8_t m; |
|
519 m.i16[0] = aM.u8[0]; |
|
520 m.i16[1] = aM.u8[1]; |
|
521 m.i16[2] = aM.u8[2]; |
|
522 m.i16[3] = aM.u8[3]; |
|
523 m.i16[4] = aM.u8[4]; |
|
524 m.i16[5] = aM.u8[5]; |
|
525 m.i16[6] = aM.u8[6]; |
|
526 m.i16[7] = aM.u8[7]; |
|
527 return m; |
|
528 } |
|
529 |
|
530 inline Scalari16x8_t |
|
531 UnpackHi8x8ToI16x8(Scalaru8x16_t aM) |
|
532 { |
|
533 Scalari16x8_t m; |
|
534 m.i16[0] = aM.u8[8+0]; |
|
535 m.i16[1] = aM.u8[8+1]; |
|
536 m.i16[2] = aM.u8[8+2]; |
|
537 m.i16[3] = aM.u8[8+3]; |
|
538 m.i16[4] = aM.u8[8+4]; |
|
539 m.i16[5] = aM.u8[8+5]; |
|
540 m.i16[6] = aM.u8[8+6]; |
|
541 m.i16[7] = aM.u8[8+7]; |
|
542 return m; |
|
543 } |
|
544 |
|
545 inline Scalaru16x8_t |
|
546 UnpackLo8x8ToU16x8(Scalaru8x16_t aM) |
|
547 { |
|
548 return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]), |
|
549 uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7])); |
|
550 } |
|
551 |
|
552 inline Scalaru16x8_t |
|
553 UnpackHi8x8ToU16x8(Scalaru8x16_t aM) |
|
554 { |
|
555 return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3], |
|
556 aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]); |
|
557 } |
|
558 |
|
559 template<uint8_t aNumBytes> |
|
560 inline Scalaru8x16_t |
|
561 Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678) |
|
562 { |
|
563 Scalaru8x16_t m; |
|
564 for (uint8_t i = 0; i < 16; i++) { |
|
565 uint8_t sourceByte = i + aNumBytes; |
|
566 m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16]; |
|
567 } |
|
568 return m; |
|
569 } |
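// Rotate8<N> returns the 16 bytes starting N bytes into the concatenation of
// the two inputs. Example: Rotate8<4>(a, b) yields
// (a.u8[4], ..., a.u8[15], b.u8[0], ..., b.u8[3]); Rotate8<0> returns a unchanged.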
|
570 |
|
571 template<typename T> |
|
572 inline int16_t |
|
573 SaturateTo16(T a) |
|
574 { |
|
575 return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN); |
|
576 } |
|
577 |
|
578 inline Scalari16x8_t |
|
579 PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2) |
|
580 { |
|
581 Scalari16x8_t m; |
|
582 m.i16[0] = SaturateTo16(m1.i32[0]); |
|
583 m.i16[1] = SaturateTo16(m1.i32[1]); |
|
584 m.i16[2] = SaturateTo16(m1.i32[2]); |
|
585 m.i16[3] = SaturateTo16(m1.i32[3]); |
|
586 m.i16[4] = SaturateTo16(m2.i32[0]); |
|
587 m.i16[5] = SaturateTo16(m2.i32[1]); |
|
588 m.i16[6] = SaturateTo16(m2.i32[2]); |
|
589 m.i16[7] = SaturateTo16(m2.i32[3]); |
|
590 return m; |
|
591 } |
|
592 |
|
593 template<typename T> |
|
594 inline uint16_t |
|
595 SaturateToU16(T a) |
|
596 { |
|
597 return uint16_t(umin(a & -(a >= 0), INT16_MAX)); |
|
598 } |
|
599 |
|
600 inline Scalaru16x8_t |
|
601 PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2) |
|
602 { |
|
603 Scalaru16x8_t m; |
|
604 m.u16[0] = SaturateToU16(m1.i32[0]); |
|
605 m.u16[1] = SaturateToU16(m1.i32[1]); |
|
606 m.u16[2] = SaturateToU16(m1.i32[2]); |
|
607 m.u16[3] = SaturateToU16(m1.i32[3]); |
|
608 m.u16[4] = SaturateToU16(m2.i32[0]); |
|
609 m.u16[5] = SaturateToU16(m2.i32[1]); |
|
610 m.u16[6] = SaturateToU16(m2.i32[2]); |
|
611 m.u16[7] = SaturateToU16(m2.i32[3]); |
|
612 return m; |
|
613 } |
|
614 |
|
615 template<typename T> |
|
616 inline uint8_t |
|
617 SaturateTo8(T a) |
|
618 { |
|
619 return uint8_t(umin(a & -(a >= 0), 255)); |
|
620 } |
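// The expression a & -(a >= 0) clamps negative inputs to zero without a
// branch: the mask -(a >= 0) is zero for a < 0 and all ones otherwise. umin
// then clamps the upper end, so e.g. SaturateTo8(-7) == 0 and SaturateTo8(300) == 255.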
|
621 |
|
622 inline Scalaru8x16_t |
|
623 PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4) |
|
624 { |
|
625 Scalaru8x16_t m; |
|
626 m.u8[0] = SaturateTo8(m1.i32[0]); |
|
627 m.u8[1] = SaturateTo8(m1.i32[1]); |
|
628 m.u8[2] = SaturateTo8(m1.i32[2]); |
|
629 m.u8[3] = SaturateTo8(m1.i32[3]); |
|
630 m.u8[4] = SaturateTo8(m2.i32[0]); |
|
631 m.u8[5] = SaturateTo8(m2.i32[1]); |
|
632 m.u8[6] = SaturateTo8(m2.i32[2]); |
|
633 m.u8[7] = SaturateTo8(m2.i32[3]); |
|
634 m.u8[8] = SaturateTo8(m3.i32[0]); |
|
635 m.u8[9] = SaturateTo8(m3.i32[1]); |
|
636 m.u8[10] = SaturateTo8(m3.i32[2]); |
|
637 m.u8[11] = SaturateTo8(m3.i32[3]); |
|
638 m.u8[12] = SaturateTo8(m4.i32[0]); |
|
639 m.u8[13] = SaturateTo8(m4.i32[1]); |
|
640 m.u8[14] = SaturateTo8(m4.i32[2]); |
|
641 m.u8[15] = SaturateTo8(m4.i32[3]); |
|
642 return m; |
|
643 } |
|
644 |
|
645 inline Scalaru8x16_t |
|
646 PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) |
|
647 { |
|
648 Scalaru8x16_t m; |
|
649 m.u8[0] = SaturateTo8(m1.i16[0]); |
|
650 m.u8[1] = SaturateTo8(m1.i16[1]); |
|
651 m.u8[2] = SaturateTo8(m1.i16[2]); |
|
652 m.u8[3] = SaturateTo8(m1.i16[3]); |
|
653 m.u8[4] = SaturateTo8(m1.i16[4]); |
|
654 m.u8[5] = SaturateTo8(m1.i16[5]); |
|
655 m.u8[6] = SaturateTo8(m1.i16[6]); |
|
656 m.u8[7] = SaturateTo8(m1.i16[7]); |
|
657 m.u8[8] = SaturateTo8(m2.i16[0]); |
|
658 m.u8[9] = SaturateTo8(m2.i16[1]); |
|
659 m.u8[10] = SaturateTo8(m2.i16[2]); |
|
660 m.u8[11] = SaturateTo8(m2.i16[3]); |
|
661 m.u8[12] = SaturateTo8(m2.i16[4]); |
|
662 m.u8[13] = SaturateTo8(m2.i16[5]); |
|
663 m.u8[14] = SaturateTo8(m2.i16[6]); |
|
664 m.u8[15] = SaturateTo8(m2.i16[7]); |
|
665 return m; |
|
666 } |
|
667 |
|
668 // Fast division by 255. It has the property that |
|
669 // for all 0 <= n <= 255*255, FastDivideBy255(n) == n / 255, |
|
670 // but it uses only two adds and two shifts instead of an |
|
671 // integer division (which is expensive on many processors). |
|
672 // |
|
673 // Equivalent to v / 255 over that input range. |
|
674 template<class B, class A> |
|
675 inline B FastDivideBy255(A v) |
|
676 { |
|
677 return ((v << 8) + v + 255) >> 16; |
|
678 } |
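// Worked example: v = 510 gives ((510 << 8) + 510 + 255) >> 16
// = 131325 >> 16 = 2, which equals 510 / 255. The return type B is supplied
// by the caller so the same helper feeds both the 16-bit and 32-bit vector
// wrappers below.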
|
679 |
|
680 inline Scalaru16x8_t |
|
681 FastDivideBy255_16(Scalaru16x8_t m) |
|
682 { |
|
683 return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])), |
|
684 FastDivideBy255<uint16_t>(int32_t(m.u16[1])), |
|
685 FastDivideBy255<uint16_t>(int32_t(m.u16[2])), |
|
686 FastDivideBy255<uint16_t>(int32_t(m.u16[3])), |
|
687 FastDivideBy255<uint16_t>(int32_t(m.u16[4])), |
|
688 FastDivideBy255<uint16_t>(int32_t(m.u16[5])), |
|
689 FastDivideBy255<uint16_t>(int32_t(m.u16[6])), |
|
690 FastDivideBy255<uint16_t>(int32_t(m.u16[7]))); |
|
691 } |
|
692 |
|
693 inline Scalari32x4_t |
|
694 FastDivideBy255(Scalari32x4_t m) |
|
695 { |
|
696 return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]), |
|
697 FastDivideBy255<int32_t>(m.i32[1]), |
|
698 FastDivideBy255<int32_t>(m.i32[2]), |
|
699 FastDivideBy255<int32_t>(m.i32[3])); |
|
700 } |
|
701 |
|
702 inline Scalaru8x16_t |
|
703 Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b) |
|
704 { |
|
705 return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]), |
|
706 (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]), |
|
707 (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]), |
|
708 (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]), |
|
709 (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]), |
|
710 (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]), |
|
711 (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]), |
|
712 (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]), |
|
713 (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]), |
|
714 (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]), |
|
715 (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]), |
|
716 (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]), |
|
717 (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]), |
|
718 (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]), |
|
719 (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]), |
|
720 (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7])); |
|
721 } |
|
722 |
|
723 inline Scalari32x4_t |
|
724 Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b) |
|
725 { |
|
726 return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]), |
|
727 (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]), |
|
728 (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]), |
|
729 (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3])); |
|
730 } |
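// Pick is a bitwise select: each result bit comes from b where the mask bit is
// set and from a where it is clear. With lane-wide masks (all zeros or all
// ones, e.g. produced by a comparison) it selects whole lanes:
//
//   Scalari32x4_t mask = From32<Scalari32x4_t>(0, -1, 0, -1);
//   Pick(mask, a, b);   // lanes 0 and 2 from a, lanes 1 and 3 from b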
|
731 |
|
732 inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) |
|
733 { |
|
734 return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t, |
|
735 a.f32[1] + (b.f32[1] - a.f32[1]) * t, |
|
736 a.f32[2] + (b.f32[2] - a.f32[2]) * t, |
|
737 a.f32[3] + (b.f32[3] - a.f32[3]) * t); |
|
738 } |
|
739 |
|
740 inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb) |
|
741 { |
|
742 return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb, |
|
743 a.f32[1] * wa + b.f32[1] * wb, |
|
744 a.f32[2] * wa + b.f32[2] * wb, |
|
745 a.f32[3] * wa + b.f32[3] * wb); |
|
746 } |
|
747 |
|
748 inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) |
|
749 { |
|
750 return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), |
|
751 fabs(a.f32[1]), |
|
752 fabs(a.f32[2]), |
|
753 fabs(a.f32[3])); |
|
754 } |
|
755 |
|
756 inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) |
|
757 { |
|
758 return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], |
|
759 a.f32[1] + b.f32[1], |
|
760 a.f32[2] + b.f32[2], |
|
761 a.f32[3] + b.f32[3]); |
|
762 } |
|
763 |
|
764 inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) |
|
765 { |
|
766 return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], |
|
767 a.f32[1] * b.f32[1], |
|
768 a.f32[2] * b.f32[2], |
|
769 a.f32[3] * b.f32[3]); |
|
770 } |
|
771 |
|
772 inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) |
|
773 { |
|
774 return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], |
|
775 a.f32[1] / b.f32[1], |
|
776 a.f32[2] / b.f32[2], |
|
777 a.f32[3] / b.f32[3]); |
|
778 } |
|
779 |
|
780 template<uint8_t aIndex> |
|
781 inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) |
|
782 { |
|
783 AssertIndex<aIndex>(); |
|
784 return FromF32<Scalarf32x4_t>(m.f32[aIndex], |
|
785 m.f32[aIndex], |
|
786 m.f32[aIndex], |
|
787 m.f32[aIndex]); |
|
788 } |
|
789 |
|
790 inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) |
|
791 { |
|
792 return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)), |
|
793 int32_t(floor(m.f32[1] + 0.5f)), |
|
794 int32_t(floor(m.f32[2] + 0.5f)), |
|
795 int32_t(floor(m.f32[3] + 0.5f))); |
|
796 } |
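// Note: this scalar conversion rounds half-way cases up (floor(x + 0.5)),
// while the SSE2 F32ToI32 below uses _mm_cvtps_epi32, which rounds to nearest
// even under the default MXCSR mode, so the two backends can differ on exact
// .5 inputs (e.g. 2.5 -> 3 here, 2 there).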
|
797 |
|
798 #ifdef SIMD_COMPILE_SSE2 |
|
799 |
|
800 // SSE2 |
|
801 |
|
802 template<> |
|
803 inline __m128i |
|
804 Load8<__m128i>(const uint8_t* aSource) |
|
805 { |
|
806 return _mm_load_si128((const __m128i*)aSource); |
|
807 } |
|
808 |
|
809 inline void Store8(uint8_t* aTarget, __m128i aM) |
|
810 { |
|
811 _mm_store_si128((__m128i*)aTarget, aM); |
|
812 } |
|
813 |
|
814 template<> |
|
815 inline __m128i FromZero8<__m128i>() |
|
816 { |
|
817 return _mm_setzero_si128(); |
|
818 } |
|
819 |
|
820 template<> |
|
821 inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
|
822 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) |
|
823 { |
|
824 return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g, |
|
825                       (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o); |
|
826 } |
|
827 |
|
828 template<> |
|
829 inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h) |
|
830 { |
|
831 return _mm_setr_epi16(a, b, c, d, e, f, g, h); |
|
832 } |
|
833 |
|
834 template<> |
|
835 inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) |
|
836 { |
|
837 return _mm_setr_epi16(a, b, c, d, e, f, g, h); |
|
838 } |
|
839 |
|
840 template<> |
|
841 inline __m128i FromI16<__m128i>(int16_t a) |
|
842 { |
|
843 return _mm_set1_epi16(a); |
|
844 } |
|
845 |
|
846 template<> |
|
847 inline __m128i FromU16<__m128i>(uint16_t a) |
|
848 { |
|
849 return _mm_set1_epi16((int16_t)a); |
|
850 } |
|
851 |
|
852 template<> |
|
853 inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) |
|
854 { |
|
855 return _mm_setr_epi32(a, b, c, d); |
|
856 } |
|
857 |
|
858 template<> |
|
859 inline __m128i From32<__m128i>(int32_t a) |
|
860 { |
|
861 return _mm_set1_epi32(a); |
|
862 } |
|
863 |
|
864 template<> |
|
865 inline __m128 FromF32<__m128>(float a, float b, float c, float d) |
|
866 { |
|
867 return _mm_setr_ps(a, b, c, d); |
|
868 } |
|
869 |
|
870 template<> |
|
871 inline __m128 FromF32<__m128>(float a) |
|
872 { |
|
873 return _mm_set1_ps(a); |
|
874 } |
|
875 |
|
876 template<int32_t aNumberOfBits> |
|
877 inline __m128i ShiftRight16(__m128i aM) |
|
878 { |
|
879 return _mm_srli_epi16(aM, aNumberOfBits); |
|
880 } |
|
881 |
|
882 template<int32_t aNumberOfBits> |
|
883 inline __m128i ShiftRight32(__m128i aM) |
|
884 { |
|
885 return _mm_srai_epi32(aM, aNumberOfBits); |
|
886 } |
|
887 |
|
888 inline __m128i Add16(__m128i aM1, __m128i aM2) |
|
889 { |
|
890 return _mm_add_epi16(aM1, aM2); |
|
891 } |
|
892 |
|
893 inline __m128i Add32(__m128i aM1, __m128i aM2) |
|
894 { |
|
895 return _mm_add_epi32(aM1, aM2); |
|
896 } |
|
897 |
|
898 inline __m128i Sub16(__m128i aM1, __m128i aM2) |
|
899 { |
|
900 return _mm_sub_epi16(aM1, aM2); |
|
901 } |
|
902 |
|
903 inline __m128i Sub32(__m128i aM1, __m128i aM2) |
|
904 { |
|
905 return _mm_sub_epi32(aM1, aM2); |
|
906 } |
|
907 |
|
908 inline __m128i Min8(__m128i aM1, __m128i aM2) |
|
909 { |
|
910 return _mm_min_epu8(aM1, aM2); |
|
911 } |
|
912 |
|
913 inline __m128i Max8(__m128i aM1, __m128i aM2) |
|
914 { |
|
915 return _mm_max_epu8(aM1, aM2); |
|
916 } |
|
917 |
|
918 inline __m128i Min32(__m128i aM1, __m128i aM2) |
|
919 { |
|
920 __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2); |
|
921 __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2); |
|
922 return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2)); |
|
923 } |
|
924 |
|
925 inline __m128i Max32(__m128i aM1, __m128i aM2) |
|
926 { |
|
927 __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2); |
|
928 __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1); |
|
929 return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1)); |
|
930 } |
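// SSE2 has no 32-bit integer min/max instruction (those arrived with SSE4.1),
// so Min32/Max32 are built from a compare mask using the same branchless
// identity as the scalar umin/umax helpers, applied per 32-bit lane.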
|
931 |
|
932 inline __m128i Mul16(__m128i aM1, __m128i aM2) |
|
933 { |
|
934 return _mm_mullo_epi16(aM1, aM2); |
|
935 } |
|
936 |
|
937 inline __m128i MulU16(__m128i aM1, __m128i aM2) |
|
938 { |
|
939 return _mm_mullo_epi16(aM1, aM2); |
|
940 } |
|
941 |
|
942 inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1, |
|
943 __m128i aFactorsA2B2, |
|
944 __m128i& aProductA, |
|
945 __m128i& aProductB) |
|
946 { |
|
947 __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2); |
|
948 __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2); |
|
949 aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi); |
|
950 aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi); |
|
951 } |
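// _mm_mullo_epi16 and _mm_mulhi_epi16 produce the low and high 16 bits of each
// 16x16-bit product; interleaving them with unpacklo/unpackhi reassembles the
// eight full 32-bit products, with lanes 0-3 in aProductA and lanes 4-7 in
// aProductB, matching the scalar version above.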
|
952 |
|
953 inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA, |
|
954 __m128i aFactorsB) |
|
955 { |
|
956 return _mm_madd_epi16(aFactorsA, aFactorsB); |
|
957 } |
|
958 |
|
959 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
960 inline __m128i Shuffle32(__m128i aM) |
|
961 { |
|
962 AssertIndex<i0>(); |
|
963 AssertIndex<i1>(); |
|
964 AssertIndex<i2>(); |
|
965 AssertIndex<i3>(); |
|
966 return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
|
967 } |
|
968 |
|
969 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
970 inline __m128i ShuffleLo16(__m128i aM) |
|
971 { |
|
972 AssertIndex<i0>(); |
|
973 AssertIndex<i1>(); |
|
974 AssertIndex<i2>(); |
|
975 AssertIndex<i3>(); |
|
976 return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
|
977 } |
|
978 |
|
979 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> |
|
980 inline __m128i ShuffleHi16(__m128i aM) |
|
981 { |
|
982 AssertIndex<i0>(); |
|
983 AssertIndex<i1>(); |
|
984 AssertIndex<i2>(); |
|
985 AssertIndex<i3>(); |
|
986 return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3)); |
|
987 } |
|
988 |
|
989 template<int8_t aIndex> |
|
990 inline __m128i Splat32(__m128i aM) |
|
991 { |
|
992 return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM); |
|
993 } |
|
994 |
|
995 template<int8_t aIndex> |
|
996 inline __m128i Splat32On8(__m128i aM) |
|
997 { |
|
998 return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM); |
|
999 } |
|
1000 |
|
1001 template<int8_t aIndexLo, int8_t aIndexHi> |
|
1002 inline __m128i Splat16(__m128i aM) |
|
1003 { |
|
1004 AssertIndex<aIndexLo>(); |
|
1005 AssertIndex<aIndexHi>(); |
|
1006 return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>( |
|
1007 ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM)); |
|
1008 } |
|
1009 |
|
1010 inline __m128i |
|
1011 UnpackLo8x8ToI16x8(__m128i m) |
|
1012 { |
|
1013 __m128i zero = _mm_set1_epi8(0); |
|
1014 return _mm_unpacklo_epi8(m, zero); |
|
1015 } |
|
1016 |
|
1017 inline __m128i |
|
1018 UnpackHi8x8ToI16x8(__m128i m) |
|
1019 { |
|
1020 __m128i zero = _mm_set1_epi8(0); |
|
1021 return _mm_unpackhi_epi8(m, zero); |
|
1022 } |
|
1023 |
|
1024 inline __m128i |
|
1025 UnpackLo8x8ToU16x8(__m128i m) |
|
1026 { |
|
1027 __m128i zero = _mm_set1_epi8(0); |
|
1028 return _mm_unpacklo_epi8(m, zero); |
|
1029 } |
|
1030 |
|
1031 inline __m128i |
|
1032 UnpackHi8x8ToU16x8(__m128i m) |
|
1033 { |
|
1034 __m128i zero = _mm_set1_epi8(0); |
|
1035 return _mm_unpackhi_epi8(m, zero); |
|
1036 } |
|
1037 |
|
1038 inline __m128i |
|
1039 InterleaveLo8(__m128i m1, __m128i m2) |
|
1040 { |
|
1041 return _mm_unpacklo_epi8(m1, m2); |
|
1042 } |
|
1043 |
|
1044 inline __m128i |
|
1045 InterleaveHi8(__m128i m1, __m128i m2) |
|
1046 { |
|
1047 return _mm_unpackhi_epi8(m1, m2); |
|
1048 } |
|
1049 |
|
1050 inline __m128i |
|
1051 InterleaveLo16(__m128i m1, __m128i m2) |
|
1052 { |
|
1053 return _mm_unpacklo_epi16(m1, m2); |
|
1054 } |
|
1055 |
|
1056 inline __m128i |
|
1057 InterleaveHi16(__m128i m1, __m128i m2) |
|
1058 { |
|
1059 return _mm_unpackhi_epi16(m1, m2); |
|
1060 } |
|
1061 |
|
1062 inline __m128i |
|
1063 InterleaveLo32(__m128i m1, __m128i m2) |
|
1064 { |
|
1065 return _mm_unpacklo_epi32(m1, m2); |
|
1066 } |
|
1067 |
|
1068 template<uint8_t aNumBytes> |
|
1069 inline __m128i |
|
1070 Rotate8(__m128i a1234, __m128i a5678) |
|
1071 { |
|
1072 return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes)); |
|
1073 } |
|
1074 |
|
1075 inline __m128i |
|
1076 PackAndSaturate32To16(__m128i m1, __m128i m2) |
|
1077 { |
|
1078 return _mm_packs_epi32(m1, m2); |
|
1079 } |
|
1080 |
|
1081 inline __m128i |
|
1082 PackAndSaturate32ToU16(__m128i m1, __m128i m2) |
|
1083 { |
|
1084 return _mm_packs_epi32(m1, m2); |
|
1085 } |
|
1086 |
|
1087 inline __m128i |
|
1088 PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4) |
|
1089 { |
|
1090 // Pack into 8 16bit signed integers (saturating). |
|
1091 __m128i m12 = _mm_packs_epi32(m1, m2); |
|
1092 __m128i m34 = _mm_packs_epi32(m3, m4); |
|
1093 |
|
1094 // Pack into 16 8bit unsigned integers (saturating). |
|
1095 return _mm_packus_epi16(m12, m34); |
|
1096 } |
|
1097 |
|
1098 inline __m128i |
|
1099 PackAndSaturate16To8(__m128i m1, __m128i m2) |
|
1100 { |
|
1101 // Pack into 16 8bit unsigned integers (saturating). |
|
1102 return _mm_packus_epi16(m1, m2); |
|
1103 } |
|
1104 |
|
1105 inline __m128i |
|
1106 FastDivideBy255(__m128i m) |
|
1107 { |
|
1108 // v = m << 8 |
|
1109 __m128i v = _mm_slli_epi32(m, 8); |
|
1110 // v = v + (m + (255,255,255,255)) |
|
1111 v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255))); |
|
1112 // v = v >> 16 |
|
1113 return _mm_srai_epi32(v, 16); |
|
1114 } |
|
1115 |
|
1116 inline __m128i |
|
1117 FastDivideBy255_16(__m128i m) |
|
1118 { |
|
1119 __m128i zero = _mm_set1_epi16(0); |
|
1120 __m128i lo = _mm_unpacklo_epi16(m, zero); |
|
1121 __m128i hi = _mm_unpackhi_epi16(m, zero); |
|
1122 return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi)); |
|
1123 } |
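// The 16-bit variant widens each u16 lane to 32 bits (zeros in the upper
// half), reuses the 32-bit FastDivideBy255 above, and packs back down. The
// signed saturation of _mm_packs_epi32 is harmless here because, for inputs in
// the documented 0..255*255 range, every quotient fits in 0..255.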
|
1124 |
|
1125 inline __m128i |
|
1126 Pick(__m128i mask, __m128i a, __m128i b) |
|
1127 { |
|
1128 return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b)); |
|
1129 } |
|
1130 |
|
1131 inline __m128 MixF32(__m128 a, __m128 b, float t) |
|
1132 { |
|
1133 return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t))); |
|
1134 } |
|
1135 |
|
1136 inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) |
|
1137 { |
|
1138 return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb))); |
|
1139 } |
|
1140 |
|
1141 inline __m128 AbsF32(__m128 a) |
|
1142 { |
|
1143 return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a); |
|
1144 } |
|
1145 |
|
1146 inline __m128 AddF32(__m128 a, __m128 b) |
|
1147 { |
|
1148 return _mm_add_ps(a, b); |
|
1149 } |
|
1150 |
|
1151 inline __m128 MulF32(__m128 a, __m128 b) |
|
1152 { |
|
1153 return _mm_mul_ps(a, b); |
|
1154 } |
|
1155 |
|
1156 inline __m128 DivF32(__m128 a, __m128 b) |
|
1157 { |
|
1158 return _mm_div_ps(a, b); |
|
1159 } |
|
1160 |
|
1161 template<uint8_t aIndex> |
|
1162 inline __m128 SplatF32(__m128 m) |
|
1163 { |
|
1164 AssertIndex<aIndex>(); |
|
1165 return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex)); |
|
1166 } |
|
1167 |
|
1168 inline __m128i F32ToI32(__m128 m) |
|
1169 { |
|
1170 return _mm_cvtps_epi32(m); |
|
1171 } |
|
1172 |
|
1173 #endif // SIMD_COMPILE_SSE2 |
|
1174 |
|
1175 } // namespace simd |
|
1176 |
|
1177 } // namespace gfx |
|
1178 } // namespace mozilla |
|
1179 |
|
1180 #endif // _MOZILLA_GFX_SIMD_H_ |