--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/2d/SIMD.h Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,1180 @@
/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef _MOZILLA_GFX_SIMD_H_
#define _MOZILLA_GFX_SIMD_H_

/**
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions.
 */

#include <math.h>
#include <stdint.h>

#ifdef SIMD_COMPILE_SSE2
#include <emmintrin.h>
#endif

namespace mozilla {
namespace gfx {

namespace simd {

template<typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

template<typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);

template<typename u8x16_t>
u8x16_t FromZero8();

template<typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);

template<typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);

template<typename i16x8_t>
i16x8_t FromI16(int16_t a);

template<typename u16x8_t>
u16x8_t FromU16(uint16_t a);

template<typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

template<typename i32x4_t>
i32x4_t From32(int32_t a);

template<typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

template<typename f32x4_t>
f32x4_t FromF32(float a);

// All SIMD backends overload these functions for their SIMD types:

#if 0

// Store 16 bytes to a 16-byte aligned address
void Store8(uint8_t* aTarget, u8x16_t aM);

// Fixed shifts
template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);

i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);

// Truncating i16 -> i16 multiplication
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);

// Long multiplication i16 -> i32
// aFactorsA1B1 = (a1[4] b1[4])
// aFactorsA2B2 = (a2[4] b2[4])
// aProductA = a1 * a2, aProductB = b1 * b2
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
                         i32x4_t& aProductA, i32x4_t& aProductB);

// Long multiplication + pairwise addition i16 -> i32
// See the scalar implementation for specifics.
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);

// Set all four 32-bit components to the value of the component at aIndex.
template<int8_t aIndex>
i32x4_t Splat32(i32x4_t aM);

// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
// re-interpret the result as sixteen 8-bit values.
template<int8_t aIndex>
u8x16_t Splat32On8(u8x16_t aM);

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);

u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);

i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);

i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);

i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);

#endif

// Scalar

struct Scalaru8x16_t {
  uint8_t u8[16];
};

union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};

template<>
inline Scalaru8x16_t
Load8<Scalaru8x16_t>(const uint8_t* aSource)
{
  return *(Scalaru8x16_t*)aSource;
}

inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
{
  *(Scalaru8x16_t*)aTarget = aM;
}

template<>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                                          uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
{
  Scalaru8x16_t _m;
  _m.u8[0] = a;
  _m.u8[1] = b;
  _m.u8[2] = c;
  _m.u8[3] = d;
  _m.u8[4] = e;
  _m.u8[5] = f;
  _m.u8[6] = g;
  _m.u8[7] = h;
  _m.u8[8+0] = i;
  _m.u8[8+1] = j;
  _m.u8[8+2] = k;
  _m.u8[8+3] = l;
  _m.u8[8+4] = m;
  _m.u8[8+5] = n;
  _m.u8[8+6] = o;
  _m.u8[8+7] = p;
  return _m;
}

template<>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
{
  return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
{
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
{
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
{
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
{
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
{
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
{
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
{
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
{
  return From32<Scalari32x4_t>(a, a, a, a);
}

template<int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
{
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
}

template<int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
{
  return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
                               aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}

inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
                                aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
                                aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
                                aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
}

inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
                               aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
}

inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
                                aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
                                aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
                                aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
}

inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
                               aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
}

inline int32_t
umin(int32_t a, int32_t b)
{
  return a - ((a - b) & -(a > b));
}

inline int32_t
umax(int32_t a, int32_t b)
{
  return a - ((a - b) & -(a < b));
}
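
// Illustration (not part of the interface): the two helpers above select
// between a and b without branching. For example, umin(3, 7): a - b = -4 and
// -(a > b) = 0, so nothing is subtracted and 3 is returned; umin(7, 3):
// a - b = 4 and -(a > b) = -1 (all bits set), so the full difference is
// subtracted and 3 is returned. Note that (a - b) must not overflow, which
// holds for the small values this header feeds into umin/umax. The SSE2
// Min32/Max32 further down implement the same masking idea with
// _mm_cmpgt_epi32/_mm_and_si128/_mm_sub_epi32.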

inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
{
  return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
                              umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
                              umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
                              umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
                              umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
                              umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
                              umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
                              umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
}

inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
{
  return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
                              umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
                              umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
                              umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
                              umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
                              umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
                              umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
                              umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
}

inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
                               umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
}

inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
                               umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
}

inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
                                uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
                                uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
                                uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
}

inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB)
{
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB)
{
  return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
                               aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
                               aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
                               aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
}
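
// Illustration (not part of the interface): MulAdd16x8x2To32x4 multiplies
// corresponding 16-bit lanes and then adds adjacent products, i.e.
// result[k] = A[2k] * B[2k] + A[2k+1] * B[2k+1] for k = 0..3. One way a
// consumer might use it is to blend two pixels with 16-bit weights; the
// function below is a hypothetical sketch, not something this header provides.
#if 0
inline Scalari32x4_t BlendTwoPixels(Scalari16x8_t aInterleavedChannels, // c0 of pixel A, c0 of pixel B, c1 of A, c1 of B, ...
                                    int16_t aWeightA, int16_t aWeightB)
{
  Scalari16x8_t weights =
    FromI16<Scalari16x8_t>(aWeightA, aWeightB, aWeightA, aWeightB,
                           aWeightA, aWeightB, aWeightA, aWeightB);
  // Per channel: result = channelOfA * weightA + channelOfB * weightB.
  return MulAdd16x8x2To32x4(aInterleavedChannels, weights);
}
#endif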

template<int8_t aIndex>
inline void AssertIndex()
{
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

template<int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM)
{
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex], aM.i32[aIndex]);
}

template<int8_t i>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
{
  AssertIndex<i>();
  return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}

template<int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
{
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  int16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}

inline Scalaru8x16_t
InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
{
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
                              m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
                              m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
                              m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
}

inline Scalaru8x16_t
InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
{
  return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
                              m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
                              m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
                              m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
}

inline Scalaru16x8_t
InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
{
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
}

inline Scalaru16x8_t
InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
{
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
}

inline Scalari32x4_t
InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
{
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
}

inline Scalari16x8_t
UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
{
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

inline Scalari16x8_t
UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
{
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8+0];
  m.i16[1] = aM.u8[8+1];
  m.i16[2] = aM.u8[8+2];
  m.i16[3] = aM.u8[8+3];
  m.i16[4] = aM.u8[8+4];
  m.i16[5] = aM.u8[8+5];
  m.i16[6] = aM.u8[8+6];
  m.i16[7] = aM.u8[8+7];
  return m;
}

inline Scalaru16x8_t
UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
{
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

inline Scalaru16x8_t
UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
{
  return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
                                aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
}

template<uint8_t aNumBytes>
inline Scalaru8x16_t
Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
{
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}

template<typename T>
inline int16_t
SaturateTo16(T a)
{
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
}

inline Scalari16x8_t
PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
{
  Scalari16x8_t m;
  m.i16[0] = SaturateTo16(m1.i32[0]);
  m.i16[1] = SaturateTo16(m1.i32[1]);
  m.i16[2] = SaturateTo16(m1.i32[2]);
  m.i16[3] = SaturateTo16(m1.i32[3]);
  m.i16[4] = SaturateTo16(m2.i32[0]);
  m.i16[5] = SaturateTo16(m2.i32[1]);
  m.i16[6] = SaturateTo16(m2.i32[2]);
  m.i16[7] = SaturateTo16(m2.i32[3]);
  return m;
}

template<typename T>
inline uint16_t
SaturateToU16(T a)
{
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}

inline Scalaru16x8_t
PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
{
  Scalaru16x8_t m;
  m.u16[0] = SaturateToU16(m1.i32[0]);
  m.u16[1] = SaturateToU16(m1.i32[1]);
  m.u16[2] = SaturateToU16(m1.i32[2]);
  m.u16[3] = SaturateToU16(m1.i32[3]);
  m.u16[4] = SaturateToU16(m2.i32[0]);
  m.u16[5] = SaturateToU16(m2.i32[1]);
  m.u16[6] = SaturateToU16(m2.i32[2]);
  m.u16[7] = SaturateToU16(m2.i32[3]);
  return m;
}

template<typename T>
inline uint8_t
SaturateTo8(T a)
{
  return uint8_t(umin(a & -(a >= 0), 255));
}

inline Scalaru8x16_t
PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
{
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i32[0]);
  m.u8[1] = SaturateTo8(m1.i32[1]);
  m.u8[2] = SaturateTo8(m1.i32[2]);
  m.u8[3] = SaturateTo8(m1.i32[3]);
  m.u8[4] = SaturateTo8(m2.i32[0]);
  m.u8[5] = SaturateTo8(m2.i32[1]);
  m.u8[6] = SaturateTo8(m2.i32[2]);
  m.u8[7] = SaturateTo8(m2.i32[3]);
  m.u8[8] = SaturateTo8(m3.i32[0]);
  m.u8[9] = SaturateTo8(m3.i32[1]);
  m.u8[10] = SaturateTo8(m3.i32[2]);
  m.u8[11] = SaturateTo8(m3.i32[3]);
  m.u8[12] = SaturateTo8(m4.i32[0]);
  m.u8[13] = SaturateTo8(m4.i32[1]);
  m.u8[14] = SaturateTo8(m4.i32[2]);
  m.u8[15] = SaturateTo8(m4.i32[3]);
  return m;
}

inline Scalaru8x16_t
PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
{
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i16[0]);
  m.u8[1] = SaturateTo8(m1.i16[1]);
  m.u8[2] = SaturateTo8(m1.i16[2]);
  m.u8[3] = SaturateTo8(m1.i16[3]);
  m.u8[4] = SaturateTo8(m1.i16[4]);
  m.u8[5] = SaturateTo8(m1.i16[5]);
  m.u8[6] = SaturateTo8(m1.i16[6]);
  m.u8[7] = SaturateTo8(m1.i16[7]);
  m.u8[8] = SaturateTo8(m2.i16[0]);
  m.u8[9] = SaturateTo8(m2.i16[1]);
  m.u8[10] = SaturateTo8(m2.i16[2]);
  m.u8[11] = SaturateTo8(m2.i16[3]);
  m.u8[12] = SaturateTo8(m2.i16[4]);
  m.u8[13] = SaturateTo8(m2.i16[5]);
  m.u8[14] = SaturateTo8(m2.i16[6]);
  m.u8[15] = SaturateTo8(m2.i16[7]);
  return m;
}

// Fast approximate division by 255. It has the property that
// for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255.
// But it only uses two adds and two shifts instead of an
// integer division (which is expensive on many processors).
//
// equivalent to v/255
template<class B, class A>
inline B FastDivideBy255(A v)
{
  return ((v << 8) + v + 255) >> 16;
}
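
// Illustration (not part of the interface): the claim above can be checked
// exhaustively, since the documented input range is small. A minimal sketch,
// assuming the caller reports the returned flag however it likes:
#if 0
inline bool CheckFastDivideBy255()
{
  for (uint32_t n = 0; n <= 255u * 255u; n++) {
    // ((n << 8) + n + 255) >> 16 must agree with exact integer division.
    if (FastDivideBy255<uint32_t>(n) != n / 255) {
      return false;
    }
  }
  return true;
}
#endif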

inline Scalaru16x8_t
FastDivideBy255_16(Scalaru16x8_t m)
{
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
}

inline Scalari32x4_t
FastDivideBy255(Scalari32x4_t m)
{
  return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
                               FastDivideBy255<int32_t>(m.i32[1]),
                               FastDivideBy255<int32_t>(m.i32[2]),
                               FastDivideBy255<int32_t>(m.i32[3]));
}

inline Scalaru8x16_t
Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
{
  return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
                              (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
                              (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
                              (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
                              (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
                              (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
                              (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
                              (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
                              (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
                              (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
                              (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
                              (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
                              (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
                              (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
                              (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
                              (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
}

inline Scalari32x4_t
Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
{
  return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
                               (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
                               (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
                               (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
}

inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
}

inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
                                a.f32[1] * wa + b.f32[1] * wb,
                                a.f32[2] * wa + b.f32[2] * wb,
                                a.f32[3] * wa + b.f32[3] * wb);
}

inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
{
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
                                fabs(a.f32[1]),
                                fabs(a.f32[2]),
                                fabs(a.f32[3]));
}

inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
                                a.f32[1] + b.f32[1],
                                a.f32[2] + b.f32[2],
                                a.f32[3] + b.f32[3]);
}

inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
                                a.f32[1] * b.f32[1],
                                a.f32[2] * b.f32[2],
                                a.f32[3] * b.f32[3]);
}

inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
                                a.f32[1] / b.f32[1],
                                a.f32[2] / b.f32[2],
                                a.f32[3] / b.f32[3]);
}

template<uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
{
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex],
                                m.f32[aIndex],
                                m.f32[aIndex],
                                m.f32[aIndex]);
}

inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
{
  return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)),
                               int32_t(floor(m.f32[1] + 0.5f)),
                               int32_t(floor(m.f32[2] + 0.5f)),
                               int32_t(floor(m.f32[3] + 0.5f)));
}

#ifdef SIMD_COMPILE_SSE2

// SSE2

template<>
inline __m128i
Load8<__m128i>(const uint8_t* aSource)
{
  return _mm_load_si128((const __m128i*)aSource);
}

inline void Store8(uint8_t* aTarget, __m128i aM)
{
  _mm_store_si128((__m128i*)aTarget, aM);
}

template<>
inline __m128i FromZero8<__m128i>()
{
  return _mm_setzero_si128();
}

template<>
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
{
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
                        (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
}

template<>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
{
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
{
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromI16<__m128i>(int16_t a)
{
  return _mm_set1_epi16(a);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a)
{
  return _mm_set1_epi16((int16_t)a);
}

template<>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
{
  return _mm_setr_epi32(a, b, c, d);
}

template<>
inline __m128i From32<__m128i>(int32_t a)
{
  return _mm_set1_epi32(a);
}

template<>
inline __m128 FromF32<__m128>(float a, float b, float c, float d)
{
  return _mm_setr_ps(a, b, c, d);
}

template<>
inline __m128 FromF32<__m128>(float a)
{
  return _mm_set1_ps(a);
}

template<int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM)
{
  return _mm_srli_epi16(aM, aNumberOfBits);
}

template<int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM)
{
  return _mm_srai_epi32(aM, aNumberOfBits);
}

inline __m128i Add16(__m128i aM1, __m128i aM2)
{
  return _mm_add_epi16(aM1, aM2);
}

inline __m128i Add32(__m128i aM1, __m128i aM2)
{
  return _mm_add_epi32(aM1, aM2);
}

inline __m128i Sub16(__m128i aM1, __m128i aM2)
{
  return _mm_sub_epi16(aM1, aM2);
}

inline __m128i Sub32(__m128i aM1, __m128i aM2)
{
  return _mm_sub_epi32(aM1, aM2);
}

inline __m128i Min8(__m128i aM1, __m128i aM2)
{
  return _mm_min_epu8(aM1, aM2);
}

inline __m128i Max8(__m128i aM1, __m128i aM2)
{
  return _mm_max_epu8(aM1, aM2);
}

inline __m128i Min32(__m128i aM1, __m128i aM2)
{
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
}

inline __m128i Max32(__m128i aM1, __m128i aM2)
{
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
}

inline __m128i Mul16(__m128i aM1, __m128i aM2)
{
  return _mm_mullo_epi16(aM1, aM2);
}

inline __m128i MulU16(__m128i aM1, __m128i aM2)
{
  return _mm_mullo_epi16(aM1, aM2);
}

inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
                                __m128i aFactorsA2B2,
                                __m128i& aProductA,
                                __m128i& aProductB)
{
  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
}

inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
                                  __m128i aFactorsB)
{
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t aIndex>
inline __m128i Splat32(__m128i aM)
{
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
}

template<int8_t aIndex>
inline __m128i Splat32On8(__m128i aM)
{
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
}

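// Illustration (not part of the interface): the shuffle and splat templates
// follow the _MM_SHUFFLE argument order, so the first index selects the
// highest result lane and the last index the lowest. The scalar backend makes
// this concrete:
#if 0
inline void ShuffleOrderExample()
{
  Scalari32x4_t v = From32<Scalari32x4_t>(10, 20, 30, 40);
  Scalari32x4_t same = Shuffle32<3, 2, 1, 0>(v);      // (10, 20, 30, 40): identity
  Scalari32x4_t reversed = Shuffle32<0, 1, 2, 3>(v);  // (40, 30, 20, 10): lanes reversed
  Scalari32x4_t lane2 = Splat32<2>(v);                // (30, 30, 30, 30)
  (void)same; (void)reversed; (void)lane2;
}
#endif
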
template<int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM)
{
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
    ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
}

inline __m128i
UnpackLo8x8ToI16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i
UnpackHi8x8ToI16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i
UnpackLo8x8ToU16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i
UnpackHi8x8ToU16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i
InterleaveLo8(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi8(m1, m2);
}

inline __m128i
InterleaveHi8(__m128i m1, __m128i m2)
{
  return _mm_unpackhi_epi8(m1, m2);
}

inline __m128i
InterleaveLo16(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi16(m1, m2);
}

inline __m128i
InterleaveHi16(__m128i m1, __m128i m2)
{
  return _mm_unpackhi_epi16(m1, m2);
}

inline __m128i
InterleaveLo32(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi32(m1, m2);
}

template<uint8_t aNumBytes>
inline __m128i
Rotate8(__m128i a1234, __m128i a5678)
{
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes));
}

inline __m128i
PackAndSaturate32To16(__m128i m1, __m128i m2)
{
  return _mm_packs_epi32(m1, m2);
}

inline __m128i
PackAndSaturate32ToU16(__m128i m1, __m128i m2)
{
  return _mm_packs_epi32(m1, m2);
}

inline __m128i
PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
{
  // Pack into 8 16bit signed integers (saturating).
  __m128i m12 = _mm_packs_epi32(m1, m2);
  __m128i m34 = _mm_packs_epi32(m3, m4);

  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m12, m34);
}

inline __m128i
PackAndSaturate16To8(__m128i m1, __m128i m2)
{
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}

inline __m128i
FastDivideBy255(__m128i m)
{
  // v = m << 8
  __m128i v = _mm_slli_epi32(m, 8);
  // v = v + (m + (255,255,255,255))
  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  // v = v >> 16
  return _mm_srai_epi32(v, 16);
}

inline __m128i
FastDivideBy255_16(__m128i m)
{
  __m128i zero = _mm_set1_epi16(0);
  __m128i lo = _mm_unpacklo_epi16(m, zero);
  __m128i hi = _mm_unpackhi_epi16(m, zero);
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
}

inline __m128i
Pick(__m128i mask, __m128i a, __m128i b)
{
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

inline __m128 MixF32(__m128 a, __m128 b, float t)
{
  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
}

inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
{
  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb)));
}

inline __m128 AbsF32(__m128 a)
{
  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
}

inline __m128 AddF32(__m128 a, __m128 b)
{
  return _mm_add_ps(a, b);
}

inline __m128 MulF32(__m128 a, __m128 b)
{
  return _mm_mul_ps(a, b);
}

inline __m128 DivF32(__m128 a, __m128 b)
{
  return _mm_div_ps(a, b);
}

template<uint8_t aIndex>
inline __m128 SplatF32(__m128 m)
{
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}

inline __m128i F32ToI32(__m128 m)
{
  return _mm_cvtps_epi32(m);
}

#endif // SIMD_COMPILE_SSE2

} // namespace simd

} // namespace gfx
} // namespace mozilla

#endif // _MOZILLA_GFX_SIMD_H_
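
// Usage sketch (illustrative only, not part of this header): consumers write
// their inner loops once, templated over the SIMD register types, and then
// instantiate them either with the scalar structs or, when SIMD_COMPILE_SSE2
// is defined, with __m128i. The file and function names below are
// hypothetical; buffers are assumed 16-byte aligned and a multiple of 16
// bytes long.
#if 0
// SomeFilter.cpp
#define SIMD_COMPILE_SSE2
#include "SIMD.h"

namespace mozilla {
namespace gfx {

// Halve every byte of one 16-byte block, using only the generic operations
// declared in SIMD.h.
template<typename u8x16_t, typename u16x8_t>
static void HalveSixteenBytes(const uint8_t* aSource, uint8_t* aTarget)
{
  u8x16_t in = simd::Load8<u8x16_t>(aSource);
  // Widen to 16 bits so the shift cannot mix neighbouring bytes.
  u16x8_t lo = simd::UnpackLo8x8ToU16x8(in);
  u16x8_t hi = simd::UnpackHi8x8ToU16x8(in);
  lo = simd::ShiftRight16<1>(lo);
  hi = simd::ShiftRight16<1>(hi);
  simd::Store8(aTarget, simd::PackAndSaturate16To8(lo, hi));
}

static void HalveBuffer(const uint8_t* aSource, uint8_t* aTarget, size_t aLength)
{
  for (size_t i = 0; i < aLength; i += 16) {
#ifdef SIMD_COMPILE_SSE2
    HalveSixteenBytes<__m128i, __m128i>(aSource + i, aTarget + i);
#else
    HalveSixteenBytes<simd::Scalaru8x16_t, simd::Scalaru16x8_t>(aSource + i, aTarget + i);
#endif
  }
}

} // namespace gfx
} // namespace mozilla
#endif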