gfx/2d/SIMD.h

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Tue, 06 Jan 2015 21:39:09 +0100
branch      TOR_BUG_9701
changeset   8:97036ab72558
permissions -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
this solves Tor bug #9701, complying with the disk avoidance requirements documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
michael@0 2 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #ifndef _MOZILLA_GFX_SIMD_H_
michael@0 7 #define _MOZILLA_GFX_SIMD_H_
michael@0 8
michael@0 9 /**
michael@0 10 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
michael@0 11 * if they want access to the SSE2 functions.
michael@0 12 */
michael@0 13
michael@0 14 #ifdef SIMD_COMPILE_SSE2
michael@0 15 #include <emmintrin.h>
michael@0 16 #endif
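/* Illustrative usage sketch (editorial addition, not part of the original
 * header): a consumer that wants the SSE2 overloads defines the macro before
 * including this file and must be built with SSE2 enabled; the scalar backend
 * is always available. The consumer file name below is hypothetical.
 *
 *   // SomeFilter.cpp (hypothetical consumer, compiled with SSE2 enabled)
 *   #define SIMD_COMPILE_SSE2
 *   #include "SIMD.h"
 *
 *   using namespace mozilla::gfx::simd;
 *   __m128i a = FromI16<__m128i>(1, 2, 3, 4, 5, 6, 7, 8);
 *   __m128i b = FromI16<__m128i>(2);              // splat the same overload set
 *   __m128i sum = Add16(a, b);                    // lane-wise 16-bit addition
 *   Scalari16x8_t s = FromI16<Scalari16x8_t>(2);  // scalar backend, same API
 */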
michael@0 17
michael@0 18 namespace mozilla {
michael@0 19 namespace gfx {
michael@0 20
michael@0 21 namespace simd {
michael@0 22
michael@0 23 template<typename u8x16_t>
michael@0 24 u8x16_t Load8(const uint8_t* aSource);
michael@0 25
michael@0 26 template<typename u8x16_t>
michael@0 27 u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
michael@0 28 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);
michael@0 29
michael@0 30 template<typename u8x16_t>
michael@0 31 u8x16_t FromZero8();
michael@0 32
michael@0 33 template<typename i16x8_t>
michael@0 34 i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);
michael@0 35
michael@0 36 template<typename u16x8_t>
michael@0 37 u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);
michael@0 38
michael@0 39 template<typename i16x8_t>
michael@0 40 i16x8_t FromI16(int16_t a);
michael@0 41
michael@0 42 template<typename u16x8_t>
michael@0 43 u16x8_t FromU16(uint16_t a);
michael@0 44
michael@0 45 template<typename i32x4_t>
michael@0 46 i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);
michael@0 47
michael@0 48 template<typename i32x4_t>
michael@0 49 i32x4_t From32(int32_t a);
michael@0 50
michael@0 51 template<typename f32x4_t>
michael@0 52 f32x4_t FromF32(float a, float b, float c, float d);
michael@0 53
michael@0 54 template<typename f32x4_t>
michael@0 55 f32x4_t FromF32(float a);
michael@0 56
michael@0 57 // All SIMD backends overload these functions for their SIMD types:
michael@0 58
michael@0 59 #if 0
michael@0 60
michael@0 61 // Store 16 bytes to a 16-byte aligned address
michael@0 62 void Store8(uint8_t* aTarget, u8x16_t aM);
michael@0 63
michael@0 64 // Fixed shifts
michael@0 65 template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
michael@0 66 template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);
michael@0 67
michael@0 68 i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
michael@0 69 i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
michael@0 70 i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
michael@0 71 i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
michael@0 72 u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
michael@0 73 u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
michael@0 74 i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
michael@0 75 i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);
michael@0 76
michael@0 77 // Truncating i16 -> i16 multiplication
michael@0 78 i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);
michael@0 79
michael@0 80 // Long multiplication i16 -> i32
michael@0 81 // aFactorsA1B1 = (a1[4] b1[4])
michael@0 82 // aFactorsA2B2 = (a2[4] b2[4])
michael@0 83 // aProductA = a1 * a2, aProductB = b1 * b2
michael@0 84 void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
michael@0 85 i32x4_t& aProductA, i32x4_t& aProductB);
michael@0 86
michael@0 87 // Long multiplication + pairwise addition i16 -> i32
michael@0 88 // See the scalar implementation for specifics.
michael@0 89 i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
michael@0 90 i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);
michael@0 91
michael@0 92 // Set all four 32-bit components to the value of the component at aIndex.
michael@0 93 template<int8_t aIndex>
michael@0 94 i32x4_t Splat32(i32x4_t aM);
michael@0 95
michael@0 96 // Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
michael@0 97 // re-interpret the result as sixteen 8-bit values.
michael@0 98 template<int8_t aIndex>
michael@0 99 u8x16_t Splat32On8(u8x16_t aM);
michael@0 100
michael@0 101 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
michael@0 102 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
michael@0 103 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
michael@0 104
michael@0 105 u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
michael@0 106 u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
michael@0 107 i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
michael@0 108 i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
michael@0 109 i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);
michael@0 110
michael@0 111 i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
michael@0 112 i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
michael@0 113 u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
michael@0 114 u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);
michael@0 115
michael@0 116 i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
michael@0 117 u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
michael@0 118 u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);
michael@0 119
michael@0 120 i32x4_t FastDivideBy255(i32x4_t m);
michael@0 121 i16x8_t FastDivideBy255_16(i16x8_t m);
michael@0 122
michael@0 123 #endif
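/* Editorial sketch of how the overload set above is meant to be used: code is
 * written once as a template over the vector types and instantiated with
 * either the scalar structs below or the SSE2 types at the end of this file.
 * The helper name MultiplyAlpha is hypothetical and only illustrates the
 * pattern; it is not part of this header.
 *
 *   template<typename u8x16_t, typename u16x8_t>
 *   void MultiplyAlpha(const uint8_t* aSource, uint8_t* aTarget, uint16_t aAlpha)
 *   {
 *     u8x16_t pixels = Load8<u8x16_t>(aSource);    // 16 bytes, 16-byte aligned
 *     u16x8_t lo = UnpackLo8x8ToU16x8(pixels);     // widen to 16-bit lanes
 *     u16x8_t hi = UnpackHi8x8ToU16x8(pixels);
 *     u16x8_t alpha = FromU16<u16x8_t>(aAlpha);    // splat alpha in 0..255
 *     lo = FastDivideBy255_16(Mul16(lo, alpha));   // (v * alpha) / 255 per lane
 *     hi = FastDivideBy255_16(Mul16(hi, alpha));
 *     Store8(aTarget, PackAndSaturate16To8(lo, hi));
 *   }
 *
 *   // Instantiated as e.g. MultiplyAlpha<Scalaru8x16_t, Scalaru16x8_t>(...) or,
 *   // with SIMD_COMPILE_SSE2 defined, MultiplyAlpha<__m128i, __m128i>(...).
 */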
michael@0 124
michael@0 125 // Scalar
michael@0 126
michael@0 127 struct Scalaru8x16_t {
michael@0 128 uint8_t u8[16];
michael@0 129 };
michael@0 130
michael@0 131 union Scalari16x8_t {
michael@0 132 int16_t i16[8];
michael@0 133 uint16_t u16[8];
michael@0 134 };
michael@0 135
michael@0 136 typedef Scalari16x8_t Scalaru16x8_t;
michael@0 137
michael@0 138 struct Scalari32x4_t {
michael@0 139 int32_t i32[4];
michael@0 140 };
michael@0 141
michael@0 142 struct Scalarf32x4_t {
michael@0 143 float f32[4];
michael@0 144 };
michael@0 145
michael@0 146 template<>
michael@0 147 inline Scalaru8x16_t
michael@0 148 Load8<Scalaru8x16_t>(const uint8_t* aSource)
michael@0 149 {
michael@0 150 return *(Scalaru8x16_t*)aSource;
michael@0 151 }
michael@0 152
michael@0 153 inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
michael@0 154 {
michael@0 155 *(Scalaru8x16_t*)aTarget = aM;
michael@0 156 }
michael@0 157
michael@0 158 template<>
michael@0 159 inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
michael@0 160 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
michael@0 161 {
michael@0 162 Scalaru8x16_t _m;
michael@0 163 _m.u8[0] = a;
michael@0 164 _m.u8[1] = b;
michael@0 165 _m.u8[2] = c;
michael@0 166 _m.u8[3] = d;
michael@0 167 _m.u8[4] = e;
michael@0 168 _m.u8[5] = f;
michael@0 169 _m.u8[6] = g;
michael@0 170 _m.u8[7] = h;
michael@0 171 _m.u8[8+0] = i;
michael@0 172 _m.u8[8+1] = j;
michael@0 173 _m.u8[8+2] = k;
michael@0 174 _m.u8[8+3] = l;
michael@0 175 _m.u8[8+4] = m;
michael@0 176 _m.u8[8+5] = n;
michael@0 177 _m.u8[8+6] = o;
michael@0 178 _m.u8[8+7] = p;
michael@0 179 return _m;
michael@0 180 }
michael@0 181
michael@0 182 template<>
michael@0 183 inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
michael@0 184 {
michael@0 185 return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
michael@0 186 }
michael@0 187
michael@0 188 template<>
michael@0 189 inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
michael@0 190 {
michael@0 191 Scalari16x8_t m;
michael@0 192 m.i16[0] = a;
michael@0 193 m.i16[1] = b;
michael@0 194 m.i16[2] = c;
michael@0 195 m.i16[3] = d;
michael@0 196 m.i16[4] = e;
michael@0 197 m.i16[5] = f;
michael@0 198 m.i16[6] = g;
michael@0 199 m.i16[7] = h;
michael@0 200 return m;
michael@0 201 }
michael@0 202
michael@0 203 template<>
michael@0 204 inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
michael@0 205 {
michael@0 206 Scalaru16x8_t m;
michael@0 207 m.u16[0] = a;
michael@0 208 m.u16[1] = b;
michael@0 209 m.u16[2] = c;
michael@0 210 m.u16[3] = d;
michael@0 211 m.u16[4] = e;
michael@0 212 m.u16[5] = f;
michael@0 213 m.u16[6] = g;
michael@0 214 m.u16[7] = h;
michael@0 215 return m;
michael@0 216 }
michael@0 217
michael@0 218 template<>
michael@0 219 inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
michael@0 220 {
michael@0 221 return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
michael@0 222 }
michael@0 223
michael@0 224 template<>
michael@0 225 inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
michael@0 226 {
michael@0 227 return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
michael@0 228 }
michael@0 229
michael@0 230 template<>
michael@0 231 inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
michael@0 232 {
michael@0 233 Scalari32x4_t m;
michael@0 234 m.i32[0] = a;
michael@0 235 m.i32[1] = b;
michael@0 236 m.i32[2] = c;
michael@0 237 m.i32[3] = d;
michael@0 238 return m;
michael@0 239 }
michael@0 240
michael@0 241 template<>
michael@0 242 inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
michael@0 243 {
michael@0 244 Scalarf32x4_t m;
michael@0 245 m.f32[0] = a;
michael@0 246 m.f32[1] = b;
michael@0 247 m.f32[2] = c;
michael@0 248 m.f32[3] = d;
michael@0 249 return m;
michael@0 250 }
michael@0 251
michael@0 252 template<>
michael@0 253 inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
michael@0 254 {
michael@0 255 return FromF32<Scalarf32x4_t>(a, a, a, a);
michael@0 256 }
michael@0 257
michael@0 258 template<>
michael@0 259 inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
michael@0 260 {
michael@0 261 return From32<Scalari32x4_t>(a, a, a, a);
michael@0 262 }
michael@0 263
michael@0 264 template<int32_t aNumberOfBits>
michael@0 265 inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
michael@0 266 {
michael@0 267 return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
michael@0 268 uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
michael@0 269 uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
michael@0 270 uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
michael@0 271 }
michael@0 272
michael@0 273 template<int32_t aNumberOfBits>
michael@0 274 inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
michael@0 275 {
michael@0 276 return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
michael@0 277 aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
michael@0 278 }
michael@0 279
michael@0 280 inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
michael@0 281 {
michael@0 282 return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
michael@0 283 aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
michael@0 284 aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
michael@0 285 aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
michael@0 286 }
michael@0 287
michael@0 288 inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
michael@0 289 {
michael@0 290 return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
michael@0 291 aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
michael@0 292 }
michael@0 293
michael@0 294 inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
michael@0 295 {
michael@0 296 return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
michael@0 297 aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
michael@0 298 aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
michael@0 299 aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
michael@0 300 }
michael@0 301
michael@0 302 inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
michael@0 303 {
michael@0 304 return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
michael@0 305 aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
michael@0 306 }
michael@0 307
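// Branchless min/max helpers: the comparison yields 0 or 1, so -(a > b) is
// either all-zero or all-one bits and masks whether (a - b) is subtracted
// from a. Despite the 'u' prefix they take int32_t arguments.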
michael@0 308 inline int32_t
michael@0 309 umin(int32_t a, int32_t b)
michael@0 310 {
michael@0 311 return a - ((a - b) & -(a > b));
michael@0 312 }
michael@0 313
michael@0 314 inline int32_t
michael@0 315 umax(int32_t a, int32_t b)
michael@0 316 {
michael@0 317 return a - ((a - b) & -(a < b));
michael@0 318 }
michael@0 319
michael@0 320 inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
michael@0 321 {
michael@0 322 return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
michael@0 323 umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
michael@0 324 umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
michael@0 325 umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
michael@0 326 umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
michael@0 327 umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
michael@0 328 umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
michael@0 329 umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
michael@0 330 }
michael@0 331
michael@0 332 inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
michael@0 333 {
michael@0 334 return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
michael@0 335 umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
michael@0 336 umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
michael@0 337 umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
michael@0 338 umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
michael@0 339 umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
michael@0 340 umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
michael@0 341 umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
michael@0 342 }
michael@0 343
michael@0 344 inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
michael@0 345 {
michael@0 346 return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
michael@0 347 umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
michael@0 348 }
michael@0 349
michael@0 350 inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
michael@0 351 {
michael@0 352 return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
michael@0 353 umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
michael@0 354 }
michael@0 355
michael@0 356 inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
michael@0 357 {
michael@0 358 return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
michael@0 359 uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
michael@0 360 uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
michael@0 361 uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
michael@0 362 }
michael@0 363
michael@0 364 inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
michael@0 365 Scalari16x8_t aFactorsA2B2,
michael@0 366 Scalari32x4_t& aProductA,
michael@0 367 Scalari32x4_t& aProductB)
michael@0 368 {
michael@0 369 aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
michael@0 370 aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
michael@0 371 aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
michael@0 372 aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
michael@0 373 aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
michael@0 374 aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
michael@0 375 aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
michael@0 376 aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
michael@0 377 }
michael@0 378
michael@0 379 inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
michael@0 380 Scalari16x8_t aFactorsB)
michael@0 381 {
michael@0 382 return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
michael@0 383 aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
michael@0 384 aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
michael@0 385 aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
michael@0 386 }
michael@0 387
michael@0 388 template<int8_t aIndex>
michael@0 389 inline void AssertIndex()
michael@0 390 {
michael@0 391 static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
michael@0 392 "Invalid splat index");
michael@0 393 }
michael@0 394
michael@0 395 template<int8_t aIndex>
michael@0 396 inline Scalari32x4_t Splat32(Scalari32x4_t aM)
michael@0 397 {
michael@0 398 AssertIndex<aIndex>();
michael@0 399 return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
michael@0 400 aM.i32[aIndex], aM.i32[aIndex]);
michael@0 401 }
michael@0 402
michael@0 403 template<int8_t i>
michael@0 404 inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
michael@0 405 {
michael@0 406 AssertIndex<i>();
michael@0 407 return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
michael@0 408 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
michael@0 409 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
michael@0 410 aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
michael@0 411 }
michael@0 412
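// The shuffle index convention mirrors _MM_SHUFFLE: i0 selects the source
// lane for the highest destination lane and i3 for the lowest, which is why
// the assignments below read the indices in reverse order.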
michael@0 413 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 414 inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
michael@0 415 {
michael@0 416 AssertIndex<i0>();
michael@0 417 AssertIndex<i1>();
michael@0 418 AssertIndex<i2>();
michael@0 419 AssertIndex<i3>();
michael@0 420 Scalari32x4_t m = aM;
michael@0 421 m.i32[0] = aM.i32[i3];
michael@0 422 m.i32[1] = aM.i32[i2];
michael@0 423 m.i32[2] = aM.i32[i1];
michael@0 424 m.i32[3] = aM.i32[i0];
michael@0 425 return m;
michael@0 426 }
michael@0 427
michael@0 428 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 429 inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
michael@0 430 {
michael@0 431 AssertIndex<i0>();
michael@0 432 AssertIndex<i1>();
michael@0 433 AssertIndex<i2>();
michael@0 434 AssertIndex<i3>();
michael@0 435 Scalari16x8_t m = aM;
michael@0 436 m.i16[0] = aM.i16[i3];
michael@0 437 m.i16[1] = aM.i16[i2];
michael@0 438 m.i16[2] = aM.i16[i1];
michael@0 439 m.i16[3] = aM.i16[i0];
michael@0 440 return m;
michael@0 441 }
michael@0 442
michael@0 443 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 444 inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
michael@0 445 {
michael@0 446 AssertIndex<i0>();
michael@0 447 AssertIndex<i1>();
michael@0 448 AssertIndex<i2>();
michael@0 449 AssertIndex<i3>();
michael@0 450 Scalari16x8_t m = aM;
michael@0 451 m.i16[4 + 0] = aM.i16[4 + i3];
michael@0 452 m.i16[4 + 1] = aM.i16[4 + i2];
michael@0 453 m.i16[4 + 2] = aM.i16[4 + i1];
michael@0 454 m.i16[4 + 3] = aM.i16[4 + i0];
michael@0 455 return m;
michael@0 456 }
michael@0 457
michael@0 458 template<int8_t aIndexLo, int8_t aIndexHi>
michael@0 459 inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
michael@0 460 {
michael@0 461 AssertIndex<aIndexLo>();
michael@0 462 AssertIndex<aIndexHi>();
michael@0 463 Scalaru16x8_t m;
michael@0 464 int16_t chosenValueLo = aM.u16[aIndexLo];
michael@0 465 m.u16[0] = chosenValueLo;
michael@0 466 m.u16[1] = chosenValueLo;
michael@0 467 m.u16[2] = chosenValueLo;
michael@0 468 m.u16[3] = chosenValueLo;
michael@0 469 int16_t chosenValueHi = aM.u16[4 + aIndexHi];
michael@0 470 m.u16[4] = chosenValueHi;
michael@0 471 m.u16[5] = chosenValueHi;
michael@0 472 m.u16[6] = chosenValueHi;
michael@0 473 m.u16[7] = chosenValueHi;
michael@0 474 return m;
michael@0 475 }
michael@0 476
michael@0 477 inline Scalaru8x16_t
michael@0 478 InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
michael@0 479 {
michael@0 480 return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
michael@0 481 m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
michael@0 482 m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
michael@0 483 m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
michael@0 484 }
michael@0 485
michael@0 486 inline Scalaru8x16_t
michael@0 487 InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
michael@0 488 {
michael@0 489 return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
michael@0 490 m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
michael@0 491 m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
michael@0 492 m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
michael@0 493 }
michael@0 494
michael@0 495 inline Scalaru16x8_t
michael@0 496 InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
michael@0 497 {
michael@0 498 return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
michael@0 499 m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
michael@0 500 }
michael@0 501
michael@0 502 inline Scalaru16x8_t
michael@0 503 InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
michael@0 504 {
michael@0 505 return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
michael@0 506 m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
michael@0 507 }
michael@0 508
michael@0 509 inline Scalari32x4_t
michael@0 510 InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
michael@0 511 {
michael@0 512 return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
michael@0 513 }
michael@0 514
michael@0 515 inline Scalari16x8_t
michael@0 516 UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
michael@0 517 {
michael@0 518 Scalari16x8_t m;
michael@0 519 m.i16[0] = aM.u8[0];
michael@0 520 m.i16[1] = aM.u8[1];
michael@0 521 m.i16[2] = aM.u8[2];
michael@0 522 m.i16[3] = aM.u8[3];
michael@0 523 m.i16[4] = aM.u8[4];
michael@0 524 m.i16[5] = aM.u8[5];
michael@0 525 m.i16[6] = aM.u8[6];
michael@0 526 m.i16[7] = aM.u8[7];
michael@0 527 return m;
michael@0 528 }
michael@0 529
michael@0 530 inline Scalari16x8_t
michael@0 531 UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
michael@0 532 {
michael@0 533 Scalari16x8_t m;
michael@0 534 m.i16[0] = aM.u8[8+0];
michael@0 535 m.i16[1] = aM.u8[8+1];
michael@0 536 m.i16[2] = aM.u8[8+2];
michael@0 537 m.i16[3] = aM.u8[8+3];
michael@0 538 m.i16[4] = aM.u8[8+4];
michael@0 539 m.i16[5] = aM.u8[8+5];
michael@0 540 m.i16[6] = aM.u8[8+6];
michael@0 541 m.i16[7] = aM.u8[8+7];
michael@0 542 return m;
michael@0 543 }
michael@0 544
michael@0 545 inline Scalaru16x8_t
michael@0 546 UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
michael@0 547 {
michael@0 548 return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
michael@0 549 uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
michael@0 550 }
michael@0 551
michael@0 552 inline Scalaru16x8_t
michael@0 553 UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
michael@0 554 {
michael@0 555 return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
michael@0 556 aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
michael@0 557 }
michael@0 558
michael@0 559 template<uint8_t aNumBytes>
michael@0 560 inline Scalaru8x16_t
michael@0 561 Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
michael@0 562 {
michael@0 563 Scalaru8x16_t m;
michael@0 564 for (uint8_t i = 0; i < 16; i++) {
michael@0 565 uint8_t sourceByte = i + aNumBytes;
michael@0 566 m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
michael@0 567 }
michael@0 568 return m;
michael@0 569 }
michael@0 570
michael@0 571 template<typename T>
michael@0 572 inline int16_t
michael@0 573 SaturateTo16(T a)
michael@0 574 {
michael@0 575 return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
michael@0 576 }
michael@0 577
michael@0 578 inline Scalari16x8_t
michael@0 579 PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
michael@0 580 {
michael@0 581 Scalari16x8_t m;
michael@0 582 m.i16[0] = SaturateTo16(m1.i32[0]);
michael@0 583 m.i16[1] = SaturateTo16(m1.i32[1]);
michael@0 584 m.i16[2] = SaturateTo16(m1.i32[2]);
michael@0 585 m.i16[3] = SaturateTo16(m1.i32[3]);
michael@0 586 m.i16[4] = SaturateTo16(m2.i32[0]);
michael@0 587 m.i16[5] = SaturateTo16(m2.i32[1]);
michael@0 588 m.i16[6] = SaturateTo16(m2.i32[2]);
michael@0 589 m.i16[7] = SaturateTo16(m2.i32[3]);
michael@0 590 return m;
michael@0 591 }
michael@0 592
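// a & -(a >= 0) evaluates to a for non-negative a and to 0 for negative a, so
// the value is clamped at zero and then capped with umin. The upper bound here
// is INT16_MAX, matching the signed saturation used by the SSE2 backend's
// PackAndSaturate32ToU16; SaturateTo8 below uses the same trick with 255.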
michael@0 593 template<typename T>
michael@0 594 inline uint16_t
michael@0 595 SaturateToU16(T a)
michael@0 596 {
michael@0 597 return uint16_t(umin(a & -(a >= 0), INT16_MAX));
michael@0 598 }
michael@0 599
michael@0 600 inline Scalaru16x8_t
michael@0 601 PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
michael@0 602 {
michael@0 603 Scalaru16x8_t m;
michael@0 604 m.u16[0] = SaturateToU16(m1.i32[0]);
michael@0 605 m.u16[1] = SaturateToU16(m1.i32[1]);
michael@0 606 m.u16[2] = SaturateToU16(m1.i32[2]);
michael@0 607 m.u16[3] = SaturateToU16(m1.i32[3]);
michael@0 608 m.u16[4] = SaturateToU16(m2.i32[0]);
michael@0 609 m.u16[5] = SaturateToU16(m2.i32[1]);
michael@0 610 m.u16[6] = SaturateToU16(m2.i32[2]);
michael@0 611 m.u16[7] = SaturateToU16(m2.i32[3]);
michael@0 612 return m;
michael@0 613 }
michael@0 614
michael@0 615 template<typename T>
michael@0 616 inline uint8_t
michael@0 617 SaturateTo8(T a)
michael@0 618 {
michael@0 619 return uint8_t(umin(a & -(a >= 0), 255));
michael@0 620 }
michael@0 621
michael@0 622 inline Scalaru8x16_t
michael@0 623 PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
michael@0 624 {
michael@0 625 Scalaru8x16_t m;
michael@0 626 m.u8[0] = SaturateTo8(m1.i32[0]);
michael@0 627 m.u8[1] = SaturateTo8(m1.i32[1]);
michael@0 628 m.u8[2] = SaturateTo8(m1.i32[2]);
michael@0 629 m.u8[3] = SaturateTo8(m1.i32[3]);
michael@0 630 m.u8[4] = SaturateTo8(m2.i32[0]);
michael@0 631 m.u8[5] = SaturateTo8(m2.i32[1]);
michael@0 632 m.u8[6] = SaturateTo8(m2.i32[2]);
michael@0 633 m.u8[7] = SaturateTo8(m2.i32[3]);
michael@0 634 m.u8[8] = SaturateTo8(m3.i32[0]);
michael@0 635 m.u8[9] = SaturateTo8(m3.i32[1]);
michael@0 636 m.u8[10] = SaturateTo8(m3.i32[2]);
michael@0 637 m.u8[11] = SaturateTo8(m3.i32[3]);
michael@0 638 m.u8[12] = SaturateTo8(m4.i32[0]);
michael@0 639 m.u8[13] = SaturateTo8(m4.i32[1]);
michael@0 640 m.u8[14] = SaturateTo8(m4.i32[2]);
michael@0 641 m.u8[15] = SaturateTo8(m4.i32[3]);
michael@0 642 return m;
michael@0 643 }
michael@0 644
michael@0 645 inline Scalaru8x16_t
michael@0 646 PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
michael@0 647 {
michael@0 648 Scalaru8x16_t m;
michael@0 649 m.u8[0] = SaturateTo8(m1.i16[0]);
michael@0 650 m.u8[1] = SaturateTo8(m1.i16[1]);
michael@0 651 m.u8[2] = SaturateTo8(m1.i16[2]);
michael@0 652 m.u8[3] = SaturateTo8(m1.i16[3]);
michael@0 653 m.u8[4] = SaturateTo8(m1.i16[4]);
michael@0 654 m.u8[5] = SaturateTo8(m1.i16[5]);
michael@0 655 m.u8[6] = SaturateTo8(m1.i16[6]);
michael@0 656 m.u8[7] = SaturateTo8(m1.i16[7]);
michael@0 657 m.u8[8] = SaturateTo8(m2.i16[0]);
michael@0 658 m.u8[9] = SaturateTo8(m2.i16[1]);
michael@0 659 m.u8[10] = SaturateTo8(m2.i16[2]);
michael@0 660 m.u8[11] = SaturateTo8(m2.i16[3]);
michael@0 661 m.u8[12] = SaturateTo8(m2.i16[4]);
michael@0 662 m.u8[13] = SaturateTo8(m2.i16[5]);
michael@0 663 m.u8[14] = SaturateTo8(m2.i16[6]);
michael@0 664 m.u8[15] = SaturateTo8(m2.i16[7]);
michael@0 665 return m;
michael@0 666 }
michael@0 667
michael@0 668 // Fast approximate division by 255. It has the property that
michael@0 669 // for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255,
michael@0 670 // but it uses only two adds and two shifts instead of an
michael@0 671 // integer division (which is expensive on many processors).
michael@0 672 //
michael@0 673 // equivalent to v/255
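// Worked example: for v = 255*255 = 65025, (65025 << 8) + 65025 + 255
// = 16646400 + 65280 = 16711680, and 16711680 >> 16 = 255 = 65025/255.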
michael@0 674 template<class B, class A>
michael@0 675 inline B FastDivideBy255(A v)
michael@0 676 {
michael@0 677 return ((v << 8) + v + 255) >> 16;
michael@0 678 }
michael@0 679
michael@0 680 inline Scalaru16x8_t
michael@0 681 FastDivideBy255_16(Scalaru16x8_t m)
michael@0 682 {
michael@0 683 return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
michael@0 684 FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
michael@0 685 FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
michael@0 686 FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
michael@0 687 FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
michael@0 688 FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
michael@0 689 FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
michael@0 690 FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
michael@0 691 }
michael@0 692
michael@0 693 inline Scalari32x4_t
michael@0 694 FastDivideBy255(Scalari32x4_t m)
michael@0 695 {
michael@0 696 return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
michael@0 697 FastDivideBy255<int32_t>(m.i32[1]),
michael@0 698 FastDivideBy255<int32_t>(m.i32[2]),
michael@0 699 FastDivideBy255<int32_t>(m.i32[3]));
michael@0 700 }
michael@0 701
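// Pick implements a bitwise select: for every bit set in the mask the result
// takes the corresponding bit from b, otherwise from a. Callers are expected
// to pass masks whose lanes are either all-zero or all-one bits.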
michael@0 702 inline Scalaru8x16_t
michael@0 703 Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
michael@0 704 {
michael@0 705 return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
michael@0 706 (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
michael@0 707 (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
michael@0 708 (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
michael@0 709 (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
michael@0 710 (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
michael@0 711 (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
michael@0 712 (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
michael@0 713 (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
michael@0 714 (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
michael@0 715 (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
michael@0 716 (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
michael@0 717 (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
michael@0 718 (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
michael@0 719 (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
michael@0 720 (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
michael@0 721 }
michael@0 722
michael@0 723 inline Scalari32x4_t
michael@0 724 Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
michael@0 725 {
michael@0 726 return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
michael@0 727 (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
michael@0 728 (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
michael@0 729 (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
michael@0 730 }
michael@0 731
michael@0 732 inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
michael@0 733 {
michael@0 734 return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
michael@0 735 a.f32[1] + (b.f32[1] - a.f32[1]) * t,
michael@0 736 a.f32[2] + (b.f32[2] - a.f32[2]) * t,
michael@0 737 a.f32[3] + (b.f32[3] - a.f32[3]) * t);
michael@0 738 }
michael@0 739
michael@0 740 inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
michael@0 741 {
michael@0 742 return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
michael@0 743 a.f32[1] * wa + b.f32[1] * wb,
michael@0 744 a.f32[2] * wa + b.f32[2] * wb,
michael@0 745 a.f32[3] * wa + b.f32[3] * wb);
michael@0 746 }
michael@0 747
michael@0 748 inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
michael@0 749 {
michael@0 750 return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
michael@0 751 fabs(a.f32[1]),
michael@0 752 fabs(a.f32[2]),
michael@0 753 fabs(a.f32[3]));
michael@0 754 }
michael@0 755
michael@0 756 inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
michael@0 757 {
michael@0 758 return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
michael@0 759 a.f32[1] + b.f32[1],
michael@0 760 a.f32[2] + b.f32[2],
michael@0 761 a.f32[3] + b.f32[3]);
michael@0 762 }
michael@0 763
michael@0 764 inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
michael@0 765 {
michael@0 766 return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
michael@0 767 a.f32[1] * b.f32[1],
michael@0 768 a.f32[2] * b.f32[2],
michael@0 769 a.f32[3] * b.f32[3]);
michael@0 770 }
michael@0 771
michael@0 772 inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
michael@0 773 {
michael@0 774 return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
michael@0 775 a.f32[1] / b.f32[1],
michael@0 776 a.f32[2] / b.f32[2],
michael@0 777 a.f32[3] / b.f32[3]);
michael@0 778 }
michael@0 779
michael@0 780 template<uint8_t aIndex>
michael@0 781 inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
michael@0 782 {
michael@0 783 AssertIndex<aIndex>();
michael@0 784 return FromF32<Scalarf32x4_t>(m.f32[aIndex],
michael@0 785 m.f32[aIndex],
michael@0 786 m.f32[aIndex],
michael@0 787 m.f32[aIndex]);
michael@0 788 }
michael@0 789
michael@0 790 inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
michael@0 791 {
michael@0 792 return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)),
michael@0 793 int32_t(floor(m.f32[1] + 0.5f)),
michael@0 794 int32_t(floor(m.f32[2] + 0.5f)),
michael@0 795 int32_t(floor(m.f32[3] + 0.5f)));
michael@0 796 }
michael@0 797
michael@0 798 #ifdef SIMD_COMPILE_SSE2
michael@0 799
michael@0 800 // SSE2
michael@0 801
michael@0 802 template<>
michael@0 803 inline __m128i
michael@0 804 Load8<__m128i>(const uint8_t* aSource)
michael@0 805 {
michael@0 806 return _mm_load_si128((const __m128i*)aSource);
michael@0 807 }
michael@0 808
michael@0 809 inline void Store8(uint8_t* aTarget, __m128i aM)
michael@0 810 {
michael@0 811 _mm_store_si128((__m128i*)aTarget, aM);
michael@0 812 }
michael@0 813
michael@0 814 template<>
michael@0 815 inline __m128i FromZero8<__m128i>()
michael@0 816 {
michael@0 817 return _mm_setzero_si128();
michael@0 818 }
michael@0 819
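// The sixteen bytes are packed little-endian into eight 16-bit lanes, so the
// first byte of each pair occupies the low half of its lane.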
michael@0 820 template<>
michael@0 821 inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
michael@0 822 uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
michael@0 823 {
michael@0 824 return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
michael@0 825 (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
michael@0 826 }
michael@0 827
michael@0 828 template<>
michael@0 829 inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
michael@0 830 {
michael@0 831 return _mm_setr_epi16(a, b, c, d, e, f, g, h);
michael@0 832 }
michael@0 833
michael@0 834 template<>
michael@0 835 inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
michael@0 836 {
michael@0 837 return _mm_setr_epi16(a, b, c, d, e, f, g, h);
michael@0 838 }
michael@0 839
michael@0 840 template<>
michael@0 841 inline __m128i FromI16<__m128i>(int16_t a)
michael@0 842 {
michael@0 843 return _mm_set1_epi16(a);
michael@0 844 }
michael@0 845
michael@0 846 template<>
michael@0 847 inline __m128i FromU16<__m128i>(uint16_t a)
michael@0 848 {
michael@0 849 return _mm_set1_epi16((int16_t)a);
michael@0 850 }
michael@0 851
michael@0 852 template<>
michael@0 853 inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
michael@0 854 {
michael@0 855 return _mm_setr_epi32(a, b, c, d);
michael@0 856 }
michael@0 857
michael@0 858 template<>
michael@0 859 inline __m128i From32<__m128i>(int32_t a)
michael@0 860 {
michael@0 861 return _mm_set1_epi32(a);
michael@0 862 }
michael@0 863
michael@0 864 template<>
michael@0 865 inline __m128 FromF32<__m128>(float a, float b, float c, float d)
michael@0 866 {
michael@0 867 return _mm_setr_ps(a, b, c, d);
michael@0 868 }
michael@0 869
michael@0 870 template<>
michael@0 871 inline __m128 FromF32<__m128>(float a)
michael@0 872 {
michael@0 873 return _mm_set1_ps(a);
michael@0 874 }
michael@0 875
michael@0 876 template<int32_t aNumberOfBits>
michael@0 877 inline __m128i ShiftRight16(__m128i aM)
michael@0 878 {
michael@0 879 return _mm_srli_epi16(aM, aNumberOfBits);
michael@0 880 }
michael@0 881
michael@0 882 template<int32_t aNumberOfBits>
michael@0 883 inline __m128i ShiftRight32(__m128i aM)
michael@0 884 {
michael@0 885 return _mm_srai_epi32(aM, aNumberOfBits);
michael@0 886 }
michael@0 887
michael@0 888 inline __m128i Add16(__m128i aM1, __m128i aM2)
michael@0 889 {
michael@0 890 return _mm_add_epi16(aM1, aM2);
michael@0 891 }
michael@0 892
michael@0 893 inline __m128i Add32(__m128i aM1, __m128i aM2)
michael@0 894 {
michael@0 895 return _mm_add_epi32(aM1, aM2);
michael@0 896 }
michael@0 897
michael@0 898 inline __m128i Sub16(__m128i aM1, __m128i aM2)
michael@0 899 {
michael@0 900 return _mm_sub_epi16(aM1, aM2);
michael@0 901 }
michael@0 902
michael@0 903 inline __m128i Sub32(__m128i aM1, __m128i aM2)
michael@0 904 {
michael@0 905 return _mm_sub_epi32(aM1, aM2);
michael@0 906 }
michael@0 907
michael@0 908 inline __m128i Min8(__m128i aM1, __m128i aM2)
michael@0 909 {
michael@0 910 return _mm_min_epu8(aM1, aM2);
michael@0 911 }
michael@0 912
michael@0 913 inline __m128i Max8(__m128i aM1, __m128i aM2)
michael@0 914 {
michael@0 915 return _mm_max_epu8(aM1, aM2);
michael@0 916 }
michael@0 917
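// SSE2 has no packed 32-bit min/max instructions (those arrive with SSE4.1),
// so Min32/Max32 are emulated with a signed compare and a masked subtract,
// mirroring the branchless umin/umax used by the scalar backend.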
michael@0 918 inline __m128i Min32(__m128i aM1, __m128i aM2)
michael@0 919 {
michael@0 920 __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
michael@0 921 __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
michael@0 922 return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
michael@0 923 }
michael@0 924
michael@0 925 inline __m128i Max32(__m128i aM1, __m128i aM2)
michael@0 926 {
michael@0 927 __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
michael@0 928 __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
michael@0 929 return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
michael@0 930 }
michael@0 931
michael@0 932 inline __m128i Mul16(__m128i aM1, __m128i aM2)
michael@0 933 {
michael@0 934 return _mm_mullo_epi16(aM1, aM2);
michael@0 935 }
michael@0 936
michael@0 937 inline __m128i MulU16(__m128i aM1, __m128i aM2)
michael@0 938 {
michael@0 939 return _mm_mullo_epi16(aM1, aM2);
michael@0 940 }
michael@0 941
michael@0 942 inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
michael@0 943 __m128i aFactorsA2B2,
michael@0 944 __m128i& aProductA,
michael@0 945 __m128i& aProductB)
michael@0 946 {
michael@0 947 __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
michael@0 948 __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
michael@0 949 aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
michael@0 950 aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
michael@0 951 }
michael@0 952
michael@0 953 inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
michael@0 954 __m128i aFactorsB)
michael@0 955 {
michael@0 956 return _mm_madd_epi16(aFactorsA, aFactorsB);
michael@0 957 }
michael@0 958
michael@0 959 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 960 inline __m128i Shuffle32(__m128i aM)
michael@0 961 {
michael@0 962 AssertIndex<i0>();
michael@0 963 AssertIndex<i1>();
michael@0 964 AssertIndex<i2>();
michael@0 965 AssertIndex<i3>();
michael@0 966 return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
michael@0 967 }
michael@0 968
michael@0 969 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 970 inline __m128i ShuffleLo16(__m128i aM)
michael@0 971 {
michael@0 972 AssertIndex<i0>();
michael@0 973 AssertIndex<i1>();
michael@0 974 AssertIndex<i2>();
michael@0 975 AssertIndex<i3>();
michael@0 976 return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
michael@0 977 }
michael@0 978
michael@0 979 template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
michael@0 980 inline __m128i ShuffleHi16(__m128i aM)
michael@0 981 {
michael@0 982 AssertIndex<i0>();
michael@0 983 AssertIndex<i1>();
michael@0 984 AssertIndex<i2>();
michael@0 985 AssertIndex<i3>();
michael@0 986 return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
michael@0 987 }
michael@0 988
michael@0 989 template<int8_t aIndex>
michael@0 990 inline __m128i Splat32(__m128i aM)
michael@0 991 {
michael@0 992 return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
michael@0 993 }
michael@0 994
michael@0 995 template<int8_t aIndex>
michael@0 996 inline __m128i Splat32On8(__m128i aM)
michael@0 997 {
michael@0 998 return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
michael@0 999 }
michael@0 1000
michael@0 1001 template<int8_t aIndexLo, int8_t aIndexHi>
michael@0 1002 inline __m128i Splat16(__m128i aM)
michael@0 1003 {
michael@0 1004 AssertIndex<aIndexLo>();
michael@0 1005 AssertIndex<aIndexHi>();
michael@0 1006 return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
michael@0 1007 ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
michael@0 1008 }
michael@0 1009
michael@0 1010 inline __m128i
michael@0 1011 UnpackLo8x8ToI16x8(__m128i m)
michael@0 1012 {
michael@0 1013 __m128i zero = _mm_set1_epi8(0);
michael@0 1014 return _mm_unpacklo_epi8(m, zero);
michael@0 1015 }
michael@0 1016
michael@0 1017 inline __m128i
michael@0 1018 UnpackHi8x8ToI16x8(__m128i m)
michael@0 1019 {
michael@0 1020 __m128i zero = _mm_set1_epi8(0);
michael@0 1021 return _mm_unpackhi_epi8(m, zero);
michael@0 1022 }
michael@0 1023
michael@0 1024 inline __m128i
michael@0 1025 UnpackLo8x8ToU16x8(__m128i m)
michael@0 1026 {
michael@0 1027 __m128i zero = _mm_set1_epi8(0);
michael@0 1028 return _mm_unpacklo_epi8(m, zero);
michael@0 1029 }
michael@0 1030
michael@0 1031 inline __m128i
michael@0 1032 UnpackHi8x8ToU16x8(__m128i m)
michael@0 1033 {
michael@0 1034 __m128i zero = _mm_set1_epi8(0);
michael@0 1035 return _mm_unpackhi_epi8(m, zero);
michael@0 1036 }
michael@0 1037
michael@0 1038 inline __m128i
michael@0 1039 InterleaveLo8(__m128i m1, __m128i m2)
michael@0 1040 {
michael@0 1041 return _mm_unpacklo_epi8(m1, m2);
michael@0 1042 }
michael@0 1043
michael@0 1044 inline __m128i
michael@0 1045 InterleaveHi8(__m128i m1, __m128i m2)
michael@0 1046 {
michael@0 1047 return _mm_unpackhi_epi8(m1, m2);
michael@0 1048 }
michael@0 1049
michael@0 1050 inline __m128i
michael@0 1051 InterleaveLo16(__m128i m1, __m128i m2)
michael@0 1052 {
michael@0 1053 return _mm_unpacklo_epi16(m1, m2);
michael@0 1054 }
michael@0 1055
michael@0 1056 inline __m128i
michael@0 1057 InterleaveHi16(__m128i m1, __m128i m2)
michael@0 1058 {
michael@0 1059 return _mm_unpackhi_epi16(m1, m2);
michael@0 1060 }
michael@0 1061
michael@0 1062 inline __m128i
michael@0 1063 InterleaveLo32(__m128i m1, __m128i m2)
michael@0 1064 {
michael@0 1065 return _mm_unpacklo_epi32(m1, m2);
michael@0 1066 }
michael@0 1067
michael@0 1068 template<uint8_t aNumBytes>
michael@0 1069 inline __m128i
michael@0 1070 Rotate8(__m128i a1234, __m128i a5678)
michael@0 1071 {
michael@0 1072 return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes));
michael@0 1073 }
michael@0 1074
michael@0 1075 inline __m128i
michael@0 1076 PackAndSaturate32To16(__m128i m1, __m128i m2)
michael@0 1077 {
michael@0 1078 return _mm_packs_epi32(m1, m2);
michael@0 1079 }
michael@0 1080
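// _mm_packs_epi32 saturates to the signed 16-bit range, so this matches the
// scalar PackAndSaturate32ToU16 (which also caps at INT16_MAX) for
// non-negative inputs; negative inputs saturate to 0x8000 here rather than 0.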
michael@0 1081 inline __m128i
michael@0 1082 PackAndSaturate32ToU16(__m128i m1, __m128i m2)
michael@0 1083 {
michael@0 1084 return _mm_packs_epi32(m1, m2);
michael@0 1085 }
michael@0 1086
michael@0 1087 inline __m128i
michael@0 1088 PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
michael@0 1089 {
michael@0 1090 // Pack into 8 16bit signed integers (saturating).
michael@0 1091 __m128i m12 = _mm_packs_epi32(m1, m2);
michael@0 1092 __m128i m34 = _mm_packs_epi32(m3, m4);
michael@0 1093
michael@0 1094 // Pack into 16 8bit unsigned integers (saturating).
michael@0 1095 return _mm_packus_epi16(m12, m34);
michael@0 1096 }
michael@0 1097
michael@0 1098 inline __m128i
michael@0 1099 PackAndSaturate16To8(__m128i m1, __m128i m2)
michael@0 1100 {
michael@0 1101 // Pack into 16 8bit unsigned integers (saturating).
michael@0 1102 return _mm_packus_epi16(m1, m2);
michael@0 1103 }
michael@0 1104
michael@0 1105 inline __m128i
michael@0 1106 FastDivideBy255(__m128i m)
michael@0 1107 {
michael@0 1108 // v = m << 8
michael@0 1109 __m128i v = _mm_slli_epi32(m, 8);
michael@0 1110 // v = v + (m + (255,255,255,255))
michael@0 1111 v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
michael@0 1112 // v = v >> 16
michael@0 1113 return _mm_srai_epi32(v, 16);
michael@0 1114 }
michael@0 1115
michael@0 1116 inline __m128i
michael@0 1117 FastDivideBy255_16(__m128i m)
michael@0 1118 {
michael@0 1119 __m128i zero = _mm_set1_epi16(0);
michael@0 1120 __m128i lo = _mm_unpacklo_epi16(m, zero);
michael@0 1121 __m128i hi = _mm_unpackhi_epi16(m, zero);
michael@0 1122 return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
michael@0 1123 }
michael@0 1124
michael@0 1125 inline __m128i
michael@0 1126 Pick(__m128i mask, __m128i a, __m128i b)
michael@0 1127 {
michael@0 1128 return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
michael@0 1129 }
michael@0 1130
michael@0 1131 inline __m128 MixF32(__m128 a, __m128 b, float t)
michael@0 1132 {
michael@0 1133 return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
michael@0 1134 }
michael@0 1135
michael@0 1136 inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
michael@0 1137 {
michael@0 1138 return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb)));
michael@0 1139 }
michael@0 1140
michael@0 1141 inline __m128 AbsF32(__m128 a)
michael@0 1142 {
michael@0 1143 return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
michael@0 1144 }
michael@0 1145
michael@0 1146 inline __m128 AddF32(__m128 a, __m128 b)
michael@0 1147 {
michael@0 1148 return _mm_add_ps(a, b);
michael@0 1149 }
michael@0 1150
michael@0 1151 inline __m128 MulF32(__m128 a, __m128 b)
michael@0 1152 {
michael@0 1153 return _mm_mul_ps(a, b);
michael@0 1154 }
michael@0 1155
michael@0 1156 inline __m128 DivF32(__m128 a, __m128 b)
michael@0 1157 {
michael@0 1158 return _mm_div_ps(a, b);
michael@0 1159 }
michael@0 1160
michael@0 1161 template<uint8_t aIndex>
michael@0 1162 inline __m128 SplatF32(__m128 m)
michael@0 1163 {
michael@0 1164 AssertIndex<aIndex>();
michael@0 1165 return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
michael@0 1166 }
michael@0 1167
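// _mm_cvtps_epi32 rounds according to the current MXCSR rounding mode
// (round-to-nearest-even by default), whereas the scalar F32ToI32 uses
// floor(x + 0.5); the two can differ for values exactly halfway between
// integers.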
michael@0 1168 inline __m128i F32ToI32(__m128 m)
michael@0 1169 {
michael@0 1170 return _mm_cvtps_epi32(m);
michael@0 1171 }
michael@0 1172
michael@0 1173 #endif // SIMD_COMPILE_SSE2
michael@0 1174
michael@0 1175 } // namespace simd
michael@0 1176
michael@0 1177 } // namespace gfx
michael@0 1178 } // namespace mozilla
michael@0 1179
michael@0 1180 #endif // _MOZILLA_GFX_SIMD_H_
