gfx/2d/SIMD.h

changeset 6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/2d/SIMD.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1180 @@
     1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     1.5 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#ifndef _MOZILLA_GFX_SIMD_H_
    1.10 +#define _MOZILLA_GFX_SIMD_H_
    1.11 +
    1.12 +/**
    1.13 + * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
    1.14 + * if they want access to the SSE2 functions.
    1.15 + */
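          +//
          +// For example (illustrative only), a consumer that wants the SSE2 backend would write:
          +//
          +//   #define SIMD_COMPILE_SSE2
          +//   #include "SIMD.h"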
    1.16 +
    1.17 +#ifdef SIMD_COMPILE_SSE2
     1.18 +#include <emmintrin.h> // SSE2 intrinsics (__m128i, _mm_*_epi*)
    1.19 +#endif
    1.20 +
    1.21 +namespace mozilla {
    1.22 +namespace gfx {
    1.23 +
    1.24 +namespace simd {
    1.25 +
    1.26 +template<typename u8x16_t>
    1.27 +u8x16_t Load8(const uint8_t* aSource);
    1.28 +
    1.29 +template<typename u8x16_t>
    1.30 +u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
    1.31 +              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);
    1.32 +
    1.33 +template<typename u8x16_t>
    1.34 +u8x16_t FromZero8();
    1.35 +
    1.36 +template<typename i16x8_t>
    1.37 +i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);
    1.38 +
    1.39 +template<typename u16x8_t>
    1.40 +u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);
    1.41 +
    1.42 +template<typename i16x8_t>
    1.43 +i16x8_t FromI16(int16_t a);
    1.44 +
    1.45 +template<typename u16x8_t>
    1.46 +u16x8_t FromU16(uint16_t a);
    1.47 +
    1.48 +template<typename i32x4_t>
    1.49 +i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);
    1.50 +
    1.51 +template<typename i32x4_t>
    1.52 +i32x4_t From32(int32_t a);
    1.53 +
    1.54 +template<typename f32x4_t>
    1.55 +f32x4_t FromF32(float a, float b, float c, float d);
    1.56 +
    1.57 +template<typename f32x4_t>
    1.58 +f32x4_t FromF32(float a);
    1.59 +
    1.60 +// All SIMD backends overload these functions for their SIMD types:
    1.61 +
    1.62 +#if 0
    1.63 +
    1.64 +// Store 16 bytes to a 16-byte aligned address
    1.65 +void Store8(uint8_t* aTarget, u8x16_t aM);
    1.66 +
    1.67 +// Fixed shifts
    1.68 +template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
    1.69 +template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);
    1.70 +
    1.71 +i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
    1.72 +i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
    1.73 +i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
    1.74 +i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
     1.75 +u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
     1.76 +u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
    1.77 +i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
    1.78 +i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);
    1.79 +
    1.80 +// Truncating i16 -> i16 multiplication
    1.81 +i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);
    1.82 +
    1.83 +// Long multiplication i16 -> i32
    1.84 +// aFactorsA1B1 = (a1[4] b1[4])
    1.85 +// aFactorsA2B2 = (a2[4] b2[4])
    1.86 +// aProductA = a1 * a2, aProductB = b1 * b2
    1.87 +void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
    1.88 +                         i32x4_t& aProductA, i32x4_t& aProductB);
    1.89 +
    1.90 +// Long multiplication + pairwise addition i16 -> i32
    1.91 +// See the scalar implementation for specifics.
    1.92 +i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
    1.93 +i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);
    1.94 +
    1.95 +// Set all four 32-bit components to the value of the component at aIndex.
    1.96 +template<int8_t aIndex>
    1.97 +i32x4_t Splat32(i32x4_t aM);
    1.98 +
    1.99 +// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
   1.100 +// re-interpret the result as sixteen 8-bit values.
   1.101 +template<int8_t aIndex>
   1.102 +u8x16_t Splat32On8(u8x16_t aM);
   1.103 +
    1.104 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
    1.105 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
    1.106 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
   1.107 +
   1.108 +u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
   1.109 +u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
   1.110 +i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
   1.111 +i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
   1.112 +i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);
   1.113 +
   1.114 +i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
   1.115 +i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
   1.116 +u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
   1.117 +u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);
   1.118 +
   1.119 +i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
   1.120 +u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
   1.121 +u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);
   1.122 +
    1.123 +i32x4_t FastDivideBy255(i32x4_t m);
    1.124 +u16x8_t FastDivideBy255_16(u16x8_t m);
   1.125 +
   1.126 +#endif
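          +
          +// Illustrative sketch (not part of this header): code that wants to stay generic
          +// over the backend can take the vector types as template parameters and use the
          +// overloads documented above. A hypothetical helper that averages two rows of
          +// sixteen 8-bit pixels could look like this:
          +//
          +//   template<typename u8x16_t, typename u16x8_t>
          +//   void AverageRow16(const uint8_t* aRow1, const uint8_t* aRow2, uint8_t* aOut)
          +//   {
          +//     u8x16_t a = Load8<u8x16_t>(aRow1);
          +//     u8x16_t b = Load8<u8x16_t>(aRow2);
          +//     u16x8_t lo = ShiftRight16<1>(Add16(UnpackLo8x8ToU16x8(a), UnpackLo8x8ToU16x8(b)));
          +//     u16x8_t hi = ShiftRight16<1>(Add16(UnpackHi8x8ToU16x8(a), UnpackHi8x8ToU16x8(b)));
          +//     Store8(aOut, PackAndSaturate16To8(lo, hi));
          +//   }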
   1.127 +
   1.128 +// Scalar
   1.129 +
   1.130 +struct Scalaru8x16_t {
   1.131 +  uint8_t u8[16];
   1.132 +};
   1.133 +
   1.134 +union Scalari16x8_t {
   1.135 +  int16_t i16[8];
   1.136 +  uint16_t u16[8];
   1.137 +};
   1.138 +
   1.139 +typedef Scalari16x8_t Scalaru16x8_t;
   1.140 +
   1.141 +struct Scalari32x4_t {
   1.142 +  int32_t i32[4];
   1.143 +};
   1.144 +
   1.145 +struct Scalarf32x4_t {
   1.146 +  float f32[4];
   1.147 +};
   1.148 +
   1.149 +template<>
   1.150 +inline Scalaru8x16_t
   1.151 +Load8<Scalaru8x16_t>(const uint8_t* aSource)
   1.152 +{
   1.153 +  return *(Scalaru8x16_t*)aSource;
   1.154 +}
   1.155 +
   1.156 +inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
   1.157 +{
   1.158 +  *(Scalaru8x16_t*)aTarget = aM;
   1.159 +}
   1.160 +
   1.161 +template<>
   1.162 +inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
   1.163 +                                          uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
   1.164 +{
   1.165 +  Scalaru8x16_t _m;
   1.166 +  _m.u8[0] = a;
   1.167 +  _m.u8[1] = b;
   1.168 +  _m.u8[2] = c;
   1.169 +  _m.u8[3] = d;
   1.170 +  _m.u8[4] = e;
   1.171 +  _m.u8[5] = f;
   1.172 +  _m.u8[6] = g;
   1.173 +  _m.u8[7] = h;
   1.174 +  _m.u8[8+0] = i;
   1.175 +  _m.u8[8+1] = j;
   1.176 +  _m.u8[8+2] = k;
   1.177 +  _m.u8[8+3] = l;
   1.178 +  _m.u8[8+4] = m;
   1.179 +  _m.u8[8+5] = n;
   1.180 +  _m.u8[8+6] = o;
   1.181 +  _m.u8[8+7] = p;
   1.182 +  return _m;
   1.183 +}
   1.184 +
   1.185 +template<>
   1.186 +inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
   1.187 +{
   1.188 +  return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
   1.189 +}
   1.190 +
   1.191 +template<>
   1.192 +inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
   1.193 +{
   1.194 +  Scalari16x8_t m;
   1.195 +  m.i16[0] = a;
   1.196 +  m.i16[1] = b;
   1.197 +  m.i16[2] = c;
   1.198 +  m.i16[3] = d;
   1.199 +  m.i16[4] = e;
   1.200 +  m.i16[5] = f;
   1.201 +  m.i16[6] = g;
   1.202 +  m.i16[7] = h;
   1.203 +  return m;
   1.204 +}
   1.205 +
   1.206 +template<>
   1.207 +inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
   1.208 +{
   1.209 +  Scalaru16x8_t m;
   1.210 +  m.u16[0] = a;
   1.211 +  m.u16[1] = b;
   1.212 +  m.u16[2] = c;
   1.213 +  m.u16[3] = d;
   1.214 +  m.u16[4] = e;
   1.215 +  m.u16[5] = f;
   1.216 +  m.u16[6] = g;
   1.217 +  m.u16[7] = h;
   1.218 +  return m;
   1.219 +}
   1.220 +
   1.221 +template<>
   1.222 +inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
   1.223 +{
   1.224 +  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
   1.225 +}
   1.226 +
   1.227 +template<>
   1.228 +inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
   1.229 +{
   1.230 +  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
   1.231 +}
   1.232 +
   1.233 +template<>
   1.234 +inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
   1.235 +{
   1.236 +  Scalari32x4_t m;
   1.237 +  m.i32[0] = a;
   1.238 +  m.i32[1] = b;
   1.239 +  m.i32[2] = c;
   1.240 +  m.i32[3] = d;
   1.241 +  return m;
   1.242 +}
   1.243 +
   1.244 +template<>
   1.245 +inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
   1.246 +{
   1.247 +  Scalarf32x4_t m;
   1.248 +  m.f32[0] = a;
   1.249 +  m.f32[1] = b;
   1.250 +  m.f32[2] = c;
   1.251 +  m.f32[3] = d;
   1.252 +  return m;
   1.253 +}
   1.254 +
   1.255 +template<>
   1.256 +inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
   1.257 +{
   1.258 +  return FromF32<Scalarf32x4_t>(a, a, a, a);
   1.259 +}
   1.260 +
   1.261 +template<>
   1.262 +inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
   1.263 +{
   1.264 +  return From32<Scalari32x4_t>(a, a, a, a);
   1.265 +}
   1.266 +
   1.267 +template<int32_t aNumberOfBits>
   1.268 +inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
   1.269 +{
   1.270 +  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
   1.271 +                               uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
   1.272 +                               uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
   1.273 +                               uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
   1.274 +}
   1.275 +
   1.276 +template<int32_t aNumberOfBits>
   1.277 +inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
   1.278 +{
   1.279 +  return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
   1.280 +                               aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
   1.281 +}
   1.282 +
   1.283 +inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
   1.284 +{
   1.285 +  return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
   1.286 +                               aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
   1.287 +                               aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
   1.288 +                               aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
   1.289 +}
   1.290 +
   1.291 +inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
   1.292 +{
   1.293 +  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
   1.294 +                               aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
   1.295 +}
   1.296 +
   1.297 +inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
   1.298 +{
   1.299 +  return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
   1.300 +                               aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
   1.301 +                               aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
   1.302 +                               aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
   1.303 +}
   1.304 +
   1.305 +inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
   1.306 +{
   1.307 +  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
   1.308 +                               aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
   1.309 +}
   1.310 +
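          +// Branchless min/max: -(a > b) is all ones when a > b and zero otherwise, so
          +// a - ((a - b) & -(a > b)) evaluates to b in the first case and to a in the second.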
   1.311 +inline int32_t
   1.312 +umin(int32_t a, int32_t b)
   1.313 +{
   1.314 +  return a - ((a - b) & -(a > b));
   1.315 +}
   1.316 +
   1.317 +inline int32_t
   1.318 +umax(int32_t a, int32_t b)
   1.319 +{
   1.320 +  return a - ((a - b) & -(a < b));
   1.321 +}
   1.322 +
   1.323 +inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
   1.324 +{
   1.325 +  return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
   1.326 +                              umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
   1.327 +                              umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
   1.328 +                              umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
   1.329 +                              umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
   1.330 +                              umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
   1.331 +                              umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
   1.332 +                              umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
   1.333 +}
   1.334 +
   1.335 +inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
   1.336 +{
   1.337 +  return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
   1.338 +                              umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
   1.339 +                              umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
   1.340 +                              umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
   1.341 +                              umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
   1.342 +                              umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
   1.343 +                              umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
   1.344 +                              umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
   1.345 +}
   1.346 +
   1.347 +inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
   1.348 +{
   1.349 +  return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
   1.350 +                               umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
   1.351 +}
   1.352 +
   1.353 +inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
   1.354 +{
   1.355 +  return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
   1.356 +                               umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
   1.357 +}
   1.358 +
   1.359 +inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
   1.360 +{
   1.361 +  return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
   1.362 +                                uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
   1.363 +                                uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
   1.364 +                                uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
   1.365 +}
   1.366 +
   1.367 +inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
   1.368 +                                Scalari16x8_t aFactorsA2B2,
   1.369 +                                Scalari32x4_t& aProductA,
   1.370 +                                Scalari32x4_t& aProductB)
   1.371 +{
   1.372 +  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
   1.373 +                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
   1.374 +                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
   1.375 +                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
   1.376 +  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
   1.377 +                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
   1.378 +                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
   1.379 +                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
   1.380 +}
   1.381 +
   1.382 +inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
   1.383 +                                        Scalari16x8_t aFactorsB)
   1.384 +{
   1.385 +  return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
   1.386 +                               aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
   1.387 +                               aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
   1.388 +                               aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
   1.389 +}
   1.390 +
   1.391 +template<int8_t aIndex>
   1.392 +inline void AssertIndex()
   1.393 +{
   1.394 +  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
   1.395 +                "Invalid splat index");
   1.396 +}
   1.397 +
   1.398 +template<int8_t aIndex>
   1.399 +inline Scalari32x4_t Splat32(Scalari32x4_t aM)
   1.400 +{
   1.401 +  AssertIndex<aIndex>();
   1.402 +  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
   1.403 +                               aM.i32[aIndex], aM.i32[aIndex]);
   1.404 +}
   1.405 +
   1.406 +template<int8_t i>
   1.407 +inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
   1.408 +{
   1.409 +  AssertIndex<i>();
   1.410 +  return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
   1.411 +                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
   1.412 +                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
   1.413 +                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
   1.414 +}
   1.415 +
   1.416 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.417 +inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
   1.418 +{
   1.419 +  AssertIndex<i0>();
   1.420 +  AssertIndex<i1>();
   1.421 +  AssertIndex<i2>();
   1.422 +  AssertIndex<i3>();
   1.423 +  Scalari32x4_t m = aM;
   1.424 +  m.i32[0] = aM.i32[i3];
   1.425 +  m.i32[1] = aM.i32[i2];
   1.426 +  m.i32[2] = aM.i32[i1];
   1.427 +  m.i32[3] = aM.i32[i0];
   1.428 +  return m;
   1.429 +}
   1.430 +
   1.431 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.432 +inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
   1.433 +{
   1.434 +  AssertIndex<i0>();
   1.435 +  AssertIndex<i1>();
   1.436 +  AssertIndex<i2>();
   1.437 +  AssertIndex<i3>();
   1.438 +  Scalari16x8_t m = aM;
   1.439 +  m.i16[0] = aM.i16[i3];
   1.440 +  m.i16[1] = aM.i16[i2];
   1.441 +  m.i16[2] = aM.i16[i1];
   1.442 +  m.i16[3] = aM.i16[i0];
   1.443 +  return m;
   1.444 +}
   1.445 +
   1.446 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.447 +inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
   1.448 +{
   1.449 +  AssertIndex<i0>();
   1.450 +  AssertIndex<i1>();
   1.451 +  AssertIndex<i2>();
   1.452 +  AssertIndex<i3>();
   1.453 +  Scalari16x8_t m = aM;
   1.454 +  m.i16[4 + 0] = aM.i16[4 + i3];
   1.455 +  m.i16[4 + 1] = aM.i16[4 + i2];
   1.456 +  m.i16[4 + 2] = aM.i16[4 + i1];
   1.457 +  m.i16[4 + 3] = aM.i16[4 + i0];
   1.458 +  return m;
   1.459 +}
   1.460 +
   1.461 +template<int8_t aIndexLo, int8_t aIndexHi>
   1.462 +inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
   1.463 +{
   1.464 +  AssertIndex<aIndexLo>();
   1.465 +  AssertIndex<aIndexHi>();
   1.466 +  Scalaru16x8_t m;
   1.467 +  int16_t chosenValueLo = aM.u16[aIndexLo];
   1.468 +  m.u16[0] = chosenValueLo;
   1.469 +  m.u16[1] = chosenValueLo;
   1.470 +  m.u16[2] = chosenValueLo;
   1.471 +  m.u16[3] = chosenValueLo;
   1.472 +  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
   1.473 +  m.u16[4] = chosenValueHi;
   1.474 +  m.u16[5] = chosenValueHi;
   1.475 +  m.u16[6] = chosenValueHi;
   1.476 +  m.u16[7] = chosenValueHi;
   1.477 +  return m;
   1.478 +}
   1.479 +
   1.480 +inline Scalaru8x16_t
   1.481 +InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
   1.482 +{
   1.483 +  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
   1.484 +                              m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
   1.485 +                              m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
   1.486 +                              m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
   1.487 +}
   1.488 +
   1.489 +inline Scalaru8x16_t
   1.490 +InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
   1.491 +{
   1.492 +  return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
   1.493 +                              m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
   1.494 +                              m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
   1.495 +                              m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
   1.496 +}
   1.497 +
   1.498 +inline Scalaru16x8_t
   1.499 +InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
   1.500 +{
   1.501 +  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
   1.502 +                               m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
   1.503 +}
   1.504 +
   1.505 +inline Scalaru16x8_t
   1.506 +InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
   1.507 +{
   1.508 +  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
   1.509 +                               m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
   1.510 +}
   1.511 +
   1.512 +inline Scalari32x4_t
   1.513 +InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
   1.514 +{
   1.515 +  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
   1.516 +}
   1.517 +
   1.518 +inline Scalari16x8_t
   1.519 +UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
   1.520 +{
   1.521 +  Scalari16x8_t m;
   1.522 +  m.i16[0] = aM.u8[0];
   1.523 +  m.i16[1] = aM.u8[1];
   1.524 +  m.i16[2] = aM.u8[2];
   1.525 +  m.i16[3] = aM.u8[3];
   1.526 +  m.i16[4] = aM.u8[4];
   1.527 +  m.i16[5] = aM.u8[5];
   1.528 +  m.i16[6] = aM.u8[6];
   1.529 +  m.i16[7] = aM.u8[7];
   1.530 +  return m;
   1.531 +}
   1.532 +
   1.533 +inline Scalari16x8_t
   1.534 +UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
   1.535 +{
   1.536 +  Scalari16x8_t m;
   1.537 +  m.i16[0] = aM.u8[8+0];
   1.538 +  m.i16[1] = aM.u8[8+1];
   1.539 +  m.i16[2] = aM.u8[8+2];
   1.540 +  m.i16[3] = aM.u8[8+3];
   1.541 +  m.i16[4] = aM.u8[8+4];
   1.542 +  m.i16[5] = aM.u8[8+5];
   1.543 +  m.i16[6] = aM.u8[8+6];
   1.544 +  m.i16[7] = aM.u8[8+7];
   1.545 +  return m;
   1.546 +}
   1.547 +
   1.548 +inline Scalaru16x8_t
   1.549 +UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
   1.550 +{
   1.551 +  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
   1.552 +                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
   1.553 +}
   1.554 +
   1.555 +inline Scalaru16x8_t
   1.556 +UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
   1.557 +{
   1.558 +  return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
   1.559 +                                aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
   1.560 +}
   1.561 +
   1.562 +template<uint8_t aNumBytes>
   1.563 +inline Scalaru8x16_t
   1.564 +Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
   1.565 +{
   1.566 +  Scalaru8x16_t m;
   1.567 +  for (uint8_t i = 0; i < 16; i++) {
   1.568 +    uint8_t sourceByte = i + aNumBytes;
   1.569 +    m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
   1.570 +  }
   1.571 +  return m;
   1.572 +}
   1.573 +
   1.574 +template<typename T>
   1.575 +inline int16_t
   1.576 +SaturateTo16(T a)
   1.577 +{
   1.578 +  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
   1.579 +}
   1.580 +
   1.581 +inline Scalari16x8_t
   1.582 +PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
   1.583 +{
   1.584 +  Scalari16x8_t m;
   1.585 +  m.i16[0] = SaturateTo16(m1.i32[0]);
   1.586 +  m.i16[1] = SaturateTo16(m1.i32[1]);
   1.587 +  m.i16[2] = SaturateTo16(m1.i32[2]);
   1.588 +  m.i16[3] = SaturateTo16(m1.i32[3]);
   1.589 +  m.i16[4] = SaturateTo16(m2.i32[0]);
   1.590 +  m.i16[5] = SaturateTo16(m2.i32[1]);
   1.591 +  m.i16[6] = SaturateTo16(m2.i32[2]);
   1.592 +  m.i16[7] = SaturateTo16(m2.i32[3]);
   1.593 +  return m;
   1.594 +}
   1.595 +
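          +// Clamp to [0, INT16_MAX]: the mask -(a >= 0) zeroes out negative inputs before
          +// taking the minimum. SaturateTo8 below uses the same idiom with an upper bound of 255.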
   1.596 +template<typename T>
   1.597 +inline uint16_t
   1.598 +SaturateToU16(T a)
   1.599 +{
   1.600 +  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
   1.601 +}
   1.602 +
   1.603 +inline Scalaru16x8_t
   1.604 +PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
   1.605 +{
   1.606 +  Scalaru16x8_t m;
   1.607 +  m.u16[0] = SaturateToU16(m1.i32[0]);
   1.608 +  m.u16[1] = SaturateToU16(m1.i32[1]);
   1.609 +  m.u16[2] = SaturateToU16(m1.i32[2]);
   1.610 +  m.u16[3] = SaturateToU16(m1.i32[3]);
   1.611 +  m.u16[4] = SaturateToU16(m2.i32[0]);
   1.612 +  m.u16[5] = SaturateToU16(m2.i32[1]);
   1.613 +  m.u16[6] = SaturateToU16(m2.i32[2]);
   1.614 +  m.u16[7] = SaturateToU16(m2.i32[3]);
   1.615 +  return m;
   1.616 +}
   1.617 +
   1.618 +template<typename T>
   1.619 +inline uint8_t
   1.620 +SaturateTo8(T a)
   1.621 +{
   1.622 +  return uint8_t(umin(a & -(a >= 0), 255));
   1.623 +}
   1.624 +
   1.625 +inline Scalaru8x16_t
   1.626 +PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
   1.627 +{
   1.628 +  Scalaru8x16_t m;
   1.629 +  m.u8[0]  = SaturateTo8(m1.i32[0]);
   1.630 +  m.u8[1]  = SaturateTo8(m1.i32[1]);
   1.631 +  m.u8[2]  = SaturateTo8(m1.i32[2]);
   1.632 +  m.u8[3]  = SaturateTo8(m1.i32[3]);
   1.633 +  m.u8[4]  = SaturateTo8(m2.i32[0]);
   1.634 +  m.u8[5]  = SaturateTo8(m2.i32[1]);
   1.635 +  m.u8[6]  = SaturateTo8(m2.i32[2]);
   1.636 +  m.u8[7]  = SaturateTo8(m2.i32[3]);
   1.637 +  m.u8[8]  = SaturateTo8(m3.i32[0]);
   1.638 +  m.u8[9]  = SaturateTo8(m3.i32[1]);
   1.639 +  m.u8[10] = SaturateTo8(m3.i32[2]);
   1.640 +  m.u8[11] = SaturateTo8(m3.i32[3]);
   1.641 +  m.u8[12] = SaturateTo8(m4.i32[0]);
   1.642 +  m.u8[13] = SaturateTo8(m4.i32[1]);
   1.643 +  m.u8[14] = SaturateTo8(m4.i32[2]);
   1.644 +  m.u8[15] = SaturateTo8(m4.i32[3]);
   1.645 +  return m;
   1.646 +}
   1.647 +
   1.648 +inline Scalaru8x16_t
   1.649 +PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
   1.650 +{
   1.651 +  Scalaru8x16_t m;
   1.652 +  m.u8[0]  = SaturateTo8(m1.i16[0]);
   1.653 +  m.u8[1]  = SaturateTo8(m1.i16[1]);
   1.654 +  m.u8[2]  = SaturateTo8(m1.i16[2]);
   1.655 +  m.u8[3]  = SaturateTo8(m1.i16[3]);
   1.656 +  m.u8[4]  = SaturateTo8(m1.i16[4]);
   1.657 +  m.u8[5]  = SaturateTo8(m1.i16[5]);
   1.658 +  m.u8[6]  = SaturateTo8(m1.i16[6]);
   1.659 +  m.u8[7]  = SaturateTo8(m1.i16[7]);
   1.660 +  m.u8[8]  = SaturateTo8(m2.i16[0]);
   1.661 +  m.u8[9]  = SaturateTo8(m2.i16[1]);
   1.662 +  m.u8[10] = SaturateTo8(m2.i16[2]);
   1.663 +  m.u8[11] = SaturateTo8(m2.i16[3]);
   1.664 +  m.u8[12] = SaturateTo8(m2.i16[4]);
   1.665 +  m.u8[13] = SaturateTo8(m2.i16[5]);
   1.666 +  m.u8[14] = SaturateTo8(m2.i16[6]);
   1.667 +  m.u8[15] = SaturateTo8(m2.i16[7]);
   1.668 +  return m;
   1.669 +}
   1.670 +
   1.671 +// Fast approximate division by 255. It has the property that
   1.672 +// for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
   1.673 +// But it only uses two adds and two shifts instead of an
   1.674 +// integer division (which is expensive on many processors).
   1.675 +//
   1.676 +// equivalent to v/255
   1.677 +template<class B, class A>
   1.678 +inline B FastDivideBy255(A v)
   1.679 +{
   1.680 +  return ((v << 8) + v + 255) >> 16;
   1.681 +}
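          +
          +// Worked example of the identity above: for v = 255*255 = 65025,
          +// (65025 << 8) + 65025 + 255 = 16646400 + 65280 = 16711680, and
          +// 16711680 >> 16 = 255, which equals 65025 / 255.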
   1.682 +
   1.683 +inline Scalaru16x8_t
   1.684 +FastDivideBy255_16(Scalaru16x8_t m)
   1.685 +{
   1.686 +  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
   1.687 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
   1.688 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
   1.689 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
   1.690 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
   1.691 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
   1.692 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
   1.693 +                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
   1.694 +}
   1.695 +
   1.696 +inline Scalari32x4_t
   1.697 +FastDivideBy255(Scalari32x4_t m)
   1.698 +{
   1.699 +  return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
   1.700 +                               FastDivideBy255<int32_t>(m.i32[1]),
   1.701 +                               FastDivideBy255<int32_t>(m.i32[2]),
   1.702 +                               FastDivideBy255<int32_t>(m.i32[3]));
   1.703 +}
   1.704 +
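          +// Bitwise select: for each bit, the result takes b where the mask bit is set and
          +// a where it is clear. Masks are usually all-ones or all-zero per lane.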
   1.705 +inline Scalaru8x16_t
   1.706 +Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
   1.707 +{
   1.708 +  return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
   1.709 +                              (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
   1.710 +                              (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
   1.711 +                              (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
   1.712 +                              (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
   1.713 +                              (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
   1.714 +                              (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
   1.715 +                              (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
   1.716 +                              (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
   1.717 +                              (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
   1.718 +                              (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
   1.719 +                              (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
   1.720 +                              (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
   1.721 +                              (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
   1.722 +                              (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
   1.723 +                              (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
   1.724 +}
   1.725 +
   1.726 +inline Scalari32x4_t
   1.727 +Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
   1.728 +{
   1.729 +  return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
   1.730 +                               (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
   1.731 +                               (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
   1.732 +                               (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
   1.733 +}
   1.734 +
   1.735 +inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
   1.736 +{
   1.737 +  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
   1.738 +                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
   1.739 +                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
   1.740 +                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
   1.741 +}
   1.742 +
   1.743 +inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
   1.744 +{
   1.745 +  return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
   1.746 +                                a.f32[1] * wa + b.f32[1] * wb,
   1.747 +                                a.f32[2] * wa + b.f32[2] * wb,
   1.748 +                                a.f32[3] * wa + b.f32[3] * wb);
   1.749 +}
   1.750 +
   1.751 +inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
   1.752 +{
   1.753 +  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
   1.754 +                                fabs(a.f32[1]),
   1.755 +                                fabs(a.f32[2]),
   1.756 +                                fabs(a.f32[3]));
   1.757 +}
   1.758 +
   1.759 +inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
   1.760 +{
   1.761 +  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
   1.762 +                                a.f32[1] + b.f32[1],
   1.763 +                                a.f32[2] + b.f32[2],
   1.764 +                                a.f32[3] + b.f32[3]);
   1.765 +}
   1.766 +
   1.767 +inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
   1.768 +{
   1.769 +  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
   1.770 +                                a.f32[1] * b.f32[1],
   1.771 +                                a.f32[2] * b.f32[2],
   1.772 +                                a.f32[3] * b.f32[3]);
   1.773 +}
   1.774 +
   1.775 +inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
   1.776 +{
   1.777 +  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
   1.778 +                                a.f32[1] / b.f32[1],
   1.779 +                                a.f32[2] / b.f32[2],
   1.780 +                                a.f32[3] / b.f32[3]);
   1.781 +}
   1.782 +
   1.783 +template<uint8_t aIndex>
   1.784 +inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
   1.785 +{
   1.786 +  AssertIndex<aIndex>();
   1.787 +  return FromF32<Scalarf32x4_t>(m.f32[aIndex],
   1.788 +                                m.f32[aIndex],
   1.789 +                                m.f32[aIndex],
   1.790 +                                m.f32[aIndex]);
   1.791 +}
   1.792 +
   1.793 +inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
   1.794 +{
   1.795 +  return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)),
   1.796 +                               int32_t(floor(m.f32[1] + 0.5f)),
   1.797 +                               int32_t(floor(m.f32[2] + 0.5f)),
   1.798 +                               int32_t(floor(m.f32[3] + 0.5f)));
   1.799 +}
   1.800 +
   1.801 +#ifdef SIMD_COMPILE_SSE2
   1.802 +
   1.803 +// SSE2
   1.804 +
   1.805 +template<>
   1.806 +inline __m128i
   1.807 +Load8<__m128i>(const uint8_t* aSource)
   1.808 +{
   1.809 +  return _mm_load_si128((const __m128i*)aSource);
   1.810 +}
   1.811 +
   1.812 +inline void Store8(uint8_t* aTarget, __m128i aM)
   1.813 +{
   1.814 +  _mm_store_si128((__m128i*)aTarget, aM);
   1.815 +}
   1.816 +
   1.817 +template<>
   1.818 +inline __m128i FromZero8<__m128i>()
   1.819 +{
   1.820 +  return _mm_setzero_si128();
   1.821 +}
   1.822 +
   1.823 +template<>
   1.824 +inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
   1.825 +                              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
   1.826 +{
          +  // Each 16-bit lane holds two consecutive bytes, low byte first (little endian).
    1.827 +  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
    1.828 +                        (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
   1.829 +}
   1.830 +
   1.831 +template<>
   1.832 +inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
   1.833 +{
   1.834 +  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
   1.835 +}
   1.836 +
   1.837 +template<>
   1.838 +inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
   1.839 +{
   1.840 +  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
   1.841 +}
   1.842 +
   1.843 +template<>
   1.844 +inline __m128i FromI16<__m128i>(int16_t a)
   1.845 +{
   1.846 +  return _mm_set1_epi16(a);
   1.847 +}
   1.848 +
   1.849 +template<>
   1.850 +inline __m128i FromU16<__m128i>(uint16_t a)
   1.851 +{
   1.852 +  return _mm_set1_epi16((int16_t)a);
   1.853 +}
   1.854 +
   1.855 +template<>
   1.856 +inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
   1.857 +{
   1.858 +  return _mm_setr_epi32(a, b, c, d);
   1.859 +}
   1.860 +
   1.861 +template<>
   1.862 +inline __m128i From32<__m128i>(int32_t a)
   1.863 +{
   1.864 +  return _mm_set1_epi32(a);
   1.865 +}
   1.866 +
   1.867 +template<>
   1.868 +inline __m128 FromF32<__m128>(float a, float b, float c, float d)
   1.869 +{
   1.870 +  return _mm_setr_ps(a, b, c, d);
   1.871 +}
   1.872 +
   1.873 +template<>
   1.874 +inline __m128 FromF32<__m128>(float a)
   1.875 +{
   1.876 +  return _mm_set1_ps(a);
   1.877 +}
   1.878 +
   1.879 +template<int32_t aNumberOfBits>
   1.880 +inline __m128i ShiftRight16(__m128i aM)
   1.881 +{
   1.882 +  return _mm_srli_epi16(aM, aNumberOfBits);
   1.883 +}
   1.884 +
   1.885 +template<int32_t aNumberOfBits>
   1.886 +inline __m128i ShiftRight32(__m128i aM)
   1.887 +{
   1.888 +  return _mm_srai_epi32(aM, aNumberOfBits);
   1.889 +}
   1.890 +
   1.891 +inline __m128i Add16(__m128i aM1, __m128i aM2)
   1.892 +{
   1.893 +  return _mm_add_epi16(aM1, aM2);
   1.894 +}
   1.895 +
   1.896 +inline __m128i Add32(__m128i aM1, __m128i aM2)
   1.897 +{
   1.898 +  return _mm_add_epi32(aM1, aM2);
   1.899 +}
   1.900 +
   1.901 +inline __m128i Sub16(__m128i aM1, __m128i aM2)
   1.902 +{
   1.903 +  return _mm_sub_epi16(aM1, aM2);
   1.904 +}
   1.905 +
   1.906 +inline __m128i Sub32(__m128i aM1, __m128i aM2)
   1.907 +{
   1.908 +  return _mm_sub_epi32(aM1, aM2);
   1.909 +}
   1.910 +
   1.911 +inline __m128i Min8(__m128i aM1, __m128i aM2)
   1.912 +{
   1.913 +  return _mm_min_epu8(aM1, aM2);
   1.914 +}
   1.915 +
   1.916 +inline __m128i Max8(__m128i aM1, __m128i aM2)
   1.917 +{
   1.918 +  return _mm_max_epu8(aM1, aM2);
   1.919 +}
   1.920 +
   1.921 +inline __m128i Min32(__m128i aM1, __m128i aM2)
   1.922 +{
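          +  // SSE2 has no packed 32-bit min/max (_mm_min_epi32/_mm_max_epi32 are SSE4.1),
          +  // so select via a compare mask: min(m1, m2) = m1 - ((m1 - m2) & (m1 > m2)).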
   1.923 +  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
   1.924 +  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
   1.925 +  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
   1.926 +}
   1.927 +
   1.928 +inline __m128i Max32(__m128i aM1, __m128i aM2)
   1.929 +{
   1.930 +  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
   1.931 +  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
   1.932 +  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
   1.933 +}
   1.934 +
   1.935 +inline __m128i Mul16(__m128i aM1, __m128i aM2)
   1.936 +{
   1.937 +  return _mm_mullo_epi16(aM1, aM2);
   1.938 +}
   1.939 +
   1.940 +inline __m128i MulU16(__m128i aM1, __m128i aM2)
   1.941 +{
   1.942 +  return _mm_mullo_epi16(aM1, aM2);
   1.943 +}
   1.944 +
   1.945 +inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
   1.946 +                                __m128i aFactorsA2B2,
   1.947 +                                __m128i& aProductA,
   1.948 +                                __m128i& aProductB)
   1.949 +{
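          +  // _mm_mullo_epi16 and _mm_mulhi_epi16 produce the low and high 16 bits of each
          +  // signed 16x16 -> 32 product; interleaving them reassembles the 32-bit products.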
   1.950 +  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
   1.951 +  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
   1.952 +  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
   1.953 +  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
   1.954 +}
   1.955 +
   1.956 +inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
   1.957 +                                  __m128i aFactorsB)
   1.958 +{
   1.959 +  return _mm_madd_epi16(aFactorsA, aFactorsB);
   1.960 +}
   1.961 +
   1.962 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.963 +inline __m128i Shuffle32(__m128i aM)
   1.964 +{
   1.965 +  AssertIndex<i0>();
   1.966 +  AssertIndex<i1>();
   1.967 +  AssertIndex<i2>();
   1.968 +  AssertIndex<i3>();
   1.969 +  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
   1.970 +}
   1.971 +
   1.972 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.973 +inline __m128i ShuffleLo16(__m128i aM)
   1.974 +{
   1.975 +  AssertIndex<i0>();
   1.976 +  AssertIndex<i1>();
   1.977 +  AssertIndex<i2>();
   1.978 +  AssertIndex<i3>();
   1.979 +  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
   1.980 +}
   1.981 +
   1.982 +template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
   1.983 +inline __m128i ShuffleHi16(__m128i aM)
   1.984 +{
   1.985 +  AssertIndex<i0>();
   1.986 +  AssertIndex<i1>();
   1.987 +  AssertIndex<i2>();
   1.988 +  AssertIndex<i3>();
   1.989 +  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
   1.990 +}
   1.991 +
   1.992 +template<int8_t aIndex>
   1.993 +inline __m128i Splat32(__m128i aM)
   1.994 +{
   1.995 +  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
   1.996 +}
   1.997 +
   1.998 +template<int8_t aIndex>
   1.999 +inline __m128i Splat32On8(__m128i aM)
  1.1000 +{
  1.1001 +  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
  1.1002 +}
  1.1003 +
  1.1004 +template<int8_t aIndexLo, int8_t aIndexHi>
  1.1005 +inline __m128i Splat16(__m128i aM)
  1.1006 +{
  1.1007 +  AssertIndex<aIndexLo>();
  1.1008 +  AssertIndex<aIndexHi>();
  1.1009 +  return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
  1.1010 +           ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
  1.1011 +}
  1.1012 +
  1.1013 +inline __m128i
  1.1014 +UnpackLo8x8ToI16x8(__m128i m)
  1.1015 +{
  1.1016 +  __m128i zero = _mm_set1_epi8(0);
  1.1017 +  return _mm_unpacklo_epi8(m, zero);
  1.1018 +}
  1.1019 +
  1.1020 +inline __m128i
  1.1021 +UnpackHi8x8ToI16x8(__m128i m)
  1.1022 +{
  1.1023 +  __m128i zero = _mm_set1_epi8(0);
  1.1024 +  return _mm_unpackhi_epi8(m, zero);
  1.1025 +}
  1.1026 +
  1.1027 +inline __m128i
  1.1028 +UnpackLo8x8ToU16x8(__m128i m)
  1.1029 +{
  1.1030 +  __m128i zero = _mm_set1_epi8(0);
  1.1031 +  return _mm_unpacklo_epi8(m, zero);
  1.1032 +}
  1.1033 +
  1.1034 +inline __m128i
  1.1035 +UnpackHi8x8ToU16x8(__m128i m)
  1.1036 +{
  1.1037 +  __m128i zero = _mm_set1_epi8(0);
  1.1038 +  return _mm_unpackhi_epi8(m, zero);
  1.1039 +}
  1.1040 +
  1.1041 +inline __m128i
  1.1042 +InterleaveLo8(__m128i m1, __m128i m2)
  1.1043 +{
  1.1044 +  return _mm_unpacklo_epi8(m1, m2);
  1.1045 +}
  1.1046 +
  1.1047 +inline __m128i
  1.1048 +InterleaveHi8(__m128i m1, __m128i m2)
  1.1049 +{
  1.1050 +  return _mm_unpackhi_epi8(m1, m2);
  1.1051 +}
  1.1052 +
  1.1053 +inline __m128i
  1.1054 +InterleaveLo16(__m128i m1, __m128i m2)
  1.1055 +{
  1.1056 +  return _mm_unpacklo_epi16(m1, m2);
  1.1057 +}
  1.1058 +
  1.1059 +inline __m128i
  1.1060 +InterleaveHi16(__m128i m1, __m128i m2)
  1.1061 +{
  1.1062 +  return _mm_unpackhi_epi16(m1, m2);
  1.1063 +}
  1.1064 +
  1.1065 +inline __m128i
  1.1066 +InterleaveLo32(__m128i m1, __m128i m2)
  1.1067 +{
  1.1068 +  return _mm_unpacklo_epi32(m1, m2);
  1.1069 +}
  1.1070 +
  1.1071 +template<uint8_t aNumBytes>
  1.1072 +inline __m128i
  1.1073 +Rotate8(__m128i a1234, __m128i a5678)
  1.1074 +{
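          +  // Treat (a1234, a5678) as one 32-byte buffer and return bytes
          +  // [aNumBytes, aNumBytes + 16): shift a1234 right, a5678 left, then OR.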
  1.1075 +  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes));
  1.1076 +}
  1.1077 +
  1.1078 +inline __m128i
  1.1079 +PackAndSaturate32To16(__m128i m1, __m128i m2)
  1.1080 +{
  1.1081 +  return _mm_packs_epi32(m1, m2);
  1.1082 +}
  1.1083 +
  1.1084 +inline __m128i
  1.1085 +PackAndSaturate32ToU16(__m128i m1, __m128i m2)
  1.1086 +{
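          +  // Note: SSE2 has no unsigned 32 -> 16 pack; _mm_packs_epi32 saturates to the
          +  // signed 16-bit range, which differs from the scalar SaturateToU16 for values
          +  // outside [0, INT16_MAX].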
  1.1087 +  return _mm_packs_epi32(m1, m2);
  1.1088 +}
  1.1089 +
  1.1090 +inline __m128i
  1.1091 +PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
  1.1092 +{
  1.1093 +  // Pack into 8 16bit signed integers (saturating).
  1.1094 +  __m128i m12 = _mm_packs_epi32(m1, m2);
  1.1095 +  __m128i m34 = _mm_packs_epi32(m3, m4);
  1.1096 +
  1.1097 +  // Pack into 16 8bit unsigned integers (saturating).
  1.1098 +  return _mm_packus_epi16(m12, m34);
  1.1099 +}
  1.1100 +
  1.1101 +inline __m128i
  1.1102 +PackAndSaturate16To8(__m128i m1, __m128i m2)
  1.1103 +{
  1.1104 +  // Pack into 16 8bit unsigned integers (saturating).
  1.1105 +  return _mm_packus_epi16(m1, m2);
  1.1106 +}
  1.1107 +
  1.1108 +inline __m128i
  1.1109 +FastDivideBy255(__m128i m)
  1.1110 +{
  1.1111 +  // v = m << 8
  1.1112 +  __m128i v = _mm_slli_epi32(m, 8);
  1.1113 +  // v = v + (m + (255,255,255,255))
  1.1114 +  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  1.1115 +  // v = v >> 16
  1.1116 +  return _mm_srai_epi32(v, 16);
  1.1117 +}
  1.1118 +
  1.1119 +inline __m128i
  1.1120 +FastDivideBy255_16(__m128i m)
  1.1121 +{
  1.1122 +  __m128i zero = _mm_set1_epi16(0);
  1.1123 +  __m128i lo = _mm_unpacklo_epi16(m, zero);
  1.1124 +  __m128i hi = _mm_unpackhi_epi16(m, zero);
  1.1125 +  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
  1.1126 +}
  1.1127 +
  1.1128 +inline __m128i
  1.1129 +Pick(__m128i mask, __m128i a, __m128i b)
  1.1130 +{
  1.1131 +  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
  1.1132 +}
  1.1133 +
  1.1134 +inline __m128 MixF32(__m128 a, __m128 b, float t)
  1.1135 +{
  1.1136 +  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
  1.1137 +}
  1.1138 +
  1.1139 +inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
  1.1140 +{
  1.1141 +  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb)));
  1.1142 +}
  1.1143 +
  1.1144 +inline __m128 AbsF32(__m128 a)
  1.1145 +{
  1.1146 +  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
  1.1147 +}
  1.1148 +
  1.1149 +inline __m128 AddF32(__m128 a, __m128 b)
  1.1150 +{
  1.1151 +  return _mm_add_ps(a, b);
  1.1152 +}
  1.1153 +
  1.1154 +inline __m128 MulF32(__m128 a, __m128 b)
  1.1155 +{
  1.1156 +  return _mm_mul_ps(a, b);
  1.1157 +}
  1.1158 +
  1.1159 +inline __m128 DivF32(__m128 a, __m128 b)
  1.1160 +{
  1.1161 +  return _mm_div_ps(a, b);
  1.1162 +}
  1.1163 +
  1.1164 +template<uint8_t aIndex>
  1.1165 +inline __m128 SplatF32(__m128 m)
  1.1166 +{
  1.1167 +  AssertIndex<aIndex>();
  1.1168 +  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
  1.1169 +}
  1.1170 +
  1.1171 +inline __m128i F32ToI32(__m128 m)
  1.1172 +{
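          +  // Note: _mm_cvtps_epi32 rounds using the current MXCSR mode (round-to-nearest-even
          +  // by default), while the scalar F32ToI32 rounds halfway cases upward via floor(x + 0.5).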
  1.1173 +  return _mm_cvtps_epi32(m);
  1.1174 +}
  1.1175 +
  1.1176 +#endif // SIMD_COMPILE_SSE2
  1.1177 +
  1.1178 +} // namespace simd
  1.1179 +
  1.1180 +} // namespace gfx
  1.1181 +} // namespace mozilla
  1.1182 +
  1.1183 +#endif // _MOZILLA_GFX_SIMD_H_
