gfx/2d/FilterProcessingSIMD-inl.h

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/2d/FilterProcessingSIMD-inl.h	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1081 @@
     1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     1.5 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "FilterProcessing.h"
    1.10 +
    1.11 +#include "SIMD.h"
    1.12 +#include "SVGTurbulenceRenderer-inl.h"
    1.13 +
    1.14 +namespace mozilla {
    1.15 +namespace gfx {
    1.16 +
    1.17 +template<typename u8x16_t>
    1.18 +inline TemporaryRef<DataSourceSurface>
    1.19 +ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
    1.20 +{
    1.21 +  IntSize size = aSurface->GetSize();
    1.22 +  RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
    1.23 +  RefPtr<DataSourceSurface> output =
    1.24 +    Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
    1.25 +  uint8_t *inputData = input->GetData();
    1.26 +  uint8_t *outputData = output->GetData();
    1.27 +  int32_t inputStride = input->Stride();
    1.28 +  int32_t outputStride = output->Stride();
    1.29 +  switch (input->GetFormat()) {
    1.30 +    case SurfaceFormat::B8G8R8A8:
    1.31 +      output = input;
    1.32 +      break;
    1.33 +    case SurfaceFormat::B8G8R8X8:
    1.34 +      for (int32_t y = 0; y < size.height; y++) {
    1.35 +        for (int32_t x = 0; x < size.width; x++) {
    1.36 +          int32_t inputIndex = y * inputStride + 4 * x;
    1.37 +          int32_t outputIndex = y * outputStride + 4 * x;
    1.38 +          outputData[outputIndex + 0] = inputData[inputIndex + 0];
    1.39 +          outputData[outputIndex + 1] = inputData[inputIndex + 1];
    1.40 +          outputData[outputIndex + 2] = inputData[inputIndex + 2];
    1.41 +          outputData[outputIndex + 3] = 255;
    1.42 +        }
    1.43 +      }
    1.44 +      break;
    1.45 +    case SurfaceFormat::R8G8B8A8:
    1.46 +      for (int32_t y = 0; y < size.height; y++) {
    1.47 +        for (int32_t x = 0; x < size.width; x++) {
    1.48 +          int32_t inputIndex = y * inputStride + 4 * x;
    1.49 +          int32_t outputIndex = y * outputStride + 4 * x;
    1.50 +          outputData[outputIndex + 2] = inputData[inputIndex + 0];
    1.51 +          outputData[outputIndex + 1] = inputData[inputIndex + 1];
    1.52 +          outputData[outputIndex + 0] = inputData[inputIndex + 2];
    1.53 +          outputData[outputIndex + 3] = inputData[inputIndex + 3];
    1.54 +        }
    1.55 +      }
    1.56 +      break;
    1.57 +    case SurfaceFormat::R8G8B8X8:
    1.58 +      for (int32_t y = 0; y < size.height; y++) {
    1.59 +        for (int32_t x = 0; x < size.width; x++) {
    1.60 +          int32_t inputIndex = y * inputStride + 4 * x;
    1.61 +          int32_t outputIndex = y * outputStride + 4 * x;
    1.62 +          outputData[outputIndex + 2] = inputData[inputIndex + 0];
    1.63 +          outputData[outputIndex + 1] = inputData[inputIndex + 1];
    1.64 +          outputData[outputIndex + 0] = inputData[inputIndex + 2];
    1.65 +          outputData[outputIndex + 3] = 255;
    1.66 +        }
    1.67 +      }
    1.68 +      break;
    1.69 +    case SurfaceFormat::A8:
    1.70 +      for (int32_t y = 0; y < size.height; y++) {
    1.71 +        for (int32_t x = 0; x < size.width; x += 16) {
    1.72 +          int32_t inputIndex = y * inputStride + x;
    1.73 +          int32_t outputIndex = y * outputStride + 4 * x;
    1.74 +          u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
    1.75 +          // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
    1.76 +          // interleaving with 0000000000000000 twice.
    1.77 +          u8x16_t zero = simd::FromZero8<u8x16_t>();
    1.78 +          u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
    1.79 +          u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
    1.80 +          u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
    1.81 +          u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
    1.82 +          u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
    1.83 +          u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
    1.84 +          simd::Store8(&outputData[outputIndex], p1To4);
    1.85 +          if ((x + 4) * 4 < outputStride) {
    1.86 +            simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
    1.87 +          }
    1.88 +          if ((x + 8) * 4 < outputStride) {
    1.89 +            simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
    1.90 +          }
    1.91 +          if ((x + 12) * 4 < outputStride) {
    1.92 +            simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
    1.93 +          }
    1.94 +        }
    1.95 +      }
    1.96 +      break;
    1.97 +    default:
    1.98 +      output = nullptr;
    1.99 +      break;
   1.100 +  }
   1.101 +  return output;
   1.102 +}
   1.103 +
   1.104 +template<typename u8x16_t>
   1.105 +inline void
   1.106 +ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
   1.107 +{
   1.108 +  for (int32_t y = 0; y < size.height; y++) {
   1.109 +    for (int32_t x = 0; x < size.width; x += 16) {
   1.110 +      // Process 16 pixels at a time.
   1.111 +      // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
   1.112 +      int32_t sourceIndex = y * sourceStride + 4 * x;
   1.113 +      int32_t targetIndex = y * alphaStride + x;
   1.114 +
   1.115 +      u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
   1.116 +      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
   1.117 +      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
   1.118 +      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
   1.119 +
   1.120 +      bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   1.121 +      if (4 * (x + 4) < sourceStride) {
   1.122 +        bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
   1.123 +      }
   1.124 +      if (4 * (x + 8) < sourceStride) {
   1.125 +        bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
   1.126 +      }
   1.127 +      if (4 * (x + 12) < sourceStride) {
   1.128 +        bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
   1.129 +      }
   1.130 +
   1.131 +      u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   1.132 +      u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   1.133 +      u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   1.134 +      u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   1.135 +      u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   1.136 +      u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   1.137 +      u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   1.138 +      u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   1.139 +      u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   1.140 +      u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   1.141 +      u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   1.142 +
   1.143 +      simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
   1.144 +    }
   1.145 +  }
   1.146 +}
   1.147 +
   1.148 +// This function calculates the result color values for four pixels, but for
   1.149 +// only two color channels - either b & r or g & a. However, the a result will
   1.150 +// not be used.
   1.151 +// source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
   1.152 +// sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
   1.153 +// alpha of all four pixels (and both aaaa's are the same).
   1.154 +// blendendComponent1 and blendedComponent2 are the out parameters.
   1.155 +template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
   1.156 +inline void
   1.157 +BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
   1.158 +                               i16x8_t dest, const i16x8_t& destAlpha,
   1.159 +                               i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
   1.160 +{
   1.161 +  i16x8_t x255 = simd::FromI16<i16x8_t>(255);
   1.162 +
   1.163 +  switch (aBlendMode) {
   1.164 +
   1.165 +    case BLEND_MODE_MULTIPLY:
   1.166 +    {
   1.167 +      // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
   1.168 +      i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   1.169 +      i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   1.170 +      i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
   1.171 +
   1.172 +      i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
   1.173 +      i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
   1.174 +      blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
   1.175 +      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   1.176 +
   1.177 +      i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
   1.178 +      i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
   1.179 +      blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
   1.180 +      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   1.181 +
   1.182 +      break;
   1.183 +    }
   1.184 +
   1.185 +    case BLEND_MODE_SCREEN:
   1.186 +    {
   1.187 +      // val = 255 * (source + dest) + (0 - dest) * source;
   1.188 +      i16x8_t sourcePlusDest = simd::Add16(source, dest);
   1.189 +      i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
   1.190 +
   1.191 +      i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
   1.192 +      i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
   1.193 +      blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
   1.194 +      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   1.195 +
   1.196 +      i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
   1.197 +      i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
   1.198 +      blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
   1.199 +      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   1.200 +
   1.201 +      break;
   1.202 +    }
   1.203 +
   1.204 +    case BLEND_MODE_DARKEN:
   1.205 +    case BLEND_MODE_LIGHTEN:
   1.206 +    {
   1.207 +      // Darken:
   1.208 +      // val = min((255 - destAlpha) * source + 255                 * dest,
   1.209 +      //           255               * source + (255 - sourceAlpha) * dest);
   1.210 +      //
   1.211 +      // Lighten:
   1.212 +      // val = max((255 - destAlpha) * source + 255                 * dest,
   1.213 +      //           255               * source + (255 - sourceAlpha) * dest);
   1.214 +
   1.215 +      i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   1.216 +      i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   1.217 +
   1.218 +      i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
   1.219 +      i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
   1.220 +      i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
   1.221 +      i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
   1.222 +      i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
   1.223 +      blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
   1.224 +      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   1.225 +
   1.226 +      i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
   1.227 +      i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
   1.228 +      i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
   1.229 +      i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
   1.230 +      i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
   1.231 +      blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
   1.232 +      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   1.233 +
   1.234 +      break;
   1.235 +    }
   1.236 +
   1.237 +  }
   1.238 +}
   1.239 +
   1.240 +// The alpha channel is subject to a different calculation than the RGB
   1.241 +// channels, and this calculation is the same for all blend modes:
   1.242 +// resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
   1.243 +template<typename i16x8_t, typename i32x4_t>
   1.244 +inline i32x4_t
   1.245 +BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
   1.246 +{
   1.247 +  // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
   1.248 +  // appropriately. The calculation is rewritten as follows:
   1.249 +  // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
   1.250 +  //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
   1.251 +  //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
   1.252 +  //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
   1.253 +  i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
   1.254 +  i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
   1.255 +  i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
   1.256 +  i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
   1.257 +  return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
   1.258 +}
   1.259 +
   1.260 +template<typename u8x16_t, typename i16x8_t>
   1.261 +inline void
   1.262 +UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
   1.263 +                           i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
   1.264 +{
   1.265 +  // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
   1.266 +  i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
   1.267 +  i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
   1.268 +  i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
   1.269 +  i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
   1.270 +  bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
   1.271 +  rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
   1.272 +}
   1.273 +
   1.274 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1.275 +inline u8x16_t
   1.276 +ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
   1.277 +                         i32x4_t rrrr1234, const i32x4_t& aaaa1234)
   1.278 +{
   1.279 +  // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
   1.280 +  i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
   1.281 +  i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
   1.282 +  i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
   1.283 +  i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
   1.284 +  i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
   1.285 +  i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
   1.286 +  return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
   1.287 +}
   1.288 +
   1.289 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
   1.290 +inline TemporaryRef<DataSourceSurface>
   1.291 +ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
   1.292 +{
   1.293 +  IntSize size = aInput1->GetSize();
   1.294 +  RefPtr<DataSourceSurface> target =
   1.295 +    Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
   1.296 +  if (!target) {
   1.297 +    return nullptr;
   1.298 +  }
   1.299 +
   1.300 +  uint8_t* source1Data = aInput1->GetData();
   1.301 +  uint8_t* source2Data = aInput2->GetData();
   1.302 +  uint8_t* targetData = target->GetData();
   1.303 +  int32_t targetStride = target->Stride();
   1.304 +  int32_t source1Stride = aInput1->Stride();
   1.305 +  int32_t source2Stride = aInput2->Stride();
   1.306 +
   1.307 +  for (int32_t y = 0; y < size.height; y++) {
   1.308 +    for (int32_t x = 0; x < size.width; x += 4) {
   1.309 +      int32_t targetIndex = y * targetStride + 4 * x;
   1.310 +      int32_t source1Index = y * source1Stride + 4 * x;
   1.311 +      int32_t source2Index = y * source2Stride + 4 * x;
   1.312 +
   1.313 +      u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
   1.314 +      u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
   1.315 +
   1.316 +      // The blending calculation for the RGB channels all need access to the
   1.317 +      // alpha channel of their pixel, and the alpha calculation is different,
   1.318 +      // so it makes sense to separate by channel.
   1.319 +
   1.320 +      i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
   1.321 +      i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
   1.322 +      UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
   1.323 +      UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
   1.324 +      i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
   1.325 +      i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
   1.326 +
   1.327 +      // We only use blendedB, blendedG and blendedR.
   1.328 +      i32x4_t blendedB, blendedG, blendedR, blendedA;
   1.329 +      BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
   1.330 +      BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
   1.331 +
   1.332 +      // Throw away blendedA and overwrite it with the correct blended alpha.
   1.333 +      blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
   1.334 +
   1.335 +      u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
   1.336 +      simd::Store8(&targetData[targetIndex], result1234);
   1.337 +    }
   1.338 +  }
   1.339 +
   1.340 +  return target;
   1.341 +}
   1.342 +
   1.343 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1.344 +static TemporaryRef<DataSourceSurface>
   1.345 +ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
   1.346 +                      BlendMode aBlendMode)
   1.347 +{
   1.348 +  switch (aBlendMode) {
   1.349 +    case BLEND_MODE_MULTIPLY:
   1.350 +      return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
   1.351 +    case BLEND_MODE_SCREEN:
   1.352 +      return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
   1.353 +    case BLEND_MODE_DARKEN:
   1.354 +      return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
   1.355 +    case BLEND_MODE_LIGHTEN:
   1.356 +      return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
   1.357 +    default:
   1.358 +      return nullptr;
   1.359 +  }
   1.360 +}
   1.361 +
   1.362 +template<MorphologyOperator Operator, typename u8x16_t>
   1.363 +static u8x16_t
   1.364 +Morph8(u8x16_t a, u8x16_t b)
   1.365 +{
   1.366 +  return Operator == MORPHOLOGY_OPERATOR_ERODE ?
   1.367 +    simd::Min8(a, b) : simd::Max8(a, b);
   1.368 +}
   1.369 +
   1.370 +// Set every pixel to the per-component minimum or maximum of the pixels around
   1.371 +// it that are up to aRadius pixels away from it (horizontally).
   1.372 +template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
   1.373 +inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   1.374 +                                           uint8_t* aDestData, int32_t aDestStride,
   1.375 +                                           const IntRect& aDestRect, int32_t aRadius)
   1.376 +{
   1.377 +  static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
   1.378 +                op == MORPHOLOGY_OPERATOR_DILATE,
   1.379 +                "unexpected morphology operator");
   1.380 +
   1.381 +  int32_t kernelSize = aRadius + 1 + aRadius;
   1.382 +  MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
   1.383 +  MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
   1.384 +  int32_t completeKernelSizeForFourPixels = kernelSize + 3;
   1.385 +  MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
   1.386 +             completeKernelSizeForFourPixels % 4 == 2);
   1.387 +
   1.388 +  // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
   1.389 +  // the way we need them to be.
   1.390 +
   1.391 +  IntRect sourceRect = aDestRect;
   1.392 +  sourceRect.Inflate(aRadius, 0);
   1.393 +
   1.394 +  for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
   1.395 +    int32_t kernelStartX = aDestRect.x - aRadius;
   1.396 +    for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
   1.397 +      // We process four pixels (16 color values) at a time.
   1.398 +      // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
   1.399 +      // source values can be read beyond that because the source is extended
   1.400 +      // by aRadius pixels.
   1.401 +
   1.402 +      int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
   1.403 +      u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   1.404 +      u8x16_t m1234 = p1234;
   1.405 +
   1.406 +      for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
   1.407 +        u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
   1.408 +          simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
   1.409 +          simd::FromZero8<u8x16_t>();
   1.410 +        u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
   1.411 +        u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
   1.412 +        m1234 = Morph8<op,u8x16_t>(m1234, p2345);
   1.413 +        m1234 = Morph8<op,u8x16_t>(m1234, p3456);
   1.414 +        if (i + 2 < completeKernelSizeForFourPixels) {
   1.415 +          u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
   1.416 +          m1234 = Morph8<op,u8x16_t>(m1234, p4567);
   1.417 +          m1234 = Morph8<op,u8x16_t>(m1234, p5678);
   1.418 +        }
   1.419 +        p1234 = p5678;
   1.420 +      }
   1.421 +
   1.422 +      int32_t destIndex = y * aDestStride + 4 * x;
   1.423 +      simd::Store8(&aDestData[destIndex], m1234);
   1.424 +    }
   1.425 +  }
   1.426 +}
   1.427 +
   1.428 +template<typename i16x8_t, typename u8x16_t>
   1.429 +inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   1.430 +                                           uint8_t* aDestData, int32_t aDestStride,
   1.431 +                                           const IntRect& aDestRect, int32_t aRadius,
   1.432 +                                           MorphologyOperator aOp)
   1.433 +{
   1.434 +  if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
   1.435 +    ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
   1.436 +      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   1.437 +  } else {
   1.438 +    ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
   1.439 +      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   1.440 +  }
   1.441 +}
   1.442 +
   1.443 +// Set every pixel to the per-component minimum or maximum of the pixels around
   1.444 +// it that are up to aRadius pixels away from it (vertically).
   1.445 +template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
   1.446 +static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   1.447 +                                         uint8_t* aDestData, int32_t aDestStride,
   1.448 +                                         const IntRect& aDestRect, int32_t aRadius)
   1.449 +{
   1.450 +  static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
   1.451 +                op == MORPHOLOGY_OPERATOR_DILATE,
   1.452 +                "unexpected morphology operator");
   1.453 +
   1.454 +  int32_t startY = aDestRect.y - aRadius;
   1.455 +  int32_t endY = aDestRect.y + aRadius;
   1.456 +  for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
   1.457 +    for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
   1.458 +      int32_t sourceIndex = startY * aSourceStride + 4 * x;
   1.459 +      u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   1.460 +      sourceIndex += aSourceStride;
   1.461 +      for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
   1.462 +        u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   1.463 +        u = Morph8<op,u8x16_t>(u, u2);
   1.464 +      }
   1.465 +
   1.466 +      int32_t destIndex = y * aDestStride + 4 * x;
   1.467 +      simd::Store8(&aDestData[destIndex], u);
   1.468 +    }
   1.469 +  }
   1.470 +}
   1.471 +
   1.472 +template<typename i16x8_t, typename u8x16_t>
   1.473 +inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   1.474 +                                           uint8_t* aDestData, int32_t aDestStride,
   1.475 +                                           const IntRect& aDestRect, int32_t aRadius,
   1.476 +                                           MorphologyOperator aOp)
   1.477 +{
   1.478 +  if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
   1.479 +    ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
   1.480 +      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   1.481 +  } else {
   1.482 +    ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
   1.483 +      aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   1.484 +  }
   1.485 +}
   1.486 +
   1.487 +template<typename i32x4_t, typename i16x8_t>
   1.488 +static i32x4_t
   1.489 +ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
   1.490 +{
   1.491 +  // int16_t p[8] == { b, g, r, a, b, g, r, a }.
   1.492 +  // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
   1.493 +  // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
   1.494 +  // int32_t bias[4] == { _B, _G, _R, _A }.
   1.495 +
   1.496 +  i32x4_t sum = bias;
   1.497 +
   1.498 +  // int16_t bg[8] = { b, g, b, g, b, g, b, g };
   1.499 +  i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
   1.500 +  // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
   1.501 +  i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
   1.502 +  sum = simd::Add32(sum, prodsum_bg);
   1.503 +
   1.504 +  // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
   1.505 +  i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
   1.506 +  // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
   1.507 +  i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
   1.508 +  sum = simd::Add32(sum, prodsum_ra);
   1.509 +
   1.510 +  // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
   1.511 +  return sum;
   1.512 +}
   1.513 +
   1.514 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1.515 +static TemporaryRef<DataSourceSurface>
   1.516 +ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
   1.517 +{
   1.518 +  IntSize size = aInput->GetSize();
   1.519 +  RefPtr<DataSourceSurface> target =
   1.520 +    Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
   1.521 +  if (!target) {
   1.522 +    return nullptr;
   1.523 +  }
   1.524 +
   1.525 +  uint8_t* sourceData = aInput->GetData();
   1.526 +  uint8_t* targetData = target->GetData();
   1.527 +  int32_t sourceStride = aInput->Stride();
   1.528 +  int32_t targetStride = target->Stride();
   1.529 +
   1.530 +  const int16_t factor = 128;
   1.531 +  const Float floatElementMax = INT16_MAX / factor; // 255
   1.532 +  MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
   1.533 +
   1.534 +  const Float *floats = &aMatrix._11;
   1.535 +
   1.536 +  ptrdiff_t componentOffsets[4] = {
   1.537 +    B8G8R8A8_COMPONENT_BYTEOFFSET_R,
   1.538 +    B8G8R8A8_COMPONENT_BYTEOFFSET_G,
   1.539 +    B8G8R8A8_COMPONENT_BYTEOFFSET_B,
   1.540 +    B8G8R8A8_COMPONENT_BYTEOFFSET_A
   1.541 +  };
   1.542 +
   1.543 +  // We store the color matrix in rows_bgra in the following format:
   1.544 +  // { bB, bG, bR, bA, gB, gG, gR, gA }.
   1.545 +  // { bB, gB, bG, gG, bR, gR, bA, gA }
   1.546 +  // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
   1.547 +  // which works especially well for our use case.
   1.548 +  int16_t rows_bgra[2][8];
   1.549 +  for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
   1.550 +    for (size_t colIndex = 0; colIndex < 4; colIndex++) {
   1.551 +      const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
   1.552 +      Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
   1.553 +      int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
   1.554 +      int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
   1.555 +      int8_t g_or_a = componentOffsets[rowIndex] % 2;
   1.556 +      int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
   1.557 +      rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
   1.558 +    }
   1.559 +  }
   1.560 +
   1.561 +  int32_t rowBias[4];
   1.562 +  Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
   1.563 +  for (size_t colIndex = 0; colIndex < 4; colIndex++) {
   1.564 +    size_t rowIndex = 4;
   1.565 +    const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
   1.566 +    Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
   1.567 +    int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
   1.568 +    rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
   1.569 +  }
   1.570 +
   1.571 +  i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
   1.572 +    rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
   1.573 +    rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
   1.574 +
   1.575 +  i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
   1.576 +    rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
   1.577 +    rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
   1.578 +
   1.579 +  i32x4_t rowsBias_v =
   1.580 +    simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
   1.581 +
   1.582 +  for (int32_t y = 0; y < size.height; y++) {
   1.583 +    for (int32_t x = 0; x < size.width; x += 4) {
   1.584 +      MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
   1.585 +      MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
   1.586 +      int32_t sourceIndex = y * sourceStride + 4 * x;
   1.587 +      int32_t targetIndex = y * targetStride + 4 * x;
   1.588 +
   1.589 +      // We load 4 pixels, unpack them, process them 1 pixel at a time, and
   1.590 +      // finally pack and store the 4 result pixels.
   1.591 +
   1.592 +      u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   1.593 +
   1.594 +      // Splat needed to get each pixel twice into i16x8
   1.595 +      i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
   1.596 +      i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
   1.597 +      i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
   1.598 +      i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
   1.599 +
   1.600 +      i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
   1.601 +      i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
   1.602 +      i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
   1.603 +      i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
   1.604 +
   1.605 +      static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
   1.606 +      u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
   1.607 +                                                        simd::ShiftRight32<7>(result_p2),
   1.608 +                                                        simd::ShiftRight32<7>(result_p3),
   1.609 +                                                        simd::ShiftRight32<7>(result_p4));
   1.610 +      simd::Store8(&targetData[targetIndex], result_p1234);
   1.611 +    }
   1.612 +  }
   1.613 +
   1.614 +  return target;
   1.615 +}
   1.616 +
   1.617 +// source / dest: bgra bgra
   1.618 +// sourceAlpha / destAlpha: aaaa aaaa
   1.619 +// result: bgra bgra
   1.620 +template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
   1.621 +static inline u16x8_t
   1.622 +CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
   1.623 +{
   1.624 +  u16x8_t x255 = simd::FromU16<u16x8_t>(255);
   1.625 +
   1.626 +  switch (aCompositeOperator) {
   1.627 +
   1.628 +    case COMPOSITE_OPERATOR_OVER:
   1.629 +    {
   1.630 +      // val = dest * (255 - sourceAlpha) + source * 255;
   1.631 +      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   1.632 +
   1.633 +      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   1.634 +      u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
   1.635 +      i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   1.636 +
   1.637 +      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   1.638 +      u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
   1.639 +      i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   1.640 +
   1.641 +      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   1.642 +                                          simd::FastDivideBy255(result2));
   1.643 +    }
   1.644 +
   1.645 +    case COMPOSITE_OPERATOR_IN:
   1.646 +    {
   1.647 +      // val = source * destAlpha;
   1.648 +      return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
   1.649 +    }
   1.650 +
   1.651 +    case COMPOSITE_OPERATOR_OUT:
   1.652 +    {
   1.653 +      // val = source * (255 - destAlpha);
   1.654 +      u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
   1.655 +      return simd::FastDivideBy255_16(prod);
   1.656 +    }
   1.657 +
   1.658 +    case COMPOSITE_OPERATOR_ATOP:
   1.659 +    {
   1.660 +      // val = dest * (255 - sourceAlpha) + source * destAlpha;
   1.661 +      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   1.662 +
   1.663 +      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   1.664 +      u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
   1.665 +      i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   1.666 +
   1.667 +      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   1.668 +      u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
   1.669 +      i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   1.670 +
   1.671 +      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   1.672 +                                          simd::FastDivideBy255(result2));
   1.673 +    }
   1.674 +
   1.675 +    case COMPOSITE_OPERATOR_XOR:
   1.676 +    {
   1.677 +      // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
   1.678 +      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   1.679 +      u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   1.680 +
   1.681 +      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   1.682 +      u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
   1.683 +                                                     twoFiftyFiveMinusDestAlpha);
   1.684 +      i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   1.685 +
   1.686 +      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   1.687 +      u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
   1.688 +                                                     twoFiftyFiveMinusDestAlpha);
   1.689 +      i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   1.690 +
   1.691 +      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   1.692 +                                          simd::FastDivideBy255(result2));
   1.693 +    }
   1.694 +
   1.695 +    default:
   1.696 +      return simd::FromU16<u16x8_t>(0);
   1.697 +
   1.698 +  }
   1.699 +}
   1.700 +
   1.701 +template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
   1.702 +static void
   1.703 +ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
   1.704 +{
   1.705 +  IntSize size = aDest->GetSize();
   1.706 +
   1.707 +  uint8_t* sourceData = aSource->GetData();
   1.708 +  uint8_t* destData = aDest->GetData();
   1.709 +  uint32_t sourceStride = aSource->Stride();
   1.710 +  uint32_t destStride = aDest->Stride();
   1.711 +
   1.712 +  for (int32_t y = 0; y < size.height; y++) {
   1.713 +    for (int32_t x = 0; x < size.width; x += 4) {
   1.714 +      uint32_t sourceIndex = y * sourceStride + 4 * x;
   1.715 +      uint32_t destIndex = y * destStride + 4 * x;
   1.716 +
   1.717 +      u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   1.718 +      u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
   1.719 +
   1.720 +      u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
   1.721 +      u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
   1.722 +      u16x8_t sa12 = simd::Splat16<3,3>(s12);
   1.723 +      u16x8_t da12 = simd::Splat16<3,3>(d12);
   1.724 +      u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
   1.725 +
   1.726 +      u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
   1.727 +      u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
   1.728 +      u16x8_t sa34 = simd::Splat16<3,3>(s34);
   1.729 +      u16x8_t da34 = simd::Splat16<3,3>(d34);
   1.730 +      u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
   1.731 +
   1.732 +      u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
   1.733 +      simd::Store8(&destData[destIndex], result1234);
   1.734 +    }
   1.735 +  }
   1.736 +}
   1.737 +
   1.738 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1.739 +static void
   1.740 +ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
   1.741 +                      CompositeOperator aOperator)
   1.742 +{
   1.743 +  switch (aOperator) {
   1.744 +    case COMPOSITE_OPERATOR_OVER:
   1.745 +      ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
   1.746 +      break;
   1.747 +    case COMPOSITE_OPERATOR_IN:
   1.748 +      ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
   1.749 +      break;
   1.750 +    case COMPOSITE_OPERATOR_OUT:
   1.751 +      ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
   1.752 +      break;
   1.753 +    case COMPOSITE_OPERATOR_ATOP:
   1.754 +      ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
   1.755 +      break;
   1.756 +    case COMPOSITE_OPERATOR_XOR:
   1.757 +      ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
   1.758 +      break;
   1.759 +    default:
   1.760 +      MOZ_CRASH();
   1.761 +  }
   1.762 +}
   1.763 +
   1.764 +template<typename u8x16_t>
   1.765 +static void
   1.766 +SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
   1.767 +                           uint8_t* channel0Data, uint8_t* channel1Data,
   1.768 +                           uint8_t* channel2Data, uint8_t* channel3Data,
   1.769 +                           int32_t channelStride)
   1.770 +{
   1.771 +  for (int32_t y = 0; y < size.height; y++) {
   1.772 +    for (int32_t x = 0; x < size.width; x += 16) {
   1.773 +      // Process 16 pixels at a time.
   1.774 +      int32_t sourceIndex = y * sourceStride + 4 * x;
   1.775 +      int32_t targetIndex = y * channelStride + x;
   1.776 +
   1.777 +      u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
   1.778 +      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
   1.779 +      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
   1.780 +      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
   1.781 +
   1.782 +      bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   1.783 +      if (4 * (x + 4) < sourceStride) {
   1.784 +        bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
   1.785 +      }
   1.786 +      if (4 * (x + 8) < sourceStride) {
   1.787 +        bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
   1.788 +      }
   1.789 +      if (4 * (x + 12) < sourceStride) {
   1.790 +        bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
   1.791 +      }
   1.792 +
   1.793 +      u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   1.794 +      u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   1.795 +      u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   1.796 +      u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   1.797 +      u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   1.798 +      u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   1.799 +      u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   1.800 +      u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   1.801 +      u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   1.802 +      u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   1.803 +      u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   1.804 +      u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   1.805 +      u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
   1.806 +      u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
   1.807 +      u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   1.808 +      u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   1.809 +
   1.810 +      simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
   1.811 +      simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
   1.812 +      simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
   1.813 +      simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
   1.814 +    }
   1.815 +  }
   1.816 +}
   1.817 +
   1.818 +template<typename u8x16_t>
   1.819 +static void
   1.820 +CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
   1.821 +{
   1.822 +  for (int32_t y = 0; y < size.height; y++) {
   1.823 +    for (int32_t x = 0; x < size.width; x += 16) {
   1.824 +      // Process 16 pixels at a time.
   1.825 +      int32_t resultIndex = y * resultStride + 4 * x;
   1.826 +      int32_t channelIndex = y * channelStride + x;
   1.827 +
   1.828 +      u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
   1.829 +      u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
   1.830 +      u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
   1.831 +      u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
   1.832 +
   1.833 +      u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
   1.834 +      u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
   1.835 +      u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
   1.836 +      u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
   1.837 +
   1.838 +      u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
   1.839 +      u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
   1.840 +      u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
   1.841 +      u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
   1.842 +
   1.843 +      simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
   1.844 +      if (4 * (x + 4) < resultStride) {
   1.845 +        simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
   1.846 +      }
   1.847 +      if (4 * (x + 8) < resultStride) {
   1.848 +        simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
   1.849 +      }
   1.850 +      if (4 * (x + 12) < resultStride) {
   1.851 +        simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
   1.852 +      }
   1.853 +    }
   1.854 +  }
   1.855 +}
   1.856 +
   1.857 +
   1.858 +template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
   1.859 +static void
   1.860 +DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
   1.861 +                                    uint8_t* aTargetData, int32_t aTargetStride,
   1.862 +                                    uint8_t* aSourceData, int32_t aSourceStride)
   1.863 +{
   1.864 +  const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
   1.865 +  for (int32_t y = 0; y < aSize.height; y++) {
   1.866 +    for (int32_t x = 0; x < aSize.width; x += 4) {
   1.867 +      int32_t inputIndex = y * aSourceStride + 4 * x;
   1.868 +      int32_t targetIndex = y * aTargetStride + 4 * x;
   1.869 +
   1.870 +      u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
   1.871 +      u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
   1.872 +      u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
   1.873 +
   1.874 +      // Multiply all components with alpha.
   1.875 +      p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
   1.876 +      p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
   1.877 +
   1.878 +      // Divide by 255 and pack.
   1.879 +      u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
   1.880 +                                                  simd::FastDivideBy255_16(p34));
   1.881 +
   1.882 +      // Get the original alpha channel value back from p1234.
   1.883 +      result = simd::Pick(alphaMask, result, p1234);
   1.884 +
   1.885 +      simd::Store8(&aTargetData[targetIndex], result);
   1.886 +    }
   1.887 +  }
   1.888 +}
   1.889 +
   1.890 +// We use a table of precomputed factors for unpremultiplying.
   1.891 +// We want to compute round(r / (alpha / 255.0f)) for arbitrary values of
   1.892 +// r and alpha in constant time. This table of factors has the property that
   1.893 +// (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives the result we want (with
   1.894 +// a maximum deviation of 1).
   1.895 +//
   1.896 +// sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
   1.897 +//
   1.898 +// This table has been created using the python code
   1.899 +// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
   1.900 +static const uint16_t sAlphaFactors[256] = {
   1.901 +  0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935,
   1.902 +  5440, 5022, 4663, 4352, 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720,
   1.903 +  2611, 2511, 2418, 2331, 2251, 2176, 2106, 2040, 1978, 1920, 1865, 1813, 1764,
   1.904 +  1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, 1360, 1332, 1306,
   1.905 +  1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036,
   1.906 +  1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837,
   1.907 +  826, 816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702,
   1.908 +  694, 687, 680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604,
   1.909 +  599, 593, 588, 583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531,
   1.910 +  526, 522, 518, 514, 510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473,
   1.911 +  470, 466, 463, 460, 457, 453, 450, 447, 444, 441, 438, 435, 432, 429, 427,
   1.912 +  424, 421, 418, 416, 413, 411, 408, 405, 403, 400, 398, 396, 393, 391, 389,
   1.913 +  386, 384, 382, 380, 377, 375, 373, 371, 369, 367, 365, 363, 361, 359, 357,
   1.914 +  355, 353, 351, 349, 347, 345, 344, 342, 340, 338, 336, 335, 333, 331, 330,
   1.915 +  328, 326, 325, 323, 322, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306,
   1.916 +  305, 304, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 288, 286,
   1.917 +  285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269,
   1.918 +  268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256
   1.919 +};
   1.920 +
   1.921 +template<typename u16x8_t, typename u8x16_t>
   1.922 +static void
   1.923 +DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
   1.924 +                                 uint8_t* aTargetData, int32_t aTargetStride,
   1.925 +                                 uint8_t* aSourceData, int32_t aSourceStride)
   1.926 +{
   1.927 +  for (int32_t y = 0; y < aSize.height; y++) {
   1.928 +    for (int32_t x = 0; x < aSize.width; x += 4) {
   1.929 +      int32_t inputIndex = y * aSourceStride + 4 * x;
   1.930 +      int32_t targetIndex = y * aTargetStride + 4 * x;
   1.931 +      union {
   1.932 +        u8x16_t p1234;
   1.933 +        uint8_t u8[4][4];
   1.934 +      };
   1.935 +      p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
   1.936 +
   1.937 +      // Prepare the alpha factors.
   1.938 +      uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   1.939 +      uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   1.940 +      uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   1.941 +      uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   1.942 +      u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
   1.943 +      u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
   1.944 +
   1.945 +      u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
   1.946 +      u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
   1.947 +
   1.948 +      // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
   1.949 +      p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
   1.950 +      p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
   1.951 +
   1.952 +      u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
   1.953 +      simd::Store8(&aTargetData[targetIndex], result);
   1.954 +    }
   1.955 +  }
   1.956 +}
   1.957 +
   1.958 +template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
   1.959 +static TemporaryRef<DataSourceSurface>
   1.960 +RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
   1.961 +                      int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
   1.962 +{
   1.963 +#define RETURN_TURBULENCE(Type, Stitch) \
   1.964 +  SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
   1.965 +    renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
   1.966 +  return renderer.Render(aSize, aOffset);
   1.967 +
   1.968 +  switch (aType) {
   1.969 +    case TURBULENCE_TYPE_TURBULENCE:
   1.970 +    {
   1.971 +      if (aStitch) {
   1.972 +        RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
   1.973 +      }
   1.974 +      RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
   1.975 +    }
   1.976 +    case TURBULENCE_TYPE_FRACTAL_NOISE:
   1.977 +    {
   1.978 +      if (aStitch) {
   1.979 +        RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
   1.980 +      }
   1.981 +      RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
   1.982 +    }
   1.983 +  }
   1.984 +  return nullptr;
   1.985 +#undef RETURN_TURBULENCE
   1.986 +}
   1.987 +
   1.988 +// k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
   1.989 +template<typename i32x4_t, typename i16x8_t>
   1.990 +static MOZ_ALWAYS_INLINE i16x8_t
   1.991 +ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
   1.992 +                           const i16x8_t &k1And4, const i16x8_t &k2And3)
   1.993 +{
   1.994 +  // Calculate input product: inProd = (in1 * in2) / 255.
   1.995 +  i32x4_t inProd_1, inProd_2;
   1.996 +  simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
   1.997 +  i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
   1.998 +
   1.999 +  // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
  1.1000 +  i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
  1.1001 +  i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
  1.1002 +  i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
  1.1003 +  i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
  1.1004 +  i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
  1.1005 +
  1.1006 +  // Calculate k2 * in1 + k3 * in2
  1.1007 +  i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
  1.1008 +  i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
  1.1009 +  i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
  1.1010 +  i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
  1.1011 +
  1.1012 +  // Sum everything up and truncate the fractional part.
  1.1013 +  i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
  1.1014 +  i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
  1.1015 +  return simd::PackAndSaturate32To16(result_1, result_2);
  1.1016 +}
  1.1017 +
  1.1018 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
  1.1019 +static TemporaryRef<DataSourceSurface>
  1.1020 +ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
  1.1021 +                            Float aK1, Float aK2, Float aK3, Float aK4)
  1.1022 +{
  1.1023 +  IntSize size = aInput1->GetSize();
  1.1024 +  RefPtr<DataSourceSurface> target =
  1.1025 +  Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
  1.1026 +  if (!target) {
  1.1027 +    return nullptr;
  1.1028 +  }
  1.1029 +
  1.1030 +  uint8_t* source1Data = aInput1->GetData();
  1.1031 +  uint8_t* source2Data = aInput2->GetData();
  1.1032 +  uint8_t* targetData = target->GetData();
  1.1033 +  uint32_t source1Stride = aInput1->Stride();
  1.1034 +  uint32_t source2Stride = aInput2->Stride();
  1.1035 +  uint32_t targetStride = target->Stride();
  1.1036 +
  1.1037 +  // The arithmetic combine filter does the following calculation:
  1.1038 +  // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
  1.1039 +  //
  1.1040 +  // Or, with in1/2 integers between 0 and 255:
  1.1041 +  // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
  1.1042 +  //
  1.1043 +  // We want the whole calculation to happen in integer, with 16-bit factors.
  1.1044 +  // So we convert our factors to fixed-point with precision 1.8.7.
  1.1045 +  // K4 is premultiplied with 255, and it will be multiplied with 128 later
  1.1046 +  // during the actual calculation, because premultiplying it with 255 * 128
  1.1047 +  // would overflow int16.
  1.1048 +
  1.1049 +  i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
  1.1050 +  i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
  1.1051 +  i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
  1.1052 +  i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
  1.1053 +
  1.1054 +  i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
  1.1055 +  i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
  1.1056 +
  1.1057 +  for (int32_t y = 0; y < size.height; y++) {
  1.1058 +    for (int32_t x = 0; x < size.width; x += 4) {
  1.1059 +      uint32_t source1Index = y * source1Stride + 4 * x;
  1.1060 +      uint32_t source2Index = y * source2Stride + 4 * x;
  1.1061 +      uint32_t targetIndex = y * targetStride + 4 * x;
  1.1062 +
  1.1063 +      // Load and unpack.
  1.1064 +      u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
  1.1065 +      u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
  1.1066 +      i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
  1.1067 +      i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
  1.1068 +      i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
  1.1069 +      i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
  1.1070 +
  1.1071 +      // Multiply and add.
  1.1072 +      i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
  1.1073 +      i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
  1.1074 +
  1.1075 +      // Pack and store.
  1.1076 +      simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
  1.1077 +    }
  1.1078 +  }
  1.1079 +
  1.1080 +  return target;
  1.1081 +}
  1.1082 +
  1.1083 +} // namespace gfx
  1.1084 +} // namespace mozilla

mercurial