gfx/2d/FilterProcessingSIMD-inl.h

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     2  * This Source Code Form is subject to the terms of the Mozilla Public
     3  * License, v. 2.0. If a copy of the MPL was not distributed with this
     4  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     6 #include "FilterProcessing.h"
     8 #include "SIMD.h"
     9 #include "SVGTurbulenceRenderer-inl.h"
    11 namespace mozilla {
    12 namespace gfx {
    14 template<typename u8x16_t>
    15 inline TemporaryRef<DataSourceSurface>
    16 ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
    17 {
    18   IntSize size = aSurface->GetSize();
    19   RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
    20   RefPtr<DataSourceSurface> output =
    21     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
    22   uint8_t *inputData = input->GetData();
    23   uint8_t *outputData = output->GetData();
    24   int32_t inputStride = input->Stride();
    25   int32_t outputStride = output->Stride();
    26   switch (input->GetFormat()) {
    27     case SurfaceFormat::B8G8R8A8:
    28       output = input;
    29       break;
    30     case SurfaceFormat::B8G8R8X8:
    31       for (int32_t y = 0; y < size.height; y++) {
    32         for (int32_t x = 0; x < size.width; x++) {
    33           int32_t inputIndex = y * inputStride + 4 * x;
    34           int32_t outputIndex = y * outputStride + 4 * x;
    35           outputData[outputIndex + 0] = inputData[inputIndex + 0];
    36           outputData[outputIndex + 1] = inputData[inputIndex + 1];
    37           outputData[outputIndex + 2] = inputData[inputIndex + 2];
    38           outputData[outputIndex + 3] = 255;
    39         }
    40       }
    41       break;
    42     case SurfaceFormat::R8G8B8A8:
    43       for (int32_t y = 0; y < size.height; y++) {
    44         for (int32_t x = 0; x < size.width; x++) {
    45           int32_t inputIndex = y * inputStride + 4 * x;
    46           int32_t outputIndex = y * outputStride + 4 * x;
    47           outputData[outputIndex + 2] = inputData[inputIndex + 0];
    48           outputData[outputIndex + 1] = inputData[inputIndex + 1];
    49           outputData[outputIndex + 0] = inputData[inputIndex + 2];
    50           outputData[outputIndex + 3] = inputData[inputIndex + 3];
    51         }
    52       }
    53       break;
    54     case SurfaceFormat::R8G8B8X8:
    55       for (int32_t y = 0; y < size.height; y++) {
    56         for (int32_t x = 0; x < size.width; x++) {
    57           int32_t inputIndex = y * inputStride + 4 * x;
    58           int32_t outputIndex = y * outputStride + 4 * x;
    59           outputData[outputIndex + 2] = inputData[inputIndex + 0];
    60           outputData[outputIndex + 1] = inputData[inputIndex + 1];
    61           outputData[outputIndex + 0] = inputData[inputIndex + 2];
    62           outputData[outputIndex + 3] = 255;
    63         }
    64       }
    65       break;
    66     case SurfaceFormat::A8:
    67       for (int32_t y = 0; y < size.height; y++) {
    68         for (int32_t x = 0; x < size.width; x += 16) {
    69           int32_t inputIndex = y * inputStride + x;
    70           int32_t outputIndex = y * outputStride + 4 * x;
    71           u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
    72           // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
    73           // interleaving with 0000000000000000 twice.
    74           u8x16_t zero = simd::FromZero8<u8x16_t>();
    75           u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
    76           u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
    77           u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
    78           u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
    79           u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
    80           u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
    81           simd::Store8(&outputData[outputIndex], p1To4);
    82           if ((x + 4) * 4 < outputStride) {
    83             simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
    84           }
    85           if ((x + 8) * 4 < outputStride) {
    86             simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
    87           }
    88           if ((x + 12) * 4 < outputStride) {
    89             simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
    90           }
    91         }
    92       }
    93       break;
    94     default:
    95       output = nullptr;
    96       break;
    97   }
    98   return output;
    99 }
   101 template<typename u8x16_t>
   102 inline void
   103 ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
   104 {
   105   for (int32_t y = 0; y < size.height; y++) {
   106     for (int32_t x = 0; x < size.width; x += 16) {
   107       // Process 16 pixels at a time.
   108       // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
   109       int32_t sourceIndex = y * sourceStride + 4 * x;
   110       int32_t targetIndex = y * alphaStride + x;
   112       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
   113       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
   114       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
   115       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
   117       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   118       if (4 * (x + 4) < sourceStride) {
   119         bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
   120       }
   121       if (4 * (x + 8) < sourceStride) {
   122         bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
   123       }
   124       if (4 * (x + 12) < sourceStride) {
   125         bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
   126       }
   128       u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   129       u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   130       u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   131       u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   132       u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   133       u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   134       u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   135       u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   136       u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   137       u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   138       u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   140       simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
   141     }
   142   }
   143 }
   145 // This function calculates the result color values for four pixels, but for
   146 // only two color channels - either b & r or g & a. However, the a result will
   147 // not be used.
   148 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
   149 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
   150 // alpha of all four pixels (and both aaaa's are the same).
   151 // blendendComponent1 and blendedComponent2 are the out parameters.
   152 template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
   153 inline void
   154 BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
   155                                i16x8_t dest, const i16x8_t& destAlpha,
   156                                i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
   157 {
   158   i16x8_t x255 = simd::FromI16<i16x8_t>(255);
   160   switch (aBlendMode) {
   162     case BLEND_MODE_MULTIPLY:
   163     {
   164       // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
   165       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   166       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   167       i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
   169       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
   170       i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
   171       blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
   172       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   174       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
   175       i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
   176       blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
   177       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   179       break;
   180     }
   182     case BLEND_MODE_SCREEN:
   183     {
   184       // val = 255 * (source + dest) + (0 - dest) * source;
   185       i16x8_t sourcePlusDest = simd::Add16(source, dest);
   186       i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
   188       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
   189       i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
   190       blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
   191       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   193       i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
   194       i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
   195       blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
   196       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   198       break;
   199     }
   201     case BLEND_MODE_DARKEN:
   202     case BLEND_MODE_LIGHTEN:
   203     {
   204       // Darken:
   205       // val = min((255 - destAlpha) * source + 255                 * dest,
   206       //           255               * source + (255 - sourceAlpha) * dest);
   207       //
   208       // Lighten:
   209       // val = max((255 - destAlpha) * source + 255                 * dest,
   210       //           255               * source + (255 - sourceAlpha) * dest);
   212       i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   213       i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   215       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
   216       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
   217       i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
   218       i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
   219       i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
   220       blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
   221       blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
   223       i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
   224       i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
   225       i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
   226       i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
   227       i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
   228       blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
   229       blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
   231       break;
   232     }
   234   }
   235 }
   237 // The alpha channel is subject to a different calculation than the RGB
   238 // channels, and this calculation is the same for all blend modes:
   239 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
   240 template<typename i16x8_t, typename i32x4_t>
   241 inline i32x4_t
   242 BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
   243 {
   244   // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
   245   // appropriately. The calculation is rewritten as follows:
   246   // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
   247   //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
   248   //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
   249   //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
   250   i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
   251   i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
   252   i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
   253   i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
   254   return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
   255 }
   257 template<typename u8x16_t, typename i16x8_t>
   258 inline void
   259 UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
   260                            i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
   261 {
   262   // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
   263   i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
   264   i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
   265   i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
   266   i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
   267   bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
   268   rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
   269 }
   271 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   272 inline u8x16_t
   273 ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
   274                          i32x4_t rrrr1234, const i32x4_t& aaaa1234)
   275 {
   276   // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
   277   i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
   278   i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
   279   i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
   280   i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
   281   i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
   282   i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
   283   return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
   284 }
   286 template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
   287 inline TemporaryRef<DataSourceSurface>
   288 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
   289 {
   290   IntSize size = aInput1->GetSize();
   291   RefPtr<DataSourceSurface> target =
   292     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
   293   if (!target) {
   294     return nullptr;
   295   }
   297   uint8_t* source1Data = aInput1->GetData();
   298   uint8_t* source2Data = aInput2->GetData();
   299   uint8_t* targetData = target->GetData();
   300   int32_t targetStride = target->Stride();
   301   int32_t source1Stride = aInput1->Stride();
   302   int32_t source2Stride = aInput2->Stride();
   304   for (int32_t y = 0; y < size.height; y++) {
   305     for (int32_t x = 0; x < size.width; x += 4) {
   306       int32_t targetIndex = y * targetStride + 4 * x;
   307       int32_t source1Index = y * source1Stride + 4 * x;
   308       int32_t source2Index = y * source2Stride + 4 * x;
   310       u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
   311       u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
   313       // The blending calculation for the RGB channels all need access to the
   314       // alpha channel of their pixel, and the alpha calculation is different,
   315       // so it makes sense to separate by channel.
   317       i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
   318       i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
   319       UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
   320       UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
   321       i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
   322       i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
   324       // We only use blendedB, blendedG and blendedR.
   325       i32x4_t blendedB, blendedG, blendedR, blendedA;
   326       BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
   327       BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
   329       // Throw away blendedA and overwrite it with the correct blended alpha.
   330       blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
   332       u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
   333       simd::Store8(&targetData[targetIndex], result1234);
   334     }
   335   }
   337   return target;
   338 }
   340 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   341 static TemporaryRef<DataSourceSurface>
   342 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
   343                       BlendMode aBlendMode)
   344 {
   345   switch (aBlendMode) {
   346     case BLEND_MODE_MULTIPLY:
   347       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
   348     case BLEND_MODE_SCREEN:
   349       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
   350     case BLEND_MODE_DARKEN:
   351       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
   352     case BLEND_MODE_LIGHTEN:
   353       return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
   354     default:
   355       return nullptr;
   356   }
   357 }
   359 template<MorphologyOperator Operator, typename u8x16_t>
   360 static u8x16_t
   361 Morph8(u8x16_t a, u8x16_t b)
   362 {
   363   return Operator == MORPHOLOGY_OPERATOR_ERODE ?
   364     simd::Min8(a, b) : simd::Max8(a, b);
   365 }
   367 // Set every pixel to the per-component minimum or maximum of the pixels around
   368 // it that are up to aRadius pixels away from it (horizontally).
   369 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
   370 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   371                                            uint8_t* aDestData, int32_t aDestStride,
   372                                            const IntRect& aDestRect, int32_t aRadius)
   373 {
   374   static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
   375                 op == MORPHOLOGY_OPERATOR_DILATE,
   376                 "unexpected morphology operator");
   378   int32_t kernelSize = aRadius + 1 + aRadius;
   379   MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
   380   MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
   381   int32_t completeKernelSizeForFourPixels = kernelSize + 3;
   382   MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
   383              completeKernelSizeForFourPixels % 4 == 2);
   385   // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
   386   // the way we need them to be.
   388   IntRect sourceRect = aDestRect;
   389   sourceRect.Inflate(aRadius, 0);
   391   for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
   392     int32_t kernelStartX = aDestRect.x - aRadius;
   393     for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
   394       // We process four pixels (16 color values) at a time.
   395       // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
   396       // source values can be read beyond that because the source is extended
   397       // by aRadius pixels.
   399       int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
   400       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   401       u8x16_t m1234 = p1234;
   403       for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
   404         u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
   405           simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
   406           simd::FromZero8<u8x16_t>();
   407         u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
   408         u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
   409         m1234 = Morph8<op,u8x16_t>(m1234, p2345);
   410         m1234 = Morph8<op,u8x16_t>(m1234, p3456);
   411         if (i + 2 < completeKernelSizeForFourPixels) {
   412           u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
   413           m1234 = Morph8<op,u8x16_t>(m1234, p4567);
   414           m1234 = Morph8<op,u8x16_t>(m1234, p5678);
   415         }
   416         p1234 = p5678;
   417       }
   419       int32_t destIndex = y * aDestStride + 4 * x;
   420       simd::Store8(&aDestData[destIndex], m1234);
   421     }
   422   }
   423 }
   425 template<typename i16x8_t, typename u8x16_t>
   426 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   427                                            uint8_t* aDestData, int32_t aDestStride,
   428                                            const IntRect& aDestRect, int32_t aRadius,
   429                                            MorphologyOperator aOp)
   430 {
   431   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
   432     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
   433       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   434   } else {
   435     ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
   436       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   437   }
   438 }
   440 // Set every pixel to the per-component minimum or maximum of the pixels around
   441 // it that are up to aRadius pixels away from it (vertically).
   442 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
   443 static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   444                                          uint8_t* aDestData, int32_t aDestStride,
   445                                          const IntRect& aDestRect, int32_t aRadius)
   446 {
   447   static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
   448                 op == MORPHOLOGY_OPERATOR_DILATE,
   449                 "unexpected morphology operator");
   451   int32_t startY = aDestRect.y - aRadius;
   452   int32_t endY = aDestRect.y + aRadius;
   453   for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
   454     for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
   455       int32_t sourceIndex = startY * aSourceStride + 4 * x;
   456       u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   457       sourceIndex += aSourceStride;
   458       for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
   459         u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
   460         u = Morph8<op,u8x16_t>(u, u2);
   461       }
   463       int32_t destIndex = y * aDestStride + 4 * x;
   464       simd::Store8(&aDestData[destIndex], u);
   465     }
   466   }
   467 }
   469 template<typename i16x8_t, typename u8x16_t>
   470 inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
   471                                            uint8_t* aDestData, int32_t aDestStride,
   472                                            const IntRect& aDestRect, int32_t aRadius,
   473                                            MorphologyOperator aOp)
   474 {
   475   if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
   476     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
   477       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   478   } else {
   479     ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
   480       aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
   481   }
   482 }
   484 template<typename i32x4_t, typename i16x8_t>
   485 static i32x4_t
   486 ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
   487 {
   488   // int16_t p[8] == { b, g, r, a, b, g, r, a }.
   489   // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
   490   // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
   491   // int32_t bias[4] == { _B, _G, _R, _A }.
   493   i32x4_t sum = bias;
   495   // int16_t bg[8] = { b, g, b, g, b, g, b, g };
   496   i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
   497   // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
   498   i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
   499   sum = simd::Add32(sum, prodsum_bg);
   501   // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
   502   i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
   503   // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
   504   i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
   505   sum = simd::Add32(sum, prodsum_ra);
   507   // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
   508   return sum;
   509 }
   511 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   512 static TemporaryRef<DataSourceSurface>
   513 ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
   514 {
   515   IntSize size = aInput->GetSize();
   516   RefPtr<DataSourceSurface> target =
   517     Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
   518   if (!target) {
   519     return nullptr;
   520   }
   522   uint8_t* sourceData = aInput->GetData();
   523   uint8_t* targetData = target->GetData();
   524   int32_t sourceStride = aInput->Stride();
   525   int32_t targetStride = target->Stride();
   527   const int16_t factor = 128;
   528   const Float floatElementMax = INT16_MAX / factor; // 255
   529   MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
   531   const Float *floats = &aMatrix._11;
   533   ptrdiff_t componentOffsets[4] = {
   534     B8G8R8A8_COMPONENT_BYTEOFFSET_R,
   535     B8G8R8A8_COMPONENT_BYTEOFFSET_G,
   536     B8G8R8A8_COMPONENT_BYTEOFFSET_B,
   537     B8G8R8A8_COMPONENT_BYTEOFFSET_A
   538   };
   540   // We store the color matrix in rows_bgra in the following format:
   541   // { bB, bG, bR, bA, gB, gG, gR, gA }.
   542   // { bB, gB, bG, gG, bR, gR, bA, gA }
   543   // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
   544   // which works especially well for our use case.
   545   int16_t rows_bgra[2][8];
   546   for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
   547     for (size_t colIndex = 0; colIndex < 4; colIndex++) {
   548       const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
   549       Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
   550       int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
   551       int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
   552       int8_t g_or_a = componentOffsets[rowIndex] % 2;
   553       int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
   554       rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
   555     }
   556   }
   558   int32_t rowBias[4];
   559   Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
   560   for (size_t colIndex = 0; colIndex < 4; colIndex++) {
   561     size_t rowIndex = 4;
   562     const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
   563     Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
   564     int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
   565     rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
   566   }
   568   i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
   569     rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
   570     rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
   572   i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
   573     rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
   574     rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
   576   i32x4_t rowsBias_v =
   577     simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
   579   for (int32_t y = 0; y < size.height; y++) {
   580     for (int32_t x = 0; x < size.width; x += 4) {
   581       MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
   582       MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
   583       int32_t sourceIndex = y * sourceStride + 4 * x;
   584       int32_t targetIndex = y * targetStride + 4 * x;
   586       // We load 4 pixels, unpack them, process them 1 pixel at a time, and
   587       // finally pack and store the 4 result pixels.
   589       u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   591       // Splat needed to get each pixel twice into i16x8
   592       i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
   593       i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
   594       i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
   595       i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
   597       i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
   598       i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
   599       i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
   600       i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
   602       static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
   603       u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
   604                                                         simd::ShiftRight32<7>(result_p2),
   605                                                         simd::ShiftRight32<7>(result_p3),
   606                                                         simd::ShiftRight32<7>(result_p4));
   607       simd::Store8(&targetData[targetIndex], result_p1234);
   608     }
   609   }
   611   return target;
   612 }
   614 // source / dest: bgra bgra
   615 // sourceAlpha / destAlpha: aaaa aaaa
   616 // result: bgra bgra
   617 template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
   618 static inline u16x8_t
   619 CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
   620 {
   621   u16x8_t x255 = simd::FromU16<u16x8_t>(255);
   623   switch (aCompositeOperator) {
   625     case COMPOSITE_OPERATOR_OVER:
   626     {
   627       // val = dest * (255 - sourceAlpha) + source * 255;
   628       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   630       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   631       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
   632       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   634       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   635       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
   636       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   638       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   639                                           simd::FastDivideBy255(result2));
   640     }
   642     case COMPOSITE_OPERATOR_IN:
   643     {
   644       // val = source * destAlpha;
   645       return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
   646     }
   648     case COMPOSITE_OPERATOR_OUT:
   649     {
   650       // val = source * (255 - destAlpha);
   651       u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
   652       return simd::FastDivideBy255_16(prod);
   653     }
   655     case COMPOSITE_OPERATOR_ATOP:
   656     {
   657       // val = dest * (255 - sourceAlpha) + source * destAlpha;
   658       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   660       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   661       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
   662       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   664       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   665       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
   666       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   668       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   669                                           simd::FastDivideBy255(result2));
   670     }
   672     case COMPOSITE_OPERATOR_XOR:
   673     {
   674       // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
   675       u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
   676       u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
   678       u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
   679       u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
   680                                                      twoFiftyFiveMinusDestAlpha);
   681       i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
   683       u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
   684       u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
   685                                                      twoFiftyFiveMinusDestAlpha);
   686       i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
   688       return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
   689                                           simd::FastDivideBy255(result2));
   690     }
   692     default:
   693       return simd::FromU16<u16x8_t>(0);
   695   }
   696 }
   698 template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
   699 static void
   700 ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
   701 {
   702   IntSize size = aDest->GetSize();
   704   uint8_t* sourceData = aSource->GetData();
   705   uint8_t* destData = aDest->GetData();
   706   uint32_t sourceStride = aSource->Stride();
   707   uint32_t destStride = aDest->Stride();
   709   for (int32_t y = 0; y < size.height; y++) {
   710     for (int32_t x = 0; x < size.width; x += 4) {
   711       uint32_t sourceIndex = y * sourceStride + 4 * x;
   712       uint32_t destIndex = y * destStride + 4 * x;
   714       u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   715       u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
   717       u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
   718       u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
   719       u16x8_t sa12 = simd::Splat16<3,3>(s12);
   720       u16x8_t da12 = simd::Splat16<3,3>(d12);
   721       u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
   723       u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
   724       u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
   725       u16x8_t sa34 = simd::Splat16<3,3>(s34);
   726       u16x8_t da34 = simd::Splat16<3,3>(d34);
   727       u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
   729       u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
   730       simd::Store8(&destData[destIndex], result1234);
   731     }
   732   }
   733 }
   735 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
   736 static void
   737 ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
   738                       CompositeOperator aOperator)
   739 {
   740   switch (aOperator) {
   741     case COMPOSITE_OPERATOR_OVER:
   742       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
   743       break;
   744     case COMPOSITE_OPERATOR_IN:
   745       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
   746       break;
   747     case COMPOSITE_OPERATOR_OUT:
   748       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
   749       break;
   750     case COMPOSITE_OPERATOR_ATOP:
   751       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
   752       break;
   753     case COMPOSITE_OPERATOR_XOR:
   754       ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
   755       break;
   756     default:
   757       MOZ_CRASH();
   758   }
   759 }
   761 template<typename u8x16_t>
   762 static void
   763 SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
   764                            uint8_t* channel0Data, uint8_t* channel1Data,
   765                            uint8_t* channel2Data, uint8_t* channel3Data,
   766                            int32_t channelStride)
   767 {
   768   for (int32_t y = 0; y < size.height; y++) {
   769     for (int32_t x = 0; x < size.width; x += 16) {
   770       // Process 16 pixels at a time.
   771       int32_t sourceIndex = y * sourceStride + 4 * x;
   772       int32_t targetIndex = y * channelStride + x;
   774       u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
   775       u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
   776       u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
   777       u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
   779       bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
   780       if (4 * (x + 4) < sourceStride) {
   781         bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
   782       }
   783       if (4 * (x + 8) < sourceStride) {
   784         bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
   785       }
   786       if (4 * (x + 12) < sourceStride) {
   787         bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
   788       }
   790       u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   791       u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
   792       u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   793       u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
   794       u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   795       u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
   796       u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   797       u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
   798       u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   799       u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
   800       u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   801       u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
   802       u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
   803       u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
   804       u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   805       u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
   807       simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
   808       simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
   809       simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
   810       simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
   811     }
   812   }
   813 }
   815 template<typename u8x16_t>
   816 static void
   817 CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
   818 {
   819   for (int32_t y = 0; y < size.height; y++) {
   820     for (int32_t x = 0; x < size.width; x += 16) {
   821       // Process 16 pixels at a time.
   822       int32_t resultIndex = y * resultStride + 4 * x;
   823       int32_t channelIndex = y * channelStride + x;
   825       u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
   826       u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
   827       u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
   828       u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
   830       u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
   831       u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
   832       u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
   833       u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
   835       u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
   836       u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
   837       u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
   838       u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
   840       simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
   841       if (4 * (x + 4) < resultStride) {
   842         simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
   843       }
   844       if (4 * (x + 8) < resultStride) {
   845         simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
   846       }
   847       if (4 * (x + 12) < resultStride) {
   848         simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
   849       }
   850     }
   851   }
   852 }
   855 template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
   856 static void
   857 DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
   858                                     uint8_t* aTargetData, int32_t aTargetStride,
   859                                     uint8_t* aSourceData, int32_t aSourceStride)
   860 {
   861   const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
   862   for (int32_t y = 0; y < aSize.height; y++) {
   863     for (int32_t x = 0; x < aSize.width; x += 4) {
   864       int32_t inputIndex = y * aSourceStride + 4 * x;
   865       int32_t targetIndex = y * aTargetStride + 4 * x;
   867       u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
   868       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
   869       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
   871       // Multiply all components with alpha.
   872       p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
   873       p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
   875       // Divide by 255 and pack.
   876       u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
   877                                                   simd::FastDivideBy255_16(p34));
   879       // Get the original alpha channel value back from p1234.
   880       result = simd::Pick(alphaMask, result, p1234);
   882       simd::Store8(&aTargetData[targetIndex], result);
   883     }
   884   }
   885 }
   887 // We use a table of precomputed factors for unpremultiplying.
   888 // We want to compute round(r / (alpha / 255.0f)) for arbitrary values of
   889 // r and alpha in constant time. This table of factors has the property that
   890 // (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives the result we want (with
   891 // a maximum deviation of 1).
   892 //
   893 // sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
   894 //
   895 // This table has been created using the python code
   896 // ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
       //
       // Entry 0 is defined as 0 because the formula would divide by zero there;
       // with a factor of 0, (r * 0 + 128) >> 8 is 0 for every component of a
       // pixel whose alpha is 0. The largest entry is 65280 (alpha == 1), so all
       // factors fit in uint16_t; the last entry (alpha == 255) is 256.
   897 static const uint16_t sAlphaFactors[256] = {
   898   0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935,
   899   5440, 5022, 4663, 4352, 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720,
   900   2611, 2511, 2418, 2331, 2251, 2176, 2106, 2040, 1978, 1920, 1865, 1813, 1764,
   901   1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, 1360, 1332, 1306,
   902   1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036,
   903   1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837,
   904   826, 816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702,
   905   694, 687, 680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604,
   906   599, 593, 588, 583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531,
   907   526, 522, 518, 514, 510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473,
   908   470, 466, 463, 460, 457, 453, 450, 447, 444, 441, 438, 435, 432, 429, 427,
   909   424, 421, 418, 416, 413, 411, 408, 405, 403, 400, 398, 396, 393, 391, 389,
   910   386, 384, 382, 380, 377, 375, 373, 371, 369, 367, 365, 363, 361, 359, 357,
   911   355, 353, 351, 349, 347, 345, 344, 342, 340, 338, 336, 335, 333, 331, 330,
   912   328, 326, 325, 323, 322, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306,
   913   305, 304, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 288, 286,
   914   285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269,
   915   268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256
   916 };
   918 template<typename u16x8_t, typename u8x16_t>
   919 static void
   920 DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
   921                                  uint8_t* aTargetData, int32_t aTargetStride,
   922                                  uint8_t* aSourceData, int32_t aSourceStride)
   923 {
   924   for (int32_t y = 0; y < aSize.height; y++) {
   925     for (int32_t x = 0; x < aSize.width; x += 4) {
   926       int32_t inputIndex = y * aSourceStride + 4 * x;
   927       int32_t targetIndex = y * aTargetStride + 4 * x;
   928       union {
   929         u8x16_t p1234;
   930         uint8_t u8[4][4];
   931       };
   932       p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
   934       // Prepare the alpha factors.
   935       uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   936       uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   937       uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   938       uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
   939       u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
   940       u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
   942       u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
   943       u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
   945       // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
   946       p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
   947       p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
   949       u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
   950       simd::Store8(&aTargetData[targetIndex], result);
   951     }
   952   }
   953 }
   955 template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
   956 static TemporaryRef<DataSourceSurface>
   957 RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
   958                       int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
   959 {
   960 #define RETURN_TURBULENCE(Type, Stitch) \
   961   SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
   962     renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
   963   return renderer.Render(aSize, aOffset);
   965   switch (aType) {
   966     case TURBULENCE_TYPE_TURBULENCE:
   967     {
   968       if (aStitch) {
   969         RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
   970       }
   971       RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
   972     }
   973     case TURBULENCE_TYPE_FRACTAL_NOISE:
   974     {
   975       if (aStitch) {
   976         RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
   977       }
   978       RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
   979     }
   980   }
   981   return nullptr;
   982 #undef RETURN_TURBULENCE
   983 }
   985 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
   986 template<typename i32x4_t, typename i16x8_t>
   987 static MOZ_ALWAYS_INLINE i16x8_t
   988 ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
   989                            const i16x8_t &k1And4, const i16x8_t &k2And3)
   990 {
   991   // Calculate input product: inProd = (in1 * in2) / 255.
   992   i32x4_t inProd_1, inProd_2;
   993   simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
   994   i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
   996   // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
   997   i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
   998   i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
   999   i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
  1000   i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
  1001   i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
  1003   // Calculate k2 * in1 + k3 * in2
  1004   i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
  1005   i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
  1006   i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
  1007   i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
  1009   // Sum everything up and truncate the fractional part.
  1010   i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
  1011   i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
  1012   return simd::PackAndSaturate32To16(result_1, result_2);
  1015 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
  1016 static TemporaryRef<DataSourceSurface>
  1017 ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
  1018                             Float aK1, Float aK2, Float aK3, Float aK4)
  1020   IntSize size = aInput1->GetSize();
  1021   RefPtr<DataSourceSurface> target =
  1022   Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
  1023   if (!target) {
  1024     return nullptr;
  1027   uint8_t* source1Data = aInput1->GetData();
  1028   uint8_t* source2Data = aInput2->GetData();
  1029   uint8_t* targetData = target->GetData();
  1030   uint32_t source1Stride = aInput1->Stride();
  1031   uint32_t source2Stride = aInput2->Stride();
  1032   uint32_t targetStride = target->Stride();
  1034   // The arithmetic combine filter does the following calculation:
  1035   // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
  1036   //
  1037   // Or, with in1/2 integers between 0 and 255:
  1038   // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
  1039   //
  1040   // We want the whole calculation to happen in integer, with 16-bit factors.
  1041   // So we convert our factors to fixed-point with precision 1.8.7.
  1042   // K4 is premultiplied with 255, and it will be multiplied with 128 later
  1043   // during the actual calculation, because premultiplying it with 255 * 128
  1044   // would overflow int16.
  1046   i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
  1047   i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
  1048   i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
  1049   i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
  1051   i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
  1052   i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
  1054   for (int32_t y = 0; y < size.height; y++) {
  1055     for (int32_t x = 0; x < size.width; x += 4) {
  1056       uint32_t source1Index = y * source1Stride + 4 * x;
  1057       uint32_t source2Index = y * source2Stride + 4 * x;
  1058       uint32_t targetIndex = y * targetStride + 4 * x;
  1060       // Load and unpack.
  1061       u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
  1062       u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
  1063       i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
  1064       i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
  1065       i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
  1066       i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
  1068       // Multiply and add.
  1069       i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
  1070       i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
  1072       // Pack and store.
  1073       simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
  1077   return target;
  1080 } // namespace gfx
  1081 } // namespace mozilla

mercurial