gfx/2d/FilterProcessingSIMD-inl.h

Tue, 06 Jan 2015 21:39:09 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Tue, 06 Jan 2015 21:39:09 +0100
branch
TOR_BUG_9701
changeset 8
97036ab72558
permissions
-rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

michael@0 1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
michael@0 2 * This Source Code Form is subject to the terms of the Mozilla Public
michael@0 3 * License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
michael@0 5
michael@0 6 #include "FilterProcessing.h"
michael@0 7
michael@0 8 #include "SIMD.h"
michael@0 9 #include "SVGTurbulenceRenderer-inl.h"
michael@0 10
michael@0 11 namespace mozilla {
michael@0 12 namespace gfx {
michael@0 13
michael@0 14 template<typename u8x16_t>
michael@0 15 inline TemporaryRef<DataSourceSurface>
michael@0 16 ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
michael@0 17 {
michael@0 18 IntSize size = aSurface->GetSize();
michael@0 19 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
michael@0 20 RefPtr<DataSourceSurface> output =
michael@0 21 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
michael@0 22 uint8_t *inputData = input->GetData();
michael@0 23 uint8_t *outputData = output->GetData();
michael@0 24 int32_t inputStride = input->Stride();
michael@0 25 int32_t outputStride = output->Stride();
michael@0 26 switch (input->GetFormat()) {
michael@0 27 case SurfaceFormat::B8G8R8A8:
michael@0 28 output = input;
michael@0 29 break;
michael@0 30 case SurfaceFormat::B8G8R8X8:
michael@0 31 for (int32_t y = 0; y < size.height; y++) {
michael@0 32 for (int32_t x = 0; x < size.width; x++) {
michael@0 33 int32_t inputIndex = y * inputStride + 4 * x;
michael@0 34 int32_t outputIndex = y * outputStride + 4 * x;
michael@0 35 outputData[outputIndex + 0] = inputData[inputIndex + 0];
michael@0 36 outputData[outputIndex + 1] = inputData[inputIndex + 1];
michael@0 37 outputData[outputIndex + 2] = inputData[inputIndex + 2];
michael@0 38 outputData[outputIndex + 3] = 255;
michael@0 39 }
michael@0 40 }
michael@0 41 break;
michael@0 42 case SurfaceFormat::R8G8B8A8:
michael@0 43 for (int32_t y = 0; y < size.height; y++) {
michael@0 44 for (int32_t x = 0; x < size.width; x++) {
michael@0 45 int32_t inputIndex = y * inputStride + 4 * x;
michael@0 46 int32_t outputIndex = y * outputStride + 4 * x;
michael@0 47 outputData[outputIndex + 2] = inputData[inputIndex + 0];
michael@0 48 outputData[outputIndex + 1] = inputData[inputIndex + 1];
michael@0 49 outputData[outputIndex + 0] = inputData[inputIndex + 2];
michael@0 50 outputData[outputIndex + 3] = inputData[inputIndex + 3];
michael@0 51 }
michael@0 52 }
michael@0 53 break;
michael@0 54 case SurfaceFormat::R8G8B8X8:
michael@0 55 for (int32_t y = 0; y < size.height; y++) {
michael@0 56 for (int32_t x = 0; x < size.width; x++) {
michael@0 57 int32_t inputIndex = y * inputStride + 4 * x;
michael@0 58 int32_t outputIndex = y * outputStride + 4 * x;
michael@0 59 outputData[outputIndex + 2] = inputData[inputIndex + 0];
michael@0 60 outputData[outputIndex + 1] = inputData[inputIndex + 1];
michael@0 61 outputData[outputIndex + 0] = inputData[inputIndex + 2];
michael@0 62 outputData[outputIndex + 3] = 255;
michael@0 63 }
michael@0 64 }
michael@0 65 break;
michael@0 66 case SurfaceFormat::A8:
michael@0 67 for (int32_t y = 0; y < size.height; y++) {
michael@0 68 for (int32_t x = 0; x < size.width; x += 16) {
michael@0 69 int32_t inputIndex = y * inputStride + x;
michael@0 70 int32_t outputIndex = y * outputStride + 4 * x;
michael@0 71 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
michael@0 72 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
michael@0 73 // interleaving with 0000000000000000 twice.
michael@0 74 u8x16_t zero = simd::FromZero8<u8x16_t>();
michael@0 75 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
michael@0 76 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
michael@0 77 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
michael@0 78 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
michael@0 79 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
michael@0 80 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
michael@0 81 simd::Store8(&outputData[outputIndex], p1To4);
michael@0 82 if ((x + 4) * 4 < outputStride) {
michael@0 83 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
michael@0 84 }
michael@0 85 if ((x + 8) * 4 < outputStride) {
michael@0 86 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
michael@0 87 }
michael@0 88 if ((x + 12) * 4 < outputStride) {
michael@0 89 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
michael@0 90 }
michael@0 91 }
michael@0 92 }
michael@0 93 break;
michael@0 94 default:
michael@0 95 output = nullptr;
michael@0 96 break;
michael@0 97 }
michael@0 98 return output;
michael@0 99 }
michael@0 100
michael@0 101 template<typename u8x16_t>
michael@0 102 inline void
michael@0 103 ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
michael@0 104 {
michael@0 105 for (int32_t y = 0; y < size.height; y++) {
michael@0 106 for (int32_t x = 0; x < size.width; x += 16) {
michael@0 107 // Process 16 pixels at a time.
michael@0 108 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
michael@0 109 int32_t sourceIndex = y * sourceStride + 4 * x;
michael@0 110 int32_t targetIndex = y * alphaStride + x;
michael@0 111
michael@0 112 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
michael@0 113 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
michael@0 114 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
michael@0 115 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
michael@0 116
michael@0 117 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
michael@0 118 if (4 * (x + 4) < sourceStride) {
michael@0 119 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
michael@0 120 }
michael@0 121 if (4 * (x + 8) < sourceStride) {
michael@0 122 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
michael@0 123 }
michael@0 124 if (4 * (x + 12) < sourceStride) {
michael@0 125 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
michael@0 126 }
michael@0 127
michael@0 128 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
michael@0 129 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
michael@0 130 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
michael@0 131 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
michael@0 132 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
michael@0 133 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
michael@0 134 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
michael@0 135 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
michael@0 136 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
michael@0 137 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
michael@0 138 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
michael@0 139
michael@0 140 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
michael@0 141 }
michael@0 142 }
michael@0 143 }
michael@0 144
michael@0 145 // This function calculates the result color values for four pixels, but for
michael@0 146 // only two color channels - either b & r or g & a. However, the a result will
michael@0 147 // not be used.
michael@0 148 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
michael@0 149 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
michael@0 150 // alpha of all four pixels (and both aaaa's are the same).
michael@0 151 // blendendComponent1 and blendedComponent2 are the out parameters.
michael@0 152 template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
michael@0 153 inline void
michael@0 154 BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
michael@0 155 i16x8_t dest, const i16x8_t& destAlpha,
michael@0 156 i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
michael@0 157 {
michael@0 158 i16x8_t x255 = simd::FromI16<i16x8_t>(255);
michael@0 159
michael@0 160 switch (aBlendMode) {
michael@0 161
michael@0 162 case BLEND_MODE_MULTIPLY:
michael@0 163 {
michael@0 164 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
michael@0 165 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
michael@0 166 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
michael@0 167 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
michael@0 168
michael@0 169 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
michael@0 170 i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
michael@0 171 blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
michael@0 172 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
michael@0 173
michael@0 174 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
michael@0 175 i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
michael@0 176 blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
michael@0 177 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
michael@0 178
michael@0 179 break;
michael@0 180 }
michael@0 181
michael@0 182 case BLEND_MODE_SCREEN:
michael@0 183 {
michael@0 184 // val = 255 * (source + dest) + (0 - dest) * source;
michael@0 185 i16x8_t sourcePlusDest = simd::Add16(source, dest);
michael@0 186 i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
michael@0 187
michael@0 188 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
michael@0 189 i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
michael@0 190 blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
michael@0 191 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
michael@0 192
michael@0 193 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
michael@0 194 i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
michael@0 195 blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
michael@0 196 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
michael@0 197
michael@0 198 break;
michael@0 199 }
michael@0 200
michael@0 201 case BLEND_MODE_DARKEN:
michael@0 202 case BLEND_MODE_LIGHTEN:
michael@0 203 {
michael@0 204 // Darken:
michael@0 205 // val = min((255 - destAlpha) * source + 255 * dest,
michael@0 206 // 255 * source + (255 - sourceAlpha) * dest);
michael@0 207 //
michael@0 208 // Lighten:
michael@0 209 // val = max((255 - destAlpha) * source + 255 * dest,
michael@0 210 // 255 * source + (255 - sourceAlpha) * dest);
michael@0 211
michael@0 212 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
michael@0 213 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
michael@0 214
michael@0 215 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
michael@0 216 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
michael@0 217 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
michael@0 218 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
michael@0 219 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
michael@0 220 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
michael@0 221 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
michael@0 222
michael@0 223 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
michael@0 224 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
michael@0 225 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
michael@0 226 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
michael@0 227 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
michael@0 228 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
michael@0 229 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
michael@0 230
michael@0 231 break;
michael@0 232 }
michael@0 233
michael@0 234 }
michael@0 235 }
michael@0 236
michael@0 237 // The alpha channel is subject to a different calculation than the RGB
michael@0 238 // channels, and this calculation is the same for all blend modes:
michael@0 239 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
michael@0 240 template<typename i16x8_t, typename i32x4_t>
michael@0 241 inline i32x4_t
michael@0 242 BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
michael@0 243 {
michael@0 244 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
michael@0 245 // appropriately. The calculation is rewritten as follows:
michael@0 246 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
michael@0 247 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
michael@0 248 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
michael@0 249 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
michael@0 250 i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
michael@0 251 i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
michael@0 252 i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
michael@0 253 i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
michael@0 254 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
michael@0 255 }
michael@0 256
michael@0 257 template<typename u8x16_t, typename i16x8_t>
michael@0 258 inline void
michael@0 259 UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
michael@0 260 i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
michael@0 261 {
michael@0 262 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
michael@0 263 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
michael@0 264 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
michael@0 265 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
michael@0 266 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
michael@0 267 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
michael@0 268 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
michael@0 269 }
michael@0 270
michael@0 271 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
michael@0 272 inline u8x16_t
michael@0 273 ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
michael@0 274 i32x4_t rrrr1234, const i32x4_t& aaaa1234)
michael@0 275 {
michael@0 276 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
michael@0 277 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
michael@0 278 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
michael@0 279 i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
michael@0 280 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
michael@0 281 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
michael@0 282 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
michael@0 283 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
michael@0 284 }
michael@0 285
michael@0 286 template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
michael@0 287 inline TemporaryRef<DataSourceSurface>
michael@0 288 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
michael@0 289 {
michael@0 290 IntSize size = aInput1->GetSize();
michael@0 291 RefPtr<DataSourceSurface> target =
michael@0 292 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
michael@0 293 if (!target) {
michael@0 294 return nullptr;
michael@0 295 }
michael@0 296
michael@0 297 uint8_t* source1Data = aInput1->GetData();
michael@0 298 uint8_t* source2Data = aInput2->GetData();
michael@0 299 uint8_t* targetData = target->GetData();
michael@0 300 int32_t targetStride = target->Stride();
michael@0 301 int32_t source1Stride = aInput1->Stride();
michael@0 302 int32_t source2Stride = aInput2->Stride();
michael@0 303
michael@0 304 for (int32_t y = 0; y < size.height; y++) {
michael@0 305 for (int32_t x = 0; x < size.width; x += 4) {
michael@0 306 int32_t targetIndex = y * targetStride + 4 * x;
michael@0 307 int32_t source1Index = y * source1Stride + 4 * x;
michael@0 308 int32_t source2Index = y * source2Stride + 4 * x;
michael@0 309
michael@0 310 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
michael@0 311 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
michael@0 312
michael@0 313 // The blending calculation for the RGB channels all need access to the
michael@0 314 // alpha channel of their pixel, and the alpha calculation is different,
michael@0 315 // so it makes sense to separate by channel.
michael@0 316
michael@0 317 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
michael@0 318 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
michael@0 319 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
michael@0 320 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
michael@0 321 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
michael@0 322 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
michael@0 323
michael@0 324 // We only use blendedB, blendedG and blendedR.
michael@0 325 i32x4_t blendedB, blendedG, blendedR, blendedA;
michael@0 326 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
michael@0 327 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
michael@0 328
michael@0 329 // Throw away blendedA and overwrite it with the correct blended alpha.
michael@0 330 blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
michael@0 331
michael@0 332 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
michael@0 333 simd::Store8(&targetData[targetIndex], result1234);
michael@0 334 }
michael@0 335 }
michael@0 336
michael@0 337 return target;
michael@0 338 }
michael@0 339
michael@0 340 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
michael@0 341 static TemporaryRef<DataSourceSurface>
michael@0 342 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
michael@0 343 BlendMode aBlendMode)
michael@0 344 {
michael@0 345 switch (aBlendMode) {
michael@0 346 case BLEND_MODE_MULTIPLY:
michael@0 347 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
michael@0 348 case BLEND_MODE_SCREEN:
michael@0 349 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
michael@0 350 case BLEND_MODE_DARKEN:
michael@0 351 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
michael@0 352 case BLEND_MODE_LIGHTEN:
michael@0 353 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
michael@0 354 default:
michael@0 355 return nullptr;
michael@0 356 }
michael@0 357 }
michael@0 358
michael@0 359 template<MorphologyOperator Operator, typename u8x16_t>
michael@0 360 static u8x16_t
michael@0 361 Morph8(u8x16_t a, u8x16_t b)
michael@0 362 {
michael@0 363 return Operator == MORPHOLOGY_OPERATOR_ERODE ?
michael@0 364 simd::Min8(a, b) : simd::Max8(a, b);
michael@0 365 }
michael@0 366
michael@0 367 // Set every pixel to the per-component minimum or maximum of the pixels around
michael@0 368 // it that are up to aRadius pixels away from it (horizontally).
michael@0 369 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
michael@0 370 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
michael@0 371 uint8_t* aDestData, int32_t aDestStride,
michael@0 372 const IntRect& aDestRect, int32_t aRadius)
michael@0 373 {
michael@0 374 static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
michael@0 375 op == MORPHOLOGY_OPERATOR_DILATE,
michael@0 376 "unexpected morphology operator");
michael@0 377
michael@0 378 int32_t kernelSize = aRadius + 1 + aRadius;
michael@0 379 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
michael@0 380 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
michael@0 381 int32_t completeKernelSizeForFourPixels = kernelSize + 3;
michael@0 382 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
michael@0 383 completeKernelSizeForFourPixels % 4 == 2);
michael@0 384
michael@0 385 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
michael@0 386 // the way we need them to be.
michael@0 387
michael@0 388 IntRect sourceRect = aDestRect;
michael@0 389 sourceRect.Inflate(aRadius, 0);
michael@0 390
michael@0 391 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
michael@0 392 int32_t kernelStartX = aDestRect.x - aRadius;
michael@0 393 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
michael@0 394 // We process four pixels (16 color values) at a time.
michael@0 395 // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
michael@0 396 // source values can be read beyond that because the source is extended
michael@0 397 // by aRadius pixels.
michael@0 398
michael@0 399 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
michael@0 400 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
michael@0 401 u8x16_t m1234 = p1234;
michael@0 402
michael@0 403 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
michael@0 404 u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
michael@0 405 simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
michael@0 406 simd::FromZero8<u8x16_t>();
michael@0 407 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
michael@0 408 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
michael@0 409 m1234 = Morph8<op,u8x16_t>(m1234, p2345);
michael@0 410 m1234 = Morph8<op,u8x16_t>(m1234, p3456);
michael@0 411 if (i + 2 < completeKernelSizeForFourPixels) {
michael@0 412 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
michael@0 413 m1234 = Morph8<op,u8x16_t>(m1234, p4567);
michael@0 414 m1234 = Morph8<op,u8x16_t>(m1234, p5678);
michael@0 415 }
michael@0 416 p1234 = p5678;
michael@0 417 }
michael@0 418
michael@0 419 int32_t destIndex = y * aDestStride + 4 * x;
michael@0 420 simd::Store8(&aDestData[destIndex], m1234);
michael@0 421 }
michael@0 422 }
michael@0 423 }
michael@0 424
michael@0 425 template<typename i16x8_t, typename u8x16_t>
michael@0 426 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
michael@0 427 uint8_t* aDestData, int32_t aDestStride,
michael@0 428 const IntRect& aDestRect, int32_t aRadius,
michael@0 429 MorphologyOperator aOp)
michael@0 430 {
michael@0 431 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
michael@0 432 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
michael@0 433 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
michael@0 434 } else {
michael@0 435 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
michael@0 436 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
michael@0 437 }
michael@0 438 }
michael@0 439
michael@0 440 // Set every pixel to the per-component minimum or maximum of the pixels around
michael@0 441 // it that are up to aRadius pixels away from it (vertically).
michael@0 442 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
michael@0 443 static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
michael@0 444 uint8_t* aDestData, int32_t aDestStride,
michael@0 445 const IntRect& aDestRect, int32_t aRadius)
michael@0 446 {
michael@0 447 static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
michael@0 448 op == MORPHOLOGY_OPERATOR_DILATE,
michael@0 449 "unexpected morphology operator");
michael@0 450
michael@0 451 int32_t startY = aDestRect.y - aRadius;
michael@0 452 int32_t endY = aDestRect.y + aRadius;
michael@0 453 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
michael@0 454 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
michael@0 455 int32_t sourceIndex = startY * aSourceStride + 4 * x;
michael@0 456 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
michael@0 457 sourceIndex += aSourceStride;
michael@0 458 for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
michael@0 459 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
michael@0 460 u = Morph8<op,u8x16_t>(u, u2);
michael@0 461 }
michael@0 462
michael@0 463 int32_t destIndex = y * aDestStride + 4 * x;
michael@0 464 simd::Store8(&aDestData[destIndex], u);
michael@0 465 }
michael@0 466 }
michael@0 467 }
michael@0 468
michael@0 469 template<typename i16x8_t, typename u8x16_t>
michael@0 470 inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
michael@0 471 uint8_t* aDestData, int32_t aDestStride,
michael@0 472 const IntRect& aDestRect, int32_t aRadius,
michael@0 473 MorphologyOperator aOp)
michael@0 474 {
michael@0 475 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
michael@0 476 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
michael@0 477 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
michael@0 478 } else {
michael@0 479 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
michael@0 480 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
michael@0 481 }
michael@0 482 }
michael@0 483
michael@0 484 template<typename i32x4_t, typename i16x8_t>
michael@0 485 static i32x4_t
michael@0 486 ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
michael@0 487 {
michael@0 488 // int16_t p[8] == { b, g, r, a, b, g, r, a }.
michael@0 489 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
michael@0 490 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
michael@0 491 // int32_t bias[4] == { _B, _G, _R, _A }.
michael@0 492
michael@0 493 i32x4_t sum = bias;
michael@0 494
michael@0 495 // int16_t bg[8] = { b, g, b, g, b, g, b, g };
michael@0 496 i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
michael@0 497 // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
michael@0 498 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
michael@0 499 sum = simd::Add32(sum, prodsum_bg);
michael@0 500
michael@0 501 // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
michael@0 502 i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
michael@0 503 // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
michael@0 504 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
michael@0 505 sum = simd::Add32(sum, prodsum_ra);
michael@0 506
michael@0 507 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
michael@0 508 return sum;
michael@0 509 }
michael@0 510
michael@0 511 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
michael@0 512 static TemporaryRef<DataSourceSurface>
michael@0 513 ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
michael@0 514 {
michael@0 515 IntSize size = aInput->GetSize();
michael@0 516 RefPtr<DataSourceSurface> target =
michael@0 517 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
michael@0 518 if (!target) {
michael@0 519 return nullptr;
michael@0 520 }
michael@0 521
michael@0 522 uint8_t* sourceData = aInput->GetData();
michael@0 523 uint8_t* targetData = target->GetData();
michael@0 524 int32_t sourceStride = aInput->Stride();
michael@0 525 int32_t targetStride = target->Stride();
michael@0 526
michael@0 527 const int16_t factor = 128;
michael@0 528 const Float floatElementMax = INT16_MAX / factor; // 255
michael@0 529 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
michael@0 530
michael@0 531 const Float *floats = &aMatrix._11;
michael@0 532
michael@0 533 ptrdiff_t componentOffsets[4] = {
michael@0 534 B8G8R8A8_COMPONENT_BYTEOFFSET_R,
michael@0 535 B8G8R8A8_COMPONENT_BYTEOFFSET_G,
michael@0 536 B8G8R8A8_COMPONENT_BYTEOFFSET_B,
michael@0 537 B8G8R8A8_COMPONENT_BYTEOFFSET_A
michael@0 538 };
michael@0 539
michael@0 540 // We store the color matrix in rows_bgra in the following format:
michael@0 541 // { bB, bG, bR, bA, gB, gG, gR, gA }.
michael@0 542 // { bB, gB, bG, gG, bR, gR, bA, gA }
michael@0 543 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
michael@0 544 // which works especially well for our use case.
michael@0 545 int16_t rows_bgra[2][8];
michael@0 546 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
michael@0 547 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
michael@0 548 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
michael@0 549 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
michael@0 550 int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
michael@0 551 int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
michael@0 552 int8_t g_or_a = componentOffsets[rowIndex] % 2;
michael@0 553 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
michael@0 554 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
michael@0 555 }
michael@0 556 }
michael@0 557
michael@0 558 int32_t rowBias[4];
michael@0 559 Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
michael@0 560 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
michael@0 561 size_t rowIndex = 4;
michael@0 562 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
michael@0 563 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
michael@0 564 int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
michael@0 565 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
michael@0 566 }
michael@0 567
michael@0 568 i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
michael@0 569 rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
michael@0 570 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
michael@0 571
michael@0 572 i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
michael@0 573 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
michael@0 574 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
michael@0 575
michael@0 576 i32x4_t rowsBias_v =
michael@0 577 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
michael@0 578
michael@0 579 for (int32_t y = 0; y < size.height; y++) {
michael@0 580 for (int32_t x = 0; x < size.width; x += 4) {
michael@0 581 MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
michael@0 582 MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
michael@0 583 int32_t sourceIndex = y * sourceStride + 4 * x;
michael@0 584 int32_t targetIndex = y * targetStride + 4 * x;
michael@0 585
michael@0 586 // We load 4 pixels, unpack them, process them 1 pixel at a time, and
michael@0 587 // finally pack and store the 4 result pixels.
michael@0 588
michael@0 589 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
michael@0 590
michael@0 591 // Splat needed to get each pixel twice into i16x8
michael@0 592 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
michael@0 593 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
michael@0 594 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
michael@0 595 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
michael@0 596
michael@0 597 i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
michael@0 598 i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
michael@0 599 i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
michael@0 600 i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
michael@0 601
michael@0 602 static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
michael@0 603 u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
michael@0 604 simd::ShiftRight32<7>(result_p2),
michael@0 605 simd::ShiftRight32<7>(result_p3),
michael@0 606 simd::ShiftRight32<7>(result_p4));
michael@0 607 simd::Store8(&targetData[targetIndex], result_p1234);
michael@0 608 }
michael@0 609 }
michael@0 610
michael@0 611 return target;
michael@0 612 }
michael@0 613
michael@0 614 // source / dest: bgra bgra
michael@0 615 // sourceAlpha / destAlpha: aaaa aaaa
michael@0 616 // result: bgra bgra
michael@0 617 template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
michael@0 618 static inline u16x8_t
michael@0 619 CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
michael@0 620 {
michael@0 621 u16x8_t x255 = simd::FromU16<u16x8_t>(255);
michael@0 622
michael@0 623 switch (aCompositeOperator) {
michael@0 624
michael@0 625 case COMPOSITE_OPERATOR_OVER:
michael@0 626 {
michael@0 627 // val = dest * (255 - sourceAlpha) + source * 255;
michael@0 628 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
michael@0 629
michael@0 630 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
michael@0 631 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
michael@0 632 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
michael@0 633
michael@0 634 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
michael@0 635 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
michael@0 636 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
michael@0 637
michael@0 638 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
michael@0 639 simd::FastDivideBy255(result2));
michael@0 640 }
michael@0 641
michael@0 642 case COMPOSITE_OPERATOR_IN:
michael@0 643 {
michael@0 644 // val = source * destAlpha;
michael@0 645 return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
michael@0 646 }
michael@0 647
michael@0 648 case COMPOSITE_OPERATOR_OUT:
michael@0 649 {
michael@0 650 // val = source * (255 - destAlpha);
michael@0 651 u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
michael@0 652 return simd::FastDivideBy255_16(prod);
michael@0 653 }
michael@0 654
michael@0 655 case COMPOSITE_OPERATOR_ATOP:
michael@0 656 {
michael@0 657 // val = dest * (255 - sourceAlpha) + source * destAlpha;
michael@0 658 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
michael@0 659
michael@0 660 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
michael@0 661 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
michael@0 662 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
michael@0 663
michael@0 664 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
michael@0 665 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
michael@0 666 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
michael@0 667
michael@0 668 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
michael@0 669 simd::FastDivideBy255(result2));
michael@0 670 }
michael@0 671
michael@0 672 case COMPOSITE_OPERATOR_XOR:
michael@0 673 {
michael@0 674 // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
michael@0 675 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
michael@0 676 u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
michael@0 677
michael@0 678 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
michael@0 679 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
michael@0 680 twoFiftyFiveMinusDestAlpha);
michael@0 681 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
michael@0 682
michael@0 683 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
michael@0 684 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
michael@0 685 twoFiftyFiveMinusDestAlpha);
michael@0 686 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
michael@0 687
michael@0 688 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
michael@0 689 simd::FastDivideBy255(result2));
michael@0 690 }
michael@0 691
michael@0 692 default:
michael@0 693 return simd::FromU16<u16x8_t>(0);
michael@0 694
michael@0 695 }
michael@0 696 }
michael@0 697
michael@0 698 template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
michael@0 699 static void
michael@0 700 ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
michael@0 701 {
michael@0 702 IntSize size = aDest->GetSize();
michael@0 703
michael@0 704 uint8_t* sourceData = aSource->GetData();
michael@0 705 uint8_t* destData = aDest->GetData();
michael@0 706 uint32_t sourceStride = aSource->Stride();
michael@0 707 uint32_t destStride = aDest->Stride();
michael@0 708
michael@0 709 for (int32_t y = 0; y < size.height; y++) {
michael@0 710 for (int32_t x = 0; x < size.width; x += 4) {
michael@0 711 uint32_t sourceIndex = y * sourceStride + 4 * x;
michael@0 712 uint32_t destIndex = y * destStride + 4 * x;
michael@0 713
michael@0 714 u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
michael@0 715 u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
michael@0 716
michael@0 717 u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
michael@0 718 u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
michael@0 719 u16x8_t sa12 = simd::Splat16<3,3>(s12);
michael@0 720 u16x8_t da12 = simd::Splat16<3,3>(d12);
michael@0 721 u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
michael@0 722
michael@0 723 u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
michael@0 724 u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
michael@0 725 u16x8_t sa34 = simd::Splat16<3,3>(s34);
michael@0 726 u16x8_t da34 = simd::Splat16<3,3>(d34);
michael@0 727 u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
michael@0 728
michael@0 729 u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
michael@0 730 simd::Store8(&destData[destIndex], result1234);
michael@0 731 }
michael@0 732 }
michael@0 733 }
michael@0 734
michael@0 735 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
michael@0 736 static void
michael@0 737 ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
michael@0 738 CompositeOperator aOperator)
michael@0 739 {
michael@0 740 switch (aOperator) {
michael@0 741 case COMPOSITE_OPERATOR_OVER:
michael@0 742 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
michael@0 743 break;
michael@0 744 case COMPOSITE_OPERATOR_IN:
michael@0 745 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
michael@0 746 break;
michael@0 747 case COMPOSITE_OPERATOR_OUT:
michael@0 748 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
michael@0 749 break;
michael@0 750 case COMPOSITE_OPERATOR_ATOP:
michael@0 751 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
michael@0 752 break;
michael@0 753 case COMPOSITE_OPERATOR_XOR:
michael@0 754 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
michael@0 755 break;
michael@0 756 default:
michael@0 757 MOZ_CRASH();
michael@0 758 }
michael@0 759 }
michael@0 760
michael@0 761 template<typename u8x16_t>
michael@0 762 static void
michael@0 763 SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
michael@0 764 uint8_t* channel0Data, uint8_t* channel1Data,
michael@0 765 uint8_t* channel2Data, uint8_t* channel3Data,
michael@0 766 int32_t channelStride)
michael@0 767 {
michael@0 768 for (int32_t y = 0; y < size.height; y++) {
michael@0 769 for (int32_t x = 0; x < size.width; x += 16) {
michael@0 770 // Process 16 pixels at a time.
michael@0 771 int32_t sourceIndex = y * sourceStride + 4 * x;
michael@0 772 int32_t targetIndex = y * channelStride + x;
michael@0 773
michael@0 774 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
michael@0 775 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
michael@0 776 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
michael@0 777 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
michael@0 778
michael@0 779 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
michael@0 780 if (4 * (x + 4) < sourceStride) {
michael@0 781 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
michael@0 782 }
michael@0 783 if (4 * (x + 8) < sourceStride) {
michael@0 784 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
michael@0 785 }
michael@0 786 if (4 * (x + 12) < sourceStride) {
michael@0 787 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
michael@0 788 }
michael@0 789
michael@0 790 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
michael@0 791 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
michael@0 792 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
michael@0 793 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
michael@0 794 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
michael@0 795 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
michael@0 796 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
michael@0 797 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
michael@0 798 u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
michael@0 799 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
michael@0 800 u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
michael@0 801 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
michael@0 802 u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
michael@0 803 u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
michael@0 804 u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
michael@0 805 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
michael@0 806
michael@0 807 simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
michael@0 808 simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
michael@0 809 simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
michael@0 810 simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
michael@0 811 }
michael@0 812 }
michael@0 813 }
michael@0 814
michael@0 815 template<typename u8x16_t>
michael@0 816 static void
michael@0 817 CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
michael@0 818 {
michael@0 819 for (int32_t y = 0; y < size.height; y++) {
michael@0 820 for (int32_t x = 0; x < size.width; x += 16) {
michael@0 821 // Process 16 pixels at a time.
michael@0 822 int32_t resultIndex = y * resultStride + 4 * x;
michael@0 823 int32_t channelIndex = y * channelStride + x;
michael@0 824
michael@0 825 u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
michael@0 826 u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
michael@0 827 u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
michael@0 828 u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
michael@0 829
michael@0 830 u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
michael@0 831 u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
michael@0 832 u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
michael@0 833 u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
michael@0 834
michael@0 835 u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
michael@0 836 u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
michael@0 837 u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
michael@0 838 u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
michael@0 839
michael@0 840 simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
michael@0 841 if (4 * (x + 4) < resultStride) {
michael@0 842 simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
michael@0 843 }
michael@0 844 if (4 * (x + 8) < resultStride) {
michael@0 845 simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
michael@0 846 }
michael@0 847 if (4 * (x + 12) < resultStride) {
michael@0 848 simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
michael@0 849 }
michael@0 850 }
michael@0 851 }
michael@0 852 }
michael@0 853
michael@0 854
michael@0 855 template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
michael@0 856 static void
michael@0 857 DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
michael@0 858 uint8_t* aTargetData, int32_t aTargetStride,
michael@0 859 uint8_t* aSourceData, int32_t aSourceStride)
michael@0 860 {
michael@0 861 const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
michael@0 862 for (int32_t y = 0; y < aSize.height; y++) {
michael@0 863 for (int32_t x = 0; x < aSize.width; x += 4) {
michael@0 864 int32_t inputIndex = y * aSourceStride + 4 * x;
michael@0 865 int32_t targetIndex = y * aTargetStride + 4 * x;
michael@0 866
michael@0 867 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
michael@0 868 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
michael@0 869 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
michael@0 870
michael@0 871 // Multiply all components with alpha.
michael@0 872 p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
michael@0 873 p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
michael@0 874
michael@0 875 // Divide by 255 and pack.
michael@0 876 u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
michael@0 877 simd::FastDivideBy255_16(p34));
michael@0 878
michael@0 879 // Get the original alpha channel value back from p1234.
michael@0 880 result = simd::Pick(alphaMask, result, p1234);
michael@0 881
michael@0 882 simd::Store8(&aTargetData[targetIndex], result);
michael@0 883 }
michael@0 884 }
michael@0 885 }
michael@0 886
// Precomputed factors for unpremultiplying in constant time.
//
// The goal is round(r / (alpha / 255.0f)) for arbitrary r and alpha. With
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
//
// the expression (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives that
// result (with a maximum deviation of 1). The entry for alpha == 0 is 0.
//
// Generated with the python code
// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
//
// Laid out 16 entries per row; the trailing comment gives the alpha range.
static const uint16_t sAlphaFactors[256] = {
  0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935, 5440, 5022, 4663, 4352, // 0-15
  4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720, 2611, 2511, 2418, 2331, 2251, 2176, 2106, // 16-31
  2040, 1978, 1920, 1865, 1813, 1764, 1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, // 32-47
  1360, 1332, 1306, 1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036, // 48-63
  1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837, 826, // 64-79
  816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702, 694, 687, // 80-95
  680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604, 599, 593, 588, // 96-111
  583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531, 526, 522, 518, 514, // 112-127
  510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473, 470, 466, 463, 460, 457, // 128-143
  453, 450, 447, 444, 441, 438, 435, 432, 429, 427, 424, 421, 418, 416, 413, 411, // 144-159
  408, 405, 403, 400, 398, 396, 393, 391, 389, 386, 384, 382, 380, 377, 375, 373, // 160-175
  371, 369, 367, 365, 363, 361, 359, 357, 355, 353, 351, 349, 347, 345, 344, 342, // 176-191
  340, 338, 336, 335, 333, 331, 330, 328, 326, 325, 323, 322, 320, 318, 317, 315, // 192-207
  314, 312, 311, 309, 308, 306, 305, 304, 302, 301, 299, 298, 297, 295, 294, 293, // 208-223
  291, 290, 289, 288, 286, 285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, // 224-239
  272, 271, 270, 269, 268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256  // 240-255
};
michael@0 917
michael@0 918 template<typename u16x8_t, typename u8x16_t>
michael@0 919 static void
michael@0 920 DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
michael@0 921 uint8_t* aTargetData, int32_t aTargetStride,
michael@0 922 uint8_t* aSourceData, int32_t aSourceStride)
michael@0 923 {
michael@0 924 for (int32_t y = 0; y < aSize.height; y++) {
michael@0 925 for (int32_t x = 0; x < aSize.width; x += 4) {
michael@0 926 int32_t inputIndex = y * aSourceStride + 4 * x;
michael@0 927 int32_t targetIndex = y * aTargetStride + 4 * x;
michael@0 928 union {
michael@0 929 u8x16_t p1234;
michael@0 930 uint8_t u8[4][4];
michael@0 931 };
michael@0 932 p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
michael@0 933
michael@0 934 // Prepare the alpha factors.
michael@0 935 uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
michael@0 936 uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
michael@0 937 uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
michael@0 938 uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
michael@0 939 u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
michael@0 940 u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
michael@0 941
michael@0 942 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
michael@0 943 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
michael@0 944
michael@0 945 // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
michael@0 946 p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
michael@0 947 p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
michael@0 948
michael@0 949 u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
michael@0 950 simd::Store8(&aTargetData[targetIndex], result);
michael@0 951 }
michael@0 952 }
michael@0 953 }
michael@0 954
michael@0 955 template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
michael@0 956 static TemporaryRef<DataSourceSurface>
michael@0 957 RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
michael@0 958 int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
michael@0 959 {
michael@0 960 #define RETURN_TURBULENCE(Type, Stitch) \
michael@0 961 SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
michael@0 962 renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
michael@0 963 return renderer.Render(aSize, aOffset);
michael@0 964
michael@0 965 switch (aType) {
michael@0 966 case TURBULENCE_TYPE_TURBULENCE:
michael@0 967 {
michael@0 968 if (aStitch) {
michael@0 969 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
michael@0 970 }
michael@0 971 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
michael@0 972 }
michael@0 973 case TURBULENCE_TYPE_FRACTAL_NOISE:
michael@0 974 {
michael@0 975 if (aStitch) {
michael@0 976 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
michael@0 977 }
michael@0 978 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
michael@0 979 }
michael@0 980 }
michael@0 981 return nullptr;
michael@0 982 #undef RETURN_TURBULENCE
michael@0 983 }
michael@0 984
michael@0 985 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
michael@0 986 template<typename i32x4_t, typename i16x8_t>
michael@0 987 static MOZ_ALWAYS_INLINE i16x8_t
michael@0 988 ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
michael@0 989 const i16x8_t &k1And4, const i16x8_t &k2And3)
michael@0 990 {
michael@0 991 // Calculate input product: inProd = (in1 * in2) / 255.
michael@0 992 i32x4_t inProd_1, inProd_2;
michael@0 993 simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
michael@0 994 i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
michael@0 995
michael@0 996 // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
michael@0 997 i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
michael@0 998 i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
michael@0 999 i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
michael@0 1000 i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
michael@0 1001 i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
michael@0 1002
michael@0 1003 // Calculate k2 * in1 + k3 * in2
michael@0 1004 i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
michael@0 1005 i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
michael@0 1006 i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
michael@0 1007 i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
michael@0 1008
michael@0 1009 // Sum everything up and truncate the fractional part.
michael@0 1010 i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
michael@0 1011 i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
michael@0 1012 return simd::PackAndSaturate32To16(result_1, result_2);
michael@0 1013 }
michael@0 1014
michael@0 1015 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
michael@0 1016 static TemporaryRef<DataSourceSurface>
michael@0 1017 ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
michael@0 1018 Float aK1, Float aK2, Float aK3, Float aK4)
michael@0 1019 {
michael@0 1020 IntSize size = aInput1->GetSize();
michael@0 1021 RefPtr<DataSourceSurface> target =
michael@0 1022 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
michael@0 1023 if (!target) {
michael@0 1024 return nullptr;
michael@0 1025 }
michael@0 1026
michael@0 1027 uint8_t* source1Data = aInput1->GetData();
michael@0 1028 uint8_t* source2Data = aInput2->GetData();
michael@0 1029 uint8_t* targetData = target->GetData();
michael@0 1030 uint32_t source1Stride = aInput1->Stride();
michael@0 1031 uint32_t source2Stride = aInput2->Stride();
michael@0 1032 uint32_t targetStride = target->Stride();
michael@0 1033
michael@0 1034 // The arithmetic combine filter does the following calculation:
michael@0 1035 // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
michael@0 1036 //
michael@0 1037 // Or, with in1/2 integers between 0 and 255:
michael@0 1038 // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
michael@0 1039 //
michael@0 1040 // We want the whole calculation to happen in integer, with 16-bit factors.
michael@0 1041 // So we convert our factors to fixed-point with precision 1.8.7.
michael@0 1042 // K4 is premultiplied with 255, and it will be multiplied with 128 later
michael@0 1043 // during the actual calculation, because premultiplying it with 255 * 128
michael@0 1044 // would overflow int16.
michael@0 1045
michael@0 1046 i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
michael@0 1047 i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
michael@0 1048 i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
michael@0 1049 i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
michael@0 1050
michael@0 1051 i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
michael@0 1052 i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
michael@0 1053
michael@0 1054 for (int32_t y = 0; y < size.height; y++) {
michael@0 1055 for (int32_t x = 0; x < size.width; x += 4) {
michael@0 1056 uint32_t source1Index = y * source1Stride + 4 * x;
michael@0 1057 uint32_t source2Index = y * source2Stride + 4 * x;
michael@0 1058 uint32_t targetIndex = y * targetStride + 4 * x;
michael@0 1059
michael@0 1060 // Load and unpack.
michael@0 1061 u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
michael@0 1062 u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
michael@0 1063 i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
michael@0 1064 i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
michael@0 1065 i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
michael@0 1066 i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
michael@0 1067
michael@0 1068 // Multiply and add.
michael@0 1069 i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
michael@0 1070 i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
michael@0 1071
michael@0 1072 // Pack and store.
michael@0 1073 simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
michael@0 1074 }
michael@0 1075 }
michael@0 1076
michael@0 1077 return target;
michael@0 1078 }
michael@0 1079
michael@0 1080 } // namespace gfx
michael@0 1081 } // namespace mozilla

mercurial