1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/2d/FilterProcessingSIMD-inl.h Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1081 @@ 1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- 1.5 + * This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "FilterProcessing.h" 1.10 + 1.11 +#include "SIMD.h" 1.12 +#include "SVGTurbulenceRenderer-inl.h" 1.13 + 1.14 +namespace mozilla { 1.15 +namespace gfx { 1.16 + 1.17 +template<typename u8x16_t> 1.18 +inline TemporaryRef<DataSourceSurface> 1.19 +ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface) 1.20 +{ 1.21 + IntSize size = aSurface->GetSize(); 1.22 + RefPtr<DataSourceSurface> input = aSurface->GetDataSurface(); 1.23 + RefPtr<DataSourceSurface> output = 1.24 + Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 1.25 + uint8_t *inputData = input->GetData(); 1.26 + uint8_t *outputData = output->GetData(); 1.27 + int32_t inputStride = input->Stride(); 1.28 + int32_t outputStride = output->Stride(); 1.29 + switch (input->GetFormat()) { 1.30 + case SurfaceFormat::B8G8R8A8: 1.31 + output = input; 1.32 + break; 1.33 + case SurfaceFormat::B8G8R8X8: 1.34 + for (int32_t y = 0; y < size.height; y++) { 1.35 + for (int32_t x = 0; x < size.width; x++) { 1.36 + int32_t inputIndex = y * inputStride + 4 * x; 1.37 + int32_t outputIndex = y * outputStride + 4 * x; 1.38 + outputData[outputIndex + 0] = inputData[inputIndex + 0]; 1.39 + outputData[outputIndex + 1] = inputData[inputIndex + 1]; 1.40 + outputData[outputIndex + 2] = inputData[inputIndex + 2]; 1.41 + outputData[outputIndex + 3] = 255; 1.42 + } 1.43 + } 1.44 + break; 1.45 + case SurfaceFormat::R8G8B8A8: 1.46 + for (int32_t y = 0; y < size.height; y++) { 1.47 + for (int32_t x = 0; x < size.width; x++) { 1.48 + int32_t inputIndex = y * inputStride + 4 * x; 1.49 + int32_t outputIndex = y * outputStride + 4 * x; 1.50 + outputData[outputIndex + 2] = inputData[inputIndex + 0]; 1.51 + outputData[outputIndex + 1] = inputData[inputIndex + 1]; 1.52 + outputData[outputIndex + 0] = inputData[inputIndex + 2]; 1.53 + outputData[outputIndex + 3] = inputData[inputIndex + 3]; 1.54 + } 1.55 + } 1.56 + break; 1.57 + case SurfaceFormat::R8G8B8X8: 1.58 + for (int32_t y = 0; y < size.height; y++) { 1.59 + for (int32_t x = 0; x < size.width; x++) { 1.60 + int32_t inputIndex = y * inputStride + 4 * x; 1.61 + int32_t outputIndex = y * outputStride + 4 * x; 1.62 + outputData[outputIndex + 2] = inputData[inputIndex + 0]; 1.63 + outputData[outputIndex + 1] = inputData[inputIndex + 1]; 1.64 + outputData[outputIndex + 0] = inputData[inputIndex + 2]; 1.65 + outputData[outputIndex + 3] = 255; 1.66 + } 1.67 + } 1.68 + break; 1.69 + case SurfaceFormat::A8: 1.70 + for (int32_t y = 0; y < size.height; y++) { 1.71 + for (int32_t x = 0; x < size.width; x += 16) { 1.72 + int32_t inputIndex = y * inputStride + x; 1.73 + int32_t outputIndex = y * outputStride + 4 * x; 1.74 + u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]); 1.75 + // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by 1.76 + // interleaving with 0000000000000000 twice. 1.77 + u8x16_t zero = simd::FromZero8<u8x16_t>(); 1.78 + u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16); 1.79 + u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16); 1.80 + u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8); 1.81 + u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8); 1.82 + u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16); 1.83 + u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16); 1.84 + simd::Store8(&outputData[outputIndex], p1To4); 1.85 + if ((x + 4) * 4 < outputStride) { 1.86 + simd::Store8(&outputData[outputIndex + 4 * 4], p5To8); 1.87 + } 1.88 + if ((x + 8) * 4 < outputStride) { 1.89 + simd::Store8(&outputData[outputIndex + 4 * 8], p9To12); 1.90 + } 1.91 + if ((x + 12) * 4 < outputStride) { 1.92 + simd::Store8(&outputData[outputIndex + 4 * 12], p13To16); 1.93 + } 1.94 + } 1.95 + } 1.96 + break; 1.97 + default: 1.98 + output = nullptr; 1.99 + break; 1.100 + } 1.101 + return output; 1.102 +} 1.103 + 1.104 +template<typename u8x16_t> 1.105 +inline void 1.106 +ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride) 1.107 +{ 1.108 + for (int32_t y = 0; y < size.height; y++) { 1.109 + for (int32_t x = 0; x < size.width; x += 16) { 1.110 + // Process 16 pixels at a time. 1.111 + // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA. 1.112 + int32_t sourceIndex = y * sourceStride + 4 * x; 1.113 + int32_t targetIndex = y * alphaStride + x; 1.114 + 1.115 + u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>(); 1.116 + u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); 1.117 + u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); 1.118 + u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); 1.119 + 1.120 + bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 1.121 + if (4 * (x + 4) < sourceStride) { 1.122 + bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); 1.123 + } 1.124 + if (4 * (x + 8) < sourceStride) { 1.125 + bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); 1.126 + } 1.127 + if (4 * (x + 12) < sourceStride) { 1.128 + bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); 1.129 + } 1.130 + 1.131 + u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 1.132 + u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 1.133 + u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 1.134 + u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 1.135 + u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); 1.136 + u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); 1.137 + u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); 1.138 + u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); 1.139 + u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); 1.140 + u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); 1.141 + u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); 1.142 + 1.143 + simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa); 1.144 + } 1.145 + } 1.146 +} 1.147 + 1.148 +// This function calculates the result color values for four pixels, but for 1.149 +// only two color channels - either b & r or g & a. However, the a result will 1.150 +// not be used. 1.151 +// source and dest each contain 8 values, either bbbb gggg or rrrr aaaa. 1.152 +// sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the 1.153 +// alpha of all four pixels (and both aaaa's are the same). 1.154 +// blendendComponent1 and blendedComponent2 are the out parameters. 1.155 +template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode> 1.156 +inline void 1.157 +BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha, 1.158 + i16x8_t dest, const i16x8_t& destAlpha, 1.159 + i32x4_t& blendedComponent1, i32x4_t& blendedComponent2) 1.160 +{ 1.161 + i16x8_t x255 = simd::FromI16<i16x8_t>(255); 1.162 + 1.163 + switch (aBlendMode) { 1.164 + 1.165 + case BLEND_MODE_MULTIPLY: 1.166 + { 1.167 + // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest); 1.168 + i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); 1.169 + i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 1.170 + i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source); 1.171 + 1.172 + i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); 1.173 + i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); 1.174 + blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1); 1.175 + blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 1.176 + 1.177 + i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); 1.178 + i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); 1.179 + blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2); 1.180 + blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 1.181 + 1.182 + break; 1.183 + } 1.184 + 1.185 + case BLEND_MODE_SCREEN: 1.186 + { 1.187 + // val = 255 * (source + dest) + (0 - dest) * source; 1.188 + i16x8_t sourcePlusDest = simd::Add16(source, dest); 1.189 + i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest); 1.190 + 1.191 + i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest); 1.192 + i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source); 1.193 + blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1); 1.194 + blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 1.195 + 1.196 + i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest); 1.197 + i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source); 1.198 + blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2); 1.199 + blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 1.200 + 1.201 + break; 1.202 + } 1.203 + 1.204 + case BLEND_MODE_DARKEN: 1.205 + case BLEND_MODE_LIGHTEN: 1.206 + { 1.207 + // Darken: 1.208 + // val = min((255 - destAlpha) * source + 255 * dest, 1.209 + // 255 * source + (255 - sourceAlpha) * dest); 1.210 + // 1.211 + // Lighten: 1.212 + // val = max((255 - destAlpha) * source + 255 * dest, 1.213 + // 255 * source + (255 - sourceAlpha) * dest); 1.214 + 1.215 + i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); 1.216 + i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 1.217 + 1.218 + i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255); 1.219 + i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha); 1.220 + i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); 1.221 + i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1); 1.222 + i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1); 1.223 + blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2); 1.224 + blendedComponent1 = simd::FastDivideBy255(blendedComponent1); 1.225 + 1.226 + i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255); 1.227 + i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha); 1.228 + i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); 1.229 + i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2); 1.230 + i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2); 1.231 + blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2); 1.232 + blendedComponent2 = simd::FastDivideBy255(blendedComponent2); 1.233 + 1.234 + break; 1.235 + } 1.236 + 1.237 + } 1.238 +} 1.239 + 1.240 +// The alpha channel is subject to a different calculation than the RGB 1.241 +// channels, and this calculation is the same for all blend modes: 1.242 +// resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha) 1.243 +template<typename i16x8_t, typename i32x4_t> 1.244 +inline i32x4_t 1.245 +BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234) 1.246 +{ 1.247 + // We're using MulAdd16x8x2To32x4, so we need to interleave our factors 1.248 + // appropriately. The calculation is rewritten as follows: 1.249 + // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0]) 1.250 + // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) 1.251 + // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) 1.252 + // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0] 1.253 + i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234); 1.254 + i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234); 1.255 + i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha); 1.256 + i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255)); 1.257 + return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2)); 1.258 +} 1.259 + 1.260 +template<typename u8x16_t, typename i16x8_t> 1.261 +inline void 1.262 +UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234, 1.263 + i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234) 1.264 +{ 1.265 + // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234 1.266 + i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234); 1.267 + i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234); 1.268 + i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34); 1.269 + i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34); 1.270 + bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24); 1.271 + rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24); 1.272 +} 1.273 + 1.274 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t> 1.275 +inline u8x16_t 1.276 +ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234, 1.277 + i32x4_t rrrr1234, const i32x4_t& aaaa1234) 1.278 +{ 1.279 + // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234 1.280 + i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234); 1.281 + i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234); 1.282 + i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234); 1.283 + i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234); 1.284 + i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234); 1.285 + i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234); 1.286 + return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34); 1.287 +} 1.288 + 1.289 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode> 1.290 +inline TemporaryRef<DataSourceSurface> 1.291 +ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2) 1.292 +{ 1.293 + IntSize size = aInput1->GetSize(); 1.294 + RefPtr<DataSourceSurface> target = 1.295 + Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 1.296 + if (!target) { 1.297 + return nullptr; 1.298 + } 1.299 + 1.300 + uint8_t* source1Data = aInput1->GetData(); 1.301 + uint8_t* source2Data = aInput2->GetData(); 1.302 + uint8_t* targetData = target->GetData(); 1.303 + int32_t targetStride = target->Stride(); 1.304 + int32_t source1Stride = aInput1->Stride(); 1.305 + int32_t source2Stride = aInput2->Stride(); 1.306 + 1.307 + for (int32_t y = 0; y < size.height; y++) { 1.308 + for (int32_t x = 0; x < size.width; x += 4) { 1.309 + int32_t targetIndex = y * targetStride + 4 * x; 1.310 + int32_t source1Index = y * source1Stride + 4 * x; 1.311 + int32_t source2Index = y * source2Stride + 4 * x; 1.312 + 1.313 + u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]); 1.314 + u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]); 1.315 + 1.316 + // The blending calculation for the RGB channels all need access to the 1.317 + // alpha channel of their pixel, and the alpha calculation is different, 1.318 + // so it makes sense to separate by channel. 1.319 + 1.320 + i16x8_t s_bbbbgggg1234, s_rrrraaaa1234; 1.321 + i16x8_t d_bbbbgggg1234, d_rrrraaaa1234; 1.322 + UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234); 1.323 + UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234); 1.324 + i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234); 1.325 + i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234); 1.326 + 1.327 + // We only use blendedB, blendedG and blendedR. 1.328 + i32x4_t blendedB, blendedG, blendedR, blendedA; 1.329 + BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG); 1.330 + BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA); 1.331 + 1.332 + // Throw away blendedA and overwrite it with the correct blended alpha. 1.333 + blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234); 1.334 + 1.335 + u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA); 1.336 + simd::Store8(&targetData[targetIndex], result1234); 1.337 + } 1.338 + } 1.339 + 1.340 + return target; 1.341 +} 1.342 + 1.343 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t> 1.344 +static TemporaryRef<DataSourceSurface> 1.345 +ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, 1.346 + BlendMode aBlendMode) 1.347 +{ 1.348 + switch (aBlendMode) { 1.349 + case BLEND_MODE_MULTIPLY: 1.350 + return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2); 1.351 + case BLEND_MODE_SCREEN: 1.352 + return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2); 1.353 + case BLEND_MODE_DARKEN: 1.354 + return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2); 1.355 + case BLEND_MODE_LIGHTEN: 1.356 + return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2); 1.357 + default: 1.358 + return nullptr; 1.359 + } 1.360 +} 1.361 + 1.362 +template<MorphologyOperator Operator, typename u8x16_t> 1.363 +static u8x16_t 1.364 +Morph8(u8x16_t a, u8x16_t b) 1.365 +{ 1.366 + return Operator == MORPHOLOGY_OPERATOR_ERODE ? 1.367 + simd::Min8(a, b) : simd::Max8(a, b); 1.368 +} 1.369 + 1.370 +// Set every pixel to the per-component minimum or maximum of the pixels around 1.371 +// it that are up to aRadius pixels away from it (horizontally). 1.372 +template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> 1.373 +inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, 1.374 + uint8_t* aDestData, int32_t aDestStride, 1.375 + const IntRect& aDestRect, int32_t aRadius) 1.376 +{ 1.377 + static_assert(op == MORPHOLOGY_OPERATOR_ERODE || 1.378 + op == MORPHOLOGY_OPERATOR_DILATE, 1.379 + "unexpected morphology operator"); 1.380 + 1.381 + int32_t kernelSize = aRadius + 1 + aRadius; 1.382 + MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0"); 1.383 + MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3); 1.384 + int32_t completeKernelSizeForFourPixels = kernelSize + 3; 1.385 + MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 || 1.386 + completeKernelSizeForFourPixels % 4 == 2); 1.387 + 1.388 + // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just 1.389 + // the way we need them to be. 1.390 + 1.391 + IntRect sourceRect = aDestRect; 1.392 + sourceRect.Inflate(aRadius, 0); 1.393 + 1.394 + for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) { 1.395 + int32_t kernelStartX = aDestRect.x - aRadius; 1.396 + for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) { 1.397 + // We process four pixels (16 color values) at a time. 1.398 + // aSourceData[0] points to the pixel located at aDestRect.TopLeft(); 1.399 + // source values can be read beyond that because the source is extended 1.400 + // by aRadius pixels. 1.401 + 1.402 + int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX; 1.403 + u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 1.404 + u8x16_t m1234 = p1234; 1.405 + 1.406 + for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) { 1.407 + u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ? 1.408 + simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) : 1.409 + simd::FromZero8<u8x16_t>(); 1.410 + u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678); 1.411 + u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678); 1.412 + m1234 = Morph8<op,u8x16_t>(m1234, p2345); 1.413 + m1234 = Morph8<op,u8x16_t>(m1234, p3456); 1.414 + if (i + 2 < completeKernelSizeForFourPixels) { 1.415 + u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678); 1.416 + m1234 = Morph8<op,u8x16_t>(m1234, p4567); 1.417 + m1234 = Morph8<op,u8x16_t>(m1234, p5678); 1.418 + } 1.419 + p1234 = p5678; 1.420 + } 1.421 + 1.422 + int32_t destIndex = y * aDestStride + 4 * x; 1.423 + simd::Store8(&aDestData[destIndex], m1234); 1.424 + } 1.425 + } 1.426 +} 1.427 + 1.428 +template<typename i16x8_t, typename u8x16_t> 1.429 +inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, 1.430 + uint8_t* aDestData, int32_t aDestStride, 1.431 + const IntRect& aDestRect, int32_t aRadius, 1.432 + MorphologyOperator aOp) 1.433 +{ 1.434 + if (aOp == MORPHOLOGY_OPERATOR_ERODE) { 1.435 + ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( 1.436 + aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 1.437 + } else { 1.438 + ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( 1.439 + aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 1.440 + } 1.441 +} 1.442 + 1.443 +// Set every pixel to the per-component minimum or maximum of the pixels around 1.444 +// it that are up to aRadius pixels away from it (vertically). 1.445 +template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> 1.446 +static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, 1.447 + uint8_t* aDestData, int32_t aDestStride, 1.448 + const IntRect& aDestRect, int32_t aRadius) 1.449 +{ 1.450 + static_assert(op == MORPHOLOGY_OPERATOR_ERODE || 1.451 + op == MORPHOLOGY_OPERATOR_DILATE, 1.452 + "unexpected morphology operator"); 1.453 + 1.454 + int32_t startY = aDestRect.y - aRadius; 1.455 + int32_t endY = aDestRect.y + aRadius; 1.456 + for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) { 1.457 + for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) { 1.458 + int32_t sourceIndex = startY * aSourceStride + 4 * x; 1.459 + u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 1.460 + sourceIndex += aSourceStride; 1.461 + for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) { 1.462 + u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); 1.463 + u = Morph8<op,u8x16_t>(u, u2); 1.464 + } 1.465 + 1.466 + int32_t destIndex = y * aDestStride + 4 * x; 1.467 + simd::Store8(&aDestData[destIndex], u); 1.468 + } 1.469 + } 1.470 +} 1.471 + 1.472 +template<typename i16x8_t, typename u8x16_t> 1.473 +inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, 1.474 + uint8_t* aDestData, int32_t aDestStride, 1.475 + const IntRect& aDestRect, int32_t aRadius, 1.476 + MorphologyOperator aOp) 1.477 +{ 1.478 + if (aOp == MORPHOLOGY_OPERATOR_ERODE) { 1.479 + ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( 1.480 + aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 1.481 + } else { 1.482 + ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( 1.483 + aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); 1.484 + } 1.485 +} 1.486 + 1.487 +template<typename i32x4_t, typename i16x8_t> 1.488 +static i32x4_t 1.489 +ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias) 1.490 +{ 1.491 + // int16_t p[8] == { b, g, r, a, b, g, r, a }. 1.492 + // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }. 1.493 + // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }. 1.494 + // int32_t bias[4] == { _B, _G, _R, _A }. 1.495 + 1.496 + i32x4_t sum = bias; 1.497 + 1.498 + // int16_t bg[8] = { b, g, b, g, b, g, b, g }; 1.499 + i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p)); 1.500 + // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA } 1.501 + i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg); 1.502 + sum = simd::Add32(sum, prodsum_bg); 1.503 + 1.504 + // uint16_t ra[8] = { r, a, r, a, r, a, r, a }; 1.505 + i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p)); 1.506 + // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA } 1.507 + i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra); 1.508 + sum = simd::Add32(sum, prodsum_ra); 1.509 + 1.510 + // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }. 1.511 + return sum; 1.512 +} 1.513 + 1.514 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t> 1.515 +static TemporaryRef<DataSourceSurface> 1.516 +ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix) 1.517 +{ 1.518 + IntSize size = aInput->GetSize(); 1.519 + RefPtr<DataSourceSurface> target = 1.520 + Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 1.521 + if (!target) { 1.522 + return nullptr; 1.523 + } 1.524 + 1.525 + uint8_t* sourceData = aInput->GetData(); 1.526 + uint8_t* targetData = target->GetData(); 1.527 + int32_t sourceStride = aInput->Stride(); 1.528 + int32_t targetStride = target->Stride(); 1.529 + 1.530 + const int16_t factor = 128; 1.531 + const Float floatElementMax = INT16_MAX / factor; // 255 1.532 + MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale"); 1.533 + 1.534 + const Float *floats = &aMatrix._11; 1.535 + 1.536 + ptrdiff_t componentOffsets[4] = { 1.537 + B8G8R8A8_COMPONENT_BYTEOFFSET_R, 1.538 + B8G8R8A8_COMPONENT_BYTEOFFSET_G, 1.539 + B8G8R8A8_COMPONENT_BYTEOFFSET_B, 1.540 + B8G8R8A8_COMPONENT_BYTEOFFSET_A 1.541 + }; 1.542 + 1.543 + // We store the color matrix in rows_bgra in the following format: 1.544 + // { bB, bG, bR, bA, gB, gG, gR, gA }. 1.545 + // { bB, gB, bG, gG, bR, gR, bA, gA } 1.546 + // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16 1.547 + // which works especially well for our use case. 1.548 + int16_t rows_bgra[2][8]; 1.549 + for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) { 1.550 + for (size_t colIndex = 0; colIndex < 4; colIndex++) { 1.551 + const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; 1.552 + Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax); 1.553 + int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5); 1.554 + int8_t bg_or_ra = componentOffsets[rowIndex] / 2; 1.555 + int8_t g_or_a = componentOffsets[rowIndex] % 2; 1.556 + int8_t B_or_G_or_R_or_A = componentOffsets[colIndex]; 1.557 + rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement; 1.558 + } 1.559 + } 1.560 + 1.561 + int32_t rowBias[4]; 1.562 + Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255); 1.563 + for (size_t colIndex = 0; colIndex < 4; colIndex++) { 1.564 + size_t rowIndex = 4; 1.565 + const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; 1.566 + Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax); 1.567 + int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5); 1.568 + rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement; 1.569 + } 1.570 + 1.571 + i16x8_t row_bg_v = simd::FromI16<i16x8_t>( 1.572 + rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3], 1.573 + rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]); 1.574 + 1.575 + i16x8_t row_ra_v = simd::FromI16<i16x8_t>( 1.576 + rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3], 1.577 + rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]); 1.578 + 1.579 + i32x4_t rowsBias_v = 1.580 + simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]); 1.581 + 1.582 + for (int32_t y = 0; y < size.height; y++) { 1.583 + for (int32_t x = 0; x < size.width; x += 4) { 1.584 + MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position"); 1.585 + MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position"); 1.586 + int32_t sourceIndex = y * sourceStride + 4 * x; 1.587 + int32_t targetIndex = y * targetStride + 4 * x; 1.588 + 1.589 + // We load 4 pixels, unpack them, process them 1 pixel at a time, and 1.590 + // finally pack and store the 4 result pixels. 1.591 + 1.592 + u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 1.593 + 1.594 + // Splat needed to get each pixel twice into i16x8 1.595 + i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234)); 1.596 + i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234)); 1.597 + i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234)); 1.598 + i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234)); 1.599 + 1.600 + i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v); 1.601 + i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v); 1.602 + i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v); 1.603 + i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v); 1.604 + 1.605 + static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor."); 1.606 + u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1), 1.607 + simd::ShiftRight32<7>(result_p2), 1.608 + simd::ShiftRight32<7>(result_p3), 1.609 + simd::ShiftRight32<7>(result_p4)); 1.610 + simd::Store8(&targetData[targetIndex], result_p1234); 1.611 + } 1.612 + } 1.613 + 1.614 + return target; 1.615 +} 1.616 + 1.617 +// source / dest: bgra bgra 1.618 +// sourceAlpha / destAlpha: aaaa aaaa 1.619 +// result: bgra bgra 1.620 +template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator> 1.621 +static inline u16x8_t 1.622 +CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha) 1.623 +{ 1.624 + u16x8_t x255 = simd::FromU16<u16x8_t>(255); 1.625 + 1.626 + switch (aCompositeOperator) { 1.627 + 1.628 + case COMPOSITE_OPERATOR_OVER: 1.629 + { 1.630 + // val = dest * (255 - sourceAlpha) + source * 255; 1.631 + u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 1.632 + 1.633 + u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); 1.634 + u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255); 1.635 + i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); 1.636 + 1.637 + u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); 1.638 + u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255); 1.639 + i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); 1.640 + 1.641 + return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), 1.642 + simd::FastDivideBy255(result2)); 1.643 + } 1.644 + 1.645 + case COMPOSITE_OPERATOR_IN: 1.646 + { 1.647 + // val = source * destAlpha; 1.648 + return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha)); 1.649 + } 1.650 + 1.651 + case COMPOSITE_OPERATOR_OUT: 1.652 + { 1.653 + // val = source * (255 - destAlpha); 1.654 + u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha)); 1.655 + return simd::FastDivideBy255_16(prod); 1.656 + } 1.657 + 1.658 + case COMPOSITE_OPERATOR_ATOP: 1.659 + { 1.660 + // val = dest * (255 - sourceAlpha) + source * destAlpha; 1.661 + u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 1.662 + 1.663 + u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); 1.664 + u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha); 1.665 + i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); 1.666 + 1.667 + u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); 1.668 + u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha); 1.669 + i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); 1.670 + 1.671 + return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), 1.672 + simd::FastDivideBy255(result2)); 1.673 + } 1.674 + 1.675 + case COMPOSITE_OPERATOR_XOR: 1.676 + { 1.677 + // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha); 1.678 + u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); 1.679 + u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); 1.680 + 1.681 + u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); 1.682 + u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, 1.683 + twoFiftyFiveMinusDestAlpha); 1.684 + i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); 1.685 + 1.686 + u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); 1.687 + u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, 1.688 + twoFiftyFiveMinusDestAlpha); 1.689 + i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); 1.690 + 1.691 + return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), 1.692 + simd::FastDivideBy255(result2)); 1.693 + } 1.694 + 1.695 + default: 1.696 + return simd::FromU16<u16x8_t>(0); 1.697 + 1.698 + } 1.699 +} 1.700 + 1.701 +template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op> 1.702 +static void 1.703 +ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest) 1.704 +{ 1.705 + IntSize size = aDest->GetSize(); 1.706 + 1.707 + uint8_t* sourceData = aSource->GetData(); 1.708 + uint8_t* destData = aDest->GetData(); 1.709 + uint32_t sourceStride = aSource->Stride(); 1.710 + uint32_t destStride = aDest->Stride(); 1.711 + 1.712 + for (int32_t y = 0; y < size.height; y++) { 1.713 + for (int32_t x = 0; x < size.width; x += 4) { 1.714 + uint32_t sourceIndex = y * sourceStride + 4 * x; 1.715 + uint32_t destIndex = y * destStride + 4 * x; 1.716 + 1.717 + u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 1.718 + u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]); 1.719 + 1.720 + u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234); 1.721 + u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234); 1.722 + u16x8_t sa12 = simd::Splat16<3,3>(s12); 1.723 + u16x8_t da12 = simd::Splat16<3,3>(d12); 1.724 + u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12); 1.725 + 1.726 + u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234); 1.727 + u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234); 1.728 + u16x8_t sa34 = simd::Splat16<3,3>(s34); 1.729 + u16x8_t da34 = simd::Splat16<3,3>(d34); 1.730 + u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34); 1.731 + 1.732 + u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34); 1.733 + simd::Store8(&destData[destIndex], result1234); 1.734 + } 1.735 + } 1.736 +} 1.737 + 1.738 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t> 1.739 +static void 1.740 +ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest, 1.741 + CompositeOperator aOperator) 1.742 +{ 1.743 + switch (aOperator) { 1.744 + case COMPOSITE_OPERATOR_OVER: 1.745 + ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest); 1.746 + break; 1.747 + case COMPOSITE_OPERATOR_IN: 1.748 + ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest); 1.749 + break; 1.750 + case COMPOSITE_OPERATOR_OUT: 1.751 + ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest); 1.752 + break; 1.753 + case COMPOSITE_OPERATOR_ATOP: 1.754 + ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest); 1.755 + break; 1.756 + case COMPOSITE_OPERATOR_XOR: 1.757 + ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest); 1.758 + break; 1.759 + default: 1.760 + MOZ_CRASH(); 1.761 + } 1.762 +} 1.763 + 1.764 +template<typename u8x16_t> 1.765 +static void 1.766 +SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, 1.767 + uint8_t* channel0Data, uint8_t* channel1Data, 1.768 + uint8_t* channel2Data, uint8_t* channel3Data, 1.769 + int32_t channelStride) 1.770 +{ 1.771 + for (int32_t y = 0; y < size.height; y++) { 1.772 + for (int32_t x = 0; x < size.width; x += 16) { 1.773 + // Process 16 pixels at a time. 1.774 + int32_t sourceIndex = y * sourceStride + 4 * x; 1.775 + int32_t targetIndex = y * channelStride + x; 1.776 + 1.777 + u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>(); 1.778 + u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); 1.779 + u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); 1.780 + u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); 1.781 + 1.782 + bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); 1.783 + if (4 * (x + 4) < sourceStride) { 1.784 + bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); 1.785 + } 1.786 + if (4 * (x + 8) < sourceStride) { 1.787 + bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); 1.788 + } 1.789 + if (4 * (x + 12) < sourceStride) { 1.790 + bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); 1.791 + } 1.792 + 1.793 + u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 1.794 + u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); 1.795 + u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 1.796 + u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); 1.797 + u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); 1.798 + u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); 1.799 + u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); 1.800 + u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); 1.801 + u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); 1.802 + u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); 1.803 + u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); 1.804 + u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); 1.805 + u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2); 1.806 + u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2); 1.807 + u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); 1.808 + u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); 1.809 + 1.810 + simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb); 1.811 + simd::Store8(&channel1Data[targetIndex], gggggggggggggggg); 1.812 + simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr); 1.813 + simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa); 1.814 + } 1.815 + } 1.816 +} 1.817 + 1.818 +template<typename u8x16_t> 1.819 +static void 1.820 +CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data) 1.821 +{ 1.822 + for (int32_t y = 0; y < size.height; y++) { 1.823 + for (int32_t x = 0; x < size.width; x += 16) { 1.824 + // Process 16 pixels at a time. 1.825 + int32_t resultIndex = y * resultStride + 4 * x; 1.826 + int32_t channelIndex = y * channelStride + x; 1.827 + 1.828 + u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]); 1.829 + u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]); 1.830 + u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]); 1.831 + u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]); 1.832 + 1.833 + u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr); 1.834 + u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr); 1.835 + u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa); 1.836 + u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa); 1.837 + 1.838 + u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1); 1.839 + u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1); 1.840 + u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2); 1.841 + u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2); 1.842 + 1.843 + simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1); 1.844 + if (4 * (x + 4) < resultStride) { 1.845 + simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2); 1.846 + } 1.847 + if (4 * (x + 8) < resultStride) { 1.848 + simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3); 1.849 + } 1.850 + if (4 * (x + 12) < resultStride) { 1.851 + simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4); 1.852 + } 1.853 + } 1.854 + } 1.855 +} 1.856 + 1.857 + 1.858 +template<typename i32x4_t, typename u16x8_t, typename u8x16_t> 1.859 +static void 1.860 +DoPremultiplicationCalculation_SIMD(const IntSize& aSize, 1.861 + uint8_t* aTargetData, int32_t aTargetStride, 1.862 + uint8_t* aSourceData, int32_t aSourceStride) 1.863 +{ 1.864 + const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff); 1.865 + for (int32_t y = 0; y < aSize.height; y++) { 1.866 + for (int32_t x = 0; x < aSize.width; x += 4) { 1.867 + int32_t inputIndex = y * aSourceStride + 4 * x; 1.868 + int32_t targetIndex = y * aTargetStride + 4 * x; 1.869 + 1.870 + u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]); 1.871 + u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234); 1.872 + u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234); 1.873 + 1.874 + // Multiply all components with alpha. 1.875 + p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12)); 1.876 + p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34)); 1.877 + 1.878 + // Divide by 255 and pack. 1.879 + u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12), 1.880 + simd::FastDivideBy255_16(p34)); 1.881 + 1.882 + // Get the original alpha channel value back from p1234. 1.883 + result = simd::Pick(alphaMask, result, p1234); 1.884 + 1.885 + simd::Store8(&aTargetData[targetIndex], result); 1.886 + } 1.887 + } 1.888 +} 1.889 + 1.890 +// We use a table of precomputed factors for unpremultiplying. 1.891 +// We want to compute round(r / (alpha / 255.0f)) for arbitrary values of 1.892 +// r and alpha in constant time. This table of factors has the property that 1.893 +// (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives the result we want (with 1.894 +// a maximum deviation of 1). 1.895 +// 1.896 +// sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha) 1.897 +// 1.898 +// This table has been created using the python code 1.899 +// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256)) 1.900 +static const uint16_t sAlphaFactors[256] = { 1.901 + 0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935, 1.902 + 5440, 5022, 4663, 4352, 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720, 1.903 + 2611, 2511, 2418, 2331, 2251, 2176, 2106, 2040, 1978, 1920, 1865, 1813, 1764, 1.904 + 1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, 1360, 1332, 1306, 1.905 + 1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036, 1.906 + 1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837, 1.907 + 826, 816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702, 1.908 + 694, 687, 680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604, 1.909 + 599, 593, 588, 583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531, 1.910 + 526, 522, 518, 514, 510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473, 1.911 + 470, 466, 463, 460, 457, 453, 450, 447, 444, 441, 438, 435, 432, 429, 427, 1.912 + 424, 421, 418, 416, 413, 411, 408, 405, 403, 400, 398, 396, 393, 391, 389, 1.913 + 386, 384, 382, 380, 377, 375, 373, 371, 369, 367, 365, 363, 361, 359, 357, 1.914 + 355, 353, 351, 349, 347, 345, 344, 342, 340, 338, 336, 335, 333, 331, 330, 1.915 + 328, 326, 325, 323, 322, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306, 1.916 + 305, 304, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 288, 286, 1.917 + 285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269, 1.918 + 268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256 1.919 +}; 1.920 + 1.921 +template<typename u16x8_t, typename u8x16_t> 1.922 +static void 1.923 +DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize, 1.924 + uint8_t* aTargetData, int32_t aTargetStride, 1.925 + uint8_t* aSourceData, int32_t aSourceStride) 1.926 +{ 1.927 + for (int32_t y = 0; y < aSize.height; y++) { 1.928 + for (int32_t x = 0; x < aSize.width; x += 4) { 1.929 + int32_t inputIndex = y * aSourceStride + 4 * x; 1.930 + int32_t targetIndex = y * aTargetStride + 4 * x; 1.931 + union { 1.932 + u8x16_t p1234; 1.933 + uint8_t u8[4][4]; 1.934 + }; 1.935 + p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]); 1.936 + 1.937 + // Prepare the alpha factors. 1.938 + uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; 1.939 + uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; 1.940 + uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; 1.941 + uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; 1.942 + u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8); 1.943 + u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8); 1.944 + 1.945 + u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234); 1.946 + u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234); 1.947 + 1.948 + // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits. 1.949 + p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128))); 1.950 + p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128))); 1.951 + 1.952 + u8x16_t result = simd::PackAndSaturate16To8(p12, p34); 1.953 + simd::Store8(&aTargetData[targetIndex], result); 1.954 + } 1.955 + } 1.956 +} 1.957 + 1.958 +template<typename f32x4_t, typename i32x4_t, typename u8x16_t> 1.959 +static TemporaryRef<DataSourceSurface> 1.960 +RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency, 1.961 + int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect) 1.962 +{ 1.963 +#define RETURN_TURBULENCE(Type, Stitch) \ 1.964 + SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \ 1.965 + renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \ 1.966 + return renderer.Render(aSize, aOffset); 1.967 + 1.968 + switch (aType) { 1.969 + case TURBULENCE_TYPE_TURBULENCE: 1.970 + { 1.971 + if (aStitch) { 1.972 + RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true); 1.973 + } 1.974 + RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false); 1.975 + } 1.976 + case TURBULENCE_TYPE_FRACTAL_NOISE: 1.977 + { 1.978 + if (aStitch) { 1.979 + RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true); 1.980 + } 1.981 + RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false); 1.982 + } 1.983 + } 1.984 + return nullptr; 1.985 +#undef RETURN_TURBULENCE 1.986 +} 1.987 + 1.988 +// k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 1.989 +template<typename i32x4_t, typename i16x8_t> 1.990 +static MOZ_ALWAYS_INLINE i16x8_t 1.991 +ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2, 1.992 + const i16x8_t &k1And4, const i16x8_t &k2And3) 1.993 +{ 1.994 + // Calculate input product: inProd = (in1 * in2) / 255. 1.995 + i32x4_t inProd_1, inProd_2; 1.996 + simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2); 1.997 + i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2)); 1.998 + 1.999 + // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128 1.1000 + i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128); 1.1001 + i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight); 1.1002 + i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight); 1.1003 + i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight); 1.1004 + i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight); 1.1005 + 1.1006 + // Calculate k2 * in1 + k3 * in2 1.1007 + i16x8_t in12_1 = simd::InterleaveLo16(in1, in2); 1.1008 + i16x8_t in12_2 = simd::InterleaveHi16(in1, in2); 1.1009 + i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1); 1.1010 + i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2); 1.1011 + 1.1012 + // Sum everything up and truncate the fractional part. 1.1013 + i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1)); 1.1014 + i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2)); 1.1015 + return simd::PackAndSaturate32To16(result_1, result_2); 1.1016 +} 1.1017 + 1.1018 +template<typename i32x4_t, typename i16x8_t, typename u8x16_t> 1.1019 +static TemporaryRef<DataSourceSurface> 1.1020 +ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, 1.1021 + Float aK1, Float aK2, Float aK3, Float aK4) 1.1022 +{ 1.1023 + IntSize size = aInput1->GetSize(); 1.1024 + RefPtr<DataSourceSurface> target = 1.1025 + Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); 1.1026 + if (!target) { 1.1027 + return nullptr; 1.1028 + } 1.1029 + 1.1030 + uint8_t* source1Data = aInput1->GetData(); 1.1031 + uint8_t* source2Data = aInput2->GetData(); 1.1032 + uint8_t* targetData = target->GetData(); 1.1033 + uint32_t source1Stride = aInput1->Stride(); 1.1034 + uint32_t source2Stride = aInput2->Stride(); 1.1035 + uint32_t targetStride = target->Stride(); 1.1036 + 1.1037 + // The arithmetic combine filter does the following calculation: 1.1038 + // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 1.1039 + // 1.1040 + // Or, with in1/2 integers between 0 and 255: 1.1041 + // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255 1.1042 + // 1.1043 + // We want the whole calculation to happen in integer, with 16-bit factors. 1.1044 + // So we convert our factors to fixed-point with precision 1.8.7. 1.1045 + // K4 is premultiplied with 255, and it will be multiplied with 128 later 1.1046 + // during the actual calculation, because premultiplying it with 255 * 128 1.1047 + // would overflow int16. 1.1048 + 1.1049 + i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f))); 1.1050 + i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f))); 1.1051 + i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f))); 1.1052 + i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f))); 1.1053 + 1.1054 + i16x8_t k1And4 = simd::InterleaveLo16(k1, k4); 1.1055 + i16x8_t k2And3 = simd::InterleaveLo16(k2, k3); 1.1056 + 1.1057 + for (int32_t y = 0; y < size.height; y++) { 1.1058 + for (int32_t x = 0; x < size.width; x += 4) { 1.1059 + uint32_t source1Index = y * source1Stride + 4 * x; 1.1060 + uint32_t source2Index = y * source2Stride + 4 * x; 1.1061 + uint32_t targetIndex = y * targetStride + 4 * x; 1.1062 + 1.1063 + // Load and unpack. 1.1064 + u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]); 1.1065 + u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]); 1.1066 + i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1); 1.1067 + i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1); 1.1068 + i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2); 1.1069 + i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2); 1.1070 + 1.1071 + // Multiply and add. 1.1072 + i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3); 1.1073 + i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3); 1.1074 + 1.1075 + // Pack and store. 1.1076 + simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34)); 1.1077 + } 1.1078 + } 1.1079 + 1.1080 + return target; 1.1081 +} 1.1082 + 1.1083 +} // namespace mozilla 1.1084 +} // namespace gfx