Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate.
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- |
michael@0 | 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "FilterProcessing.h" |
michael@0 | 7 | |
michael@0 | 8 | #include "SIMD.h" |
michael@0 | 9 | #include "SVGTurbulenceRenderer-inl.h" |
michael@0 | 10 | |
michael@0 | 11 | namespace mozilla { |
michael@0 | 12 | namespace gfx { |
michael@0 | 13 | |
michael@0 | 14 | template<typename u8x16_t> |
michael@0 | 15 | inline TemporaryRef<DataSourceSurface> |
michael@0 | 16 | ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface) |
michael@0 | 17 | { |
michael@0 | 18 | IntSize size = aSurface->GetSize(); |
michael@0 | 19 | RefPtr<DataSourceSurface> input = aSurface->GetDataSurface(); |
michael@0 | 20 | RefPtr<DataSourceSurface> output = |
michael@0 | 21 | Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
michael@0 | 22 | uint8_t *inputData = input->GetData(); |
michael@0 | 23 | uint8_t *outputData = output->GetData(); |
michael@0 | 24 | int32_t inputStride = input->Stride(); |
michael@0 | 25 | int32_t outputStride = output->Stride(); |
michael@0 | 26 | switch (input->GetFormat()) { |
michael@0 | 27 | case SurfaceFormat::B8G8R8A8: |
michael@0 | 28 | output = input; |
michael@0 | 29 | break; |
michael@0 | 30 | case SurfaceFormat::B8G8R8X8: |
michael@0 | 31 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 32 | for (int32_t x = 0; x < size.width; x++) { |
michael@0 | 33 | int32_t inputIndex = y * inputStride + 4 * x; |
michael@0 | 34 | int32_t outputIndex = y * outputStride + 4 * x; |
michael@0 | 35 | outputData[outputIndex + 0] = inputData[inputIndex + 0]; |
michael@0 | 36 | outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
michael@0 | 37 | outputData[outputIndex + 2] = inputData[inputIndex + 2]; |
michael@0 | 38 | outputData[outputIndex + 3] = 255; |
michael@0 | 39 | } |
michael@0 | 40 | } |
michael@0 | 41 | break; |
michael@0 | 42 | case SurfaceFormat::R8G8B8A8: |
michael@0 | 43 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 44 | for (int32_t x = 0; x < size.width; x++) { |
michael@0 | 45 | int32_t inputIndex = y * inputStride + 4 * x; |
michael@0 | 46 | int32_t outputIndex = y * outputStride + 4 * x; |
michael@0 | 47 | outputData[outputIndex + 2] = inputData[inputIndex + 0]; |
michael@0 | 48 | outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
michael@0 | 49 | outputData[outputIndex + 0] = inputData[inputIndex + 2]; |
michael@0 | 50 | outputData[outputIndex + 3] = inputData[inputIndex + 3]; |
michael@0 | 51 | } |
michael@0 | 52 | } |
michael@0 | 53 | break; |
michael@0 | 54 | case SurfaceFormat::R8G8B8X8: |
michael@0 | 55 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 56 | for (int32_t x = 0; x < size.width; x++) { |
michael@0 | 57 | int32_t inputIndex = y * inputStride + 4 * x; |
michael@0 | 58 | int32_t outputIndex = y * outputStride + 4 * x; |
michael@0 | 59 | outputData[outputIndex + 2] = inputData[inputIndex + 0]; |
michael@0 | 60 | outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
michael@0 | 61 | outputData[outputIndex + 0] = inputData[inputIndex + 2]; |
michael@0 | 62 | outputData[outputIndex + 3] = 255; |
michael@0 | 63 | } |
michael@0 | 64 | } |
michael@0 | 65 | break; |
michael@0 | 66 | case SurfaceFormat::A8: |
michael@0 | 67 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 68 | for (int32_t x = 0; x < size.width; x += 16) { |
michael@0 | 69 | int32_t inputIndex = y * inputStride + x; |
michael@0 | 70 | int32_t outputIndex = y * outputStride + 4 * x; |
michael@0 | 71 | u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]); |
michael@0 | 72 | // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by |
michael@0 | 73 | // interleaving with 0000000000000000 twice. |
michael@0 | 74 | u8x16_t zero = simd::FromZero8<u8x16_t>(); |
michael@0 | 75 | u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16); |
michael@0 | 76 | u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16); |
michael@0 | 77 | u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8); |
michael@0 | 78 | u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8); |
michael@0 | 79 | u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16); |
michael@0 | 80 | u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16); |
michael@0 | 81 | simd::Store8(&outputData[outputIndex], p1To4); |
michael@0 | 82 | if ((x + 4) * 4 < outputStride) { |
michael@0 | 83 | simd::Store8(&outputData[outputIndex + 4 * 4], p5To8); |
michael@0 | 84 | } |
michael@0 | 85 | if ((x + 8) * 4 < outputStride) { |
michael@0 | 86 | simd::Store8(&outputData[outputIndex + 4 * 8], p9To12); |
michael@0 | 87 | } |
michael@0 | 88 | if ((x + 12) * 4 < outputStride) { |
michael@0 | 89 | simd::Store8(&outputData[outputIndex + 4 * 12], p13To16); |
michael@0 | 90 | } |
michael@0 | 91 | } |
michael@0 | 92 | } |
michael@0 | 93 | break; |
michael@0 | 94 | default: |
michael@0 | 95 | output = nullptr; |
michael@0 | 96 | break; |
michael@0 | 97 | } |
michael@0 | 98 | return output; |
michael@0 | 99 | } |
michael@0 | 100 | |
michael@0 | 101 | template<typename u8x16_t> |
michael@0 | 102 | inline void |
michael@0 | 103 | ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride) |
michael@0 | 104 | { |
michael@0 | 105 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 106 | for (int32_t x = 0; x < size.width; x += 16) { |
michael@0 | 107 | // Process 16 pixels at a time. |
michael@0 | 108 | // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA. |
michael@0 | 109 | int32_t sourceIndex = y * sourceStride + 4 * x; |
michael@0 | 110 | int32_t targetIndex = y * alphaStride + x; |
michael@0 | 111 | |
michael@0 | 112 | u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>(); |
michael@0 | 113 | u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); |
michael@0 | 114 | u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); |
michael@0 | 115 | u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); |
michael@0 | 116 | |
michael@0 | 117 | bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); |
michael@0 | 118 | if (4 * (x + 4) < sourceStride) { |
michael@0 | 119 | bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); |
michael@0 | 120 | } |
michael@0 | 121 | if (4 * (x + 8) < sourceStride) { |
michael@0 | 122 | bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); |
michael@0 | 123 | } |
michael@0 | 124 | if (4 * (x + 12) < sourceStride) { |
michael@0 | 125 | bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); |
michael@0 | 126 | } |
michael@0 | 127 | |
michael@0 | 128 | u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
michael@0 | 129 | u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
michael@0 | 130 | u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
michael@0 | 131 | u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
michael@0 | 132 | u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
michael@0 | 133 | u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
michael@0 | 134 | u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
michael@0 | 135 | u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
michael@0 | 136 | u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); |
michael@0 | 137 | u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); |
michael@0 | 138 | u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); |
michael@0 | 139 | |
michael@0 | 140 | simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa); |
michael@0 | 141 | } |
michael@0 | 142 | } |
michael@0 | 143 | } |
michael@0 | 144 | |
michael@0 | 145 | // This function calculates the result color values for four pixels, but for |
michael@0 | 146 | // only two color channels - either b & r or g & a. However, the a result will |
michael@0 | 147 | // not be used. |
michael@0 | 148 | // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa. |
michael@0 | 149 | // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the |
michael@0 | 150 | // alpha of all four pixels (and both aaaa's are the same). |
michael@0 | 151 | // blendendComponent1 and blendedComponent2 are the out parameters. |
michael@0 | 152 | template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode> |
michael@0 | 153 | inline void |
michael@0 | 154 | BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha, |
michael@0 | 155 | i16x8_t dest, const i16x8_t& destAlpha, |
michael@0 | 156 | i32x4_t& blendedComponent1, i32x4_t& blendedComponent2) |
michael@0 | 157 | { |
michael@0 | 158 | i16x8_t x255 = simd::FromI16<i16x8_t>(255); |
michael@0 | 159 | |
michael@0 | 160 | switch (aBlendMode) { |
michael@0 | 161 | |
michael@0 | 162 | case BLEND_MODE_MULTIPLY: |
michael@0 | 163 | { |
michael@0 | 164 | // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest); |
michael@0 | 165 | i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); |
michael@0 | 166 | i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
michael@0 | 167 | i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source); |
michael@0 | 168 | |
michael@0 | 169 | i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); |
michael@0 | 170 | i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); |
michael@0 | 171 | blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1); |
michael@0 | 172 | blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
michael@0 | 173 | |
michael@0 | 174 | i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); |
michael@0 | 175 | i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); |
michael@0 | 176 | blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2); |
michael@0 | 177 | blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
michael@0 | 178 | |
michael@0 | 179 | break; |
michael@0 | 180 | } |
michael@0 | 181 | |
michael@0 | 182 | case BLEND_MODE_SCREEN: |
michael@0 | 183 | { |
michael@0 | 184 | // val = 255 * (source + dest) + (0 - dest) * source; |
michael@0 | 185 | i16x8_t sourcePlusDest = simd::Add16(source, dest); |
michael@0 | 186 | i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest); |
michael@0 | 187 | |
michael@0 | 188 | i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest); |
michael@0 | 189 | i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source); |
michael@0 | 190 | blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1); |
michael@0 | 191 | blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
michael@0 | 192 | |
michael@0 | 193 | i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest); |
michael@0 | 194 | i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source); |
michael@0 | 195 | blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2); |
michael@0 | 196 | blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
michael@0 | 197 | |
michael@0 | 198 | break; |
michael@0 | 199 | } |
michael@0 | 200 | |
michael@0 | 201 | case BLEND_MODE_DARKEN: |
michael@0 | 202 | case BLEND_MODE_LIGHTEN: |
michael@0 | 203 | { |
michael@0 | 204 | // Darken: |
michael@0 | 205 | // val = min((255 - destAlpha) * source + 255 * dest, |
michael@0 | 206 | // 255 * source + (255 - sourceAlpha) * dest); |
michael@0 | 207 | // |
michael@0 | 208 | // Lighten: |
michael@0 | 209 | // val = max((255 - destAlpha) * source + 255 * dest, |
michael@0 | 210 | // 255 * source + (255 - sourceAlpha) * dest); |
michael@0 | 211 | |
michael@0 | 212 | i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); |
michael@0 | 213 | i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
michael@0 | 214 | |
michael@0 | 215 | i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255); |
michael@0 | 216 | i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha); |
michael@0 | 217 | i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); |
michael@0 | 218 | i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1); |
michael@0 | 219 | i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1); |
michael@0 | 220 | blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2); |
michael@0 | 221 | blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
michael@0 | 222 | |
michael@0 | 223 | i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255); |
michael@0 | 224 | i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha); |
michael@0 | 225 | i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); |
michael@0 | 226 | i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2); |
michael@0 | 227 | i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2); |
michael@0 | 228 | blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2); |
michael@0 | 229 | blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
michael@0 | 230 | |
michael@0 | 231 | break; |
michael@0 | 232 | } |
michael@0 | 233 | |
michael@0 | 234 | } |
michael@0 | 235 | } |
michael@0 | 236 | |
michael@0 | 237 | // The alpha channel is subject to a different calculation than the RGB |
michael@0 | 238 | // channels, and this calculation is the same for all blend modes: |
michael@0 | 239 | // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha) |
michael@0 | 240 | template<typename i16x8_t, typename i32x4_t> |
michael@0 | 241 | inline i32x4_t |
michael@0 | 242 | BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234) |
michael@0 | 243 | { |
michael@0 | 244 | // We're using MulAdd16x8x2To32x4, so we need to interleave our factors |
michael@0 | 245 | // appropriately. The calculation is rewritten as follows: |
michael@0 | 246 | // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0]) |
michael@0 | 247 | // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) |
michael@0 | 248 | // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) |
michael@0 | 249 | // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0] |
michael@0 | 250 | i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234); |
michael@0 | 251 | i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234); |
michael@0 | 252 | i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha); |
michael@0 | 253 | i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255)); |
michael@0 | 254 | return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2)); |
michael@0 | 255 | } |
michael@0 | 256 | |
michael@0 | 257 | template<typename u8x16_t, typename i16x8_t> |
michael@0 | 258 | inline void |
michael@0 | 259 | UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234, |
michael@0 | 260 | i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234) |
michael@0 | 261 | { |
michael@0 | 262 | // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234 |
michael@0 | 263 | i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234); |
michael@0 | 264 | i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234); |
michael@0 | 265 | i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34); |
michael@0 | 266 | i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34); |
michael@0 | 267 | bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24); |
michael@0 | 268 | rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24); |
michael@0 | 269 | } |
michael@0 | 270 | |
michael@0 | 271 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
michael@0 | 272 | inline u8x16_t |
michael@0 | 273 | ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234, |
michael@0 | 274 | i32x4_t rrrr1234, const i32x4_t& aaaa1234) |
michael@0 | 275 | { |
michael@0 | 276 | // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234 |
michael@0 | 277 | i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234); |
michael@0 | 278 | i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234); |
michael@0 | 279 | i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234); |
michael@0 | 280 | i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234); |
michael@0 | 281 | i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234); |
michael@0 | 282 | i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234); |
michael@0 | 283 | return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34); |
michael@0 | 284 | } |
michael@0 | 285 | |
michael@0 | 286 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode> |
michael@0 | 287 | inline TemporaryRef<DataSourceSurface> |
michael@0 | 288 | ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2) |
michael@0 | 289 | { |
michael@0 | 290 | IntSize size = aInput1->GetSize(); |
michael@0 | 291 | RefPtr<DataSourceSurface> target = |
michael@0 | 292 | Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
michael@0 | 293 | if (!target) { |
michael@0 | 294 | return nullptr; |
michael@0 | 295 | } |
michael@0 | 296 | |
michael@0 | 297 | uint8_t* source1Data = aInput1->GetData(); |
michael@0 | 298 | uint8_t* source2Data = aInput2->GetData(); |
michael@0 | 299 | uint8_t* targetData = target->GetData(); |
michael@0 | 300 | int32_t targetStride = target->Stride(); |
michael@0 | 301 | int32_t source1Stride = aInput1->Stride(); |
michael@0 | 302 | int32_t source2Stride = aInput2->Stride(); |
michael@0 | 303 | |
michael@0 | 304 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 305 | for (int32_t x = 0; x < size.width; x += 4) { |
michael@0 | 306 | int32_t targetIndex = y * targetStride + 4 * x; |
michael@0 | 307 | int32_t source1Index = y * source1Stride + 4 * x; |
michael@0 | 308 | int32_t source2Index = y * source2Stride + 4 * x; |
michael@0 | 309 | |
michael@0 | 310 | u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]); |
michael@0 | 311 | u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]); |
michael@0 | 312 | |
michael@0 | 313 | // The blending calculation for the RGB channels all need access to the |
michael@0 | 314 | // alpha channel of their pixel, and the alpha calculation is different, |
michael@0 | 315 | // so it makes sense to separate by channel. |
michael@0 | 316 | |
michael@0 | 317 | i16x8_t s_bbbbgggg1234, s_rrrraaaa1234; |
michael@0 | 318 | i16x8_t d_bbbbgggg1234, d_rrrraaaa1234; |
michael@0 | 319 | UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234); |
michael@0 | 320 | UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234); |
michael@0 | 321 | i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234); |
michael@0 | 322 | i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234); |
michael@0 | 323 | |
michael@0 | 324 | // We only use blendedB, blendedG and blendedR. |
michael@0 | 325 | i32x4_t blendedB, blendedG, blendedR, blendedA; |
michael@0 | 326 | BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG); |
michael@0 | 327 | BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA); |
michael@0 | 328 | |
michael@0 | 329 | // Throw away blendedA and overwrite it with the correct blended alpha. |
michael@0 | 330 | blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234); |
michael@0 | 331 | |
michael@0 | 332 | u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA); |
michael@0 | 333 | simd::Store8(&targetData[targetIndex], result1234); |
michael@0 | 334 | } |
michael@0 | 335 | } |
michael@0 | 336 | |
michael@0 | 337 | return target; |
michael@0 | 338 | } |
michael@0 | 339 | |
michael@0 | 340 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
michael@0 | 341 | static TemporaryRef<DataSourceSurface> |
michael@0 | 342 | ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, |
michael@0 | 343 | BlendMode aBlendMode) |
michael@0 | 344 | { |
michael@0 | 345 | switch (aBlendMode) { |
michael@0 | 346 | case BLEND_MODE_MULTIPLY: |
michael@0 | 347 | return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2); |
michael@0 | 348 | case BLEND_MODE_SCREEN: |
michael@0 | 349 | return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2); |
michael@0 | 350 | case BLEND_MODE_DARKEN: |
michael@0 | 351 | return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2); |
michael@0 | 352 | case BLEND_MODE_LIGHTEN: |
michael@0 | 353 | return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2); |
michael@0 | 354 | default: |
michael@0 | 355 | return nullptr; |
michael@0 | 356 | } |
michael@0 | 357 | } |
michael@0 | 358 | |
michael@0 | 359 | template<MorphologyOperator Operator, typename u8x16_t> |
michael@0 | 360 | static u8x16_t |
michael@0 | 361 | Morph8(u8x16_t a, u8x16_t b) |
michael@0 | 362 | { |
michael@0 | 363 | return Operator == MORPHOLOGY_OPERATOR_ERODE ? |
michael@0 | 364 | simd::Min8(a, b) : simd::Max8(a, b); |
michael@0 | 365 | } |
michael@0 | 366 | |
michael@0 | 367 | // Set every pixel to the per-component minimum or maximum of the pixels around |
michael@0 | 368 | // it that are up to aRadius pixels away from it (horizontally). |
michael@0 | 369 | template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> |
michael@0 | 370 | inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
michael@0 | 371 | uint8_t* aDestData, int32_t aDestStride, |
michael@0 | 372 | const IntRect& aDestRect, int32_t aRadius) |
michael@0 | 373 | { |
michael@0 | 374 | static_assert(op == MORPHOLOGY_OPERATOR_ERODE || |
michael@0 | 375 | op == MORPHOLOGY_OPERATOR_DILATE, |
michael@0 | 376 | "unexpected morphology operator"); |
michael@0 | 377 | |
michael@0 | 378 | int32_t kernelSize = aRadius + 1 + aRadius; |
michael@0 | 379 | MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0"); |
michael@0 | 380 | MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3); |
michael@0 | 381 | int32_t completeKernelSizeForFourPixels = kernelSize + 3; |
michael@0 | 382 | MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 || |
michael@0 | 383 | completeKernelSizeForFourPixels % 4 == 2); |
michael@0 | 384 | |
michael@0 | 385 | // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just |
michael@0 | 386 | // the way we need them to be. |
michael@0 | 387 | |
michael@0 | 388 | IntRect sourceRect = aDestRect; |
michael@0 | 389 | sourceRect.Inflate(aRadius, 0); |
michael@0 | 390 | |
michael@0 | 391 | for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) { |
michael@0 | 392 | int32_t kernelStartX = aDestRect.x - aRadius; |
michael@0 | 393 | for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) { |
michael@0 | 394 | // We process four pixels (16 color values) at a time. |
michael@0 | 395 | // aSourceData[0] points to the pixel located at aDestRect.TopLeft(); |
michael@0 | 396 | // source values can be read beyond that because the source is extended |
michael@0 | 397 | // by aRadius pixels. |
michael@0 | 398 | |
michael@0 | 399 | int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX; |
michael@0 | 400 | u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
michael@0 | 401 | u8x16_t m1234 = p1234; |
michael@0 | 402 | |
michael@0 | 403 | for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) { |
michael@0 | 404 | u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ? |
michael@0 | 405 | simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) : |
michael@0 | 406 | simd::FromZero8<u8x16_t>(); |
michael@0 | 407 | u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678); |
michael@0 | 408 | u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678); |
michael@0 | 409 | m1234 = Morph8<op,u8x16_t>(m1234, p2345); |
michael@0 | 410 | m1234 = Morph8<op,u8x16_t>(m1234, p3456); |
michael@0 | 411 | if (i + 2 < completeKernelSizeForFourPixels) { |
michael@0 | 412 | u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678); |
michael@0 | 413 | m1234 = Morph8<op,u8x16_t>(m1234, p4567); |
michael@0 | 414 | m1234 = Morph8<op,u8x16_t>(m1234, p5678); |
michael@0 | 415 | } |
michael@0 | 416 | p1234 = p5678; |
michael@0 | 417 | } |
michael@0 | 418 | |
michael@0 | 419 | int32_t destIndex = y * aDestStride + 4 * x; |
michael@0 | 420 | simd::Store8(&aDestData[destIndex], m1234); |
michael@0 | 421 | } |
michael@0 | 422 | } |
michael@0 | 423 | } |
michael@0 | 424 | |
michael@0 | 425 | template<typename i16x8_t, typename u8x16_t> |
michael@0 | 426 | inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
michael@0 | 427 | uint8_t* aDestData, int32_t aDestStride, |
michael@0 | 428 | const IntRect& aDestRect, int32_t aRadius, |
michael@0 | 429 | MorphologyOperator aOp) |
michael@0 | 430 | { |
michael@0 | 431 | if (aOp == MORPHOLOGY_OPERATOR_ERODE) { |
michael@0 | 432 | ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( |
michael@0 | 433 | aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
michael@0 | 434 | } else { |
michael@0 | 435 | ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( |
michael@0 | 436 | aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
michael@0 | 437 | } |
michael@0 | 438 | } |
michael@0 | 439 | |
michael@0 | 440 | // Set every pixel to the per-component minimum or maximum of the pixels around |
michael@0 | 441 | // it that are up to aRadius pixels away from it (vertically). |
michael@0 | 442 | template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> |
michael@0 | 443 | static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
michael@0 | 444 | uint8_t* aDestData, int32_t aDestStride, |
michael@0 | 445 | const IntRect& aDestRect, int32_t aRadius) |
michael@0 | 446 | { |
michael@0 | 447 | static_assert(op == MORPHOLOGY_OPERATOR_ERODE || |
michael@0 | 448 | op == MORPHOLOGY_OPERATOR_DILATE, |
michael@0 | 449 | "unexpected morphology operator"); |
michael@0 | 450 | |
michael@0 | 451 | int32_t startY = aDestRect.y - aRadius; |
michael@0 | 452 | int32_t endY = aDestRect.y + aRadius; |
michael@0 | 453 | for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) { |
michael@0 | 454 | for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) { |
michael@0 | 455 | int32_t sourceIndex = startY * aSourceStride + 4 * x; |
michael@0 | 456 | u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
michael@0 | 457 | sourceIndex += aSourceStride; |
michael@0 | 458 | for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) { |
michael@0 | 459 | u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
michael@0 | 460 | u = Morph8<op,u8x16_t>(u, u2); |
michael@0 | 461 | } |
michael@0 | 462 | |
michael@0 | 463 | int32_t destIndex = y * aDestStride + 4 * x; |
michael@0 | 464 | simd::Store8(&aDestData[destIndex], u); |
michael@0 | 465 | } |
michael@0 | 466 | } |
michael@0 | 467 | } |
michael@0 | 468 | |
michael@0 | 469 | template<typename i16x8_t, typename u8x16_t> |
michael@0 | 470 | inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
michael@0 | 471 | uint8_t* aDestData, int32_t aDestStride, |
michael@0 | 472 | const IntRect& aDestRect, int32_t aRadius, |
michael@0 | 473 | MorphologyOperator aOp) |
michael@0 | 474 | { |
michael@0 | 475 | if (aOp == MORPHOLOGY_OPERATOR_ERODE) { |
michael@0 | 476 | ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( |
michael@0 | 477 | aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
michael@0 | 478 | } else { |
michael@0 | 479 | ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( |
michael@0 | 480 | aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
michael@0 | 481 | } |
michael@0 | 482 | } |
michael@0 | 483 | |
michael@0 | 484 | template<typename i32x4_t, typename i16x8_t> |
michael@0 | 485 | static i32x4_t |
michael@0 | 486 | ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias) |
michael@0 | 487 | { |
michael@0 | 488 | // int16_t p[8] == { b, g, r, a, b, g, r, a }. |
michael@0 | 489 | // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }. |
michael@0 | 490 | // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }. |
michael@0 | 491 | // int32_t bias[4] == { _B, _G, _R, _A }. |
michael@0 | 492 | |
michael@0 | 493 | i32x4_t sum = bias; |
michael@0 | 494 | |
michael@0 | 495 | // int16_t bg[8] = { b, g, b, g, b, g, b, g }; |
michael@0 | 496 | i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p)); |
michael@0 | 497 | // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA } |
michael@0 | 498 | i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg); |
michael@0 | 499 | sum = simd::Add32(sum, prodsum_bg); |
michael@0 | 500 | |
michael@0 | 501 | // uint16_t ra[8] = { r, a, r, a, r, a, r, a }; |
michael@0 | 502 | i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p)); |
michael@0 | 503 | // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA } |
michael@0 | 504 | i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra); |
michael@0 | 505 | sum = simd::Add32(sum, prodsum_ra); |
michael@0 | 506 | |
michael@0 | 507 | // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }. |
michael@0 | 508 | return sum; |
michael@0 | 509 | } |
michael@0 | 510 | |
michael@0 | 511 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
michael@0 | 512 | static TemporaryRef<DataSourceSurface> |
michael@0 | 513 | ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix) |
michael@0 | 514 | { |
michael@0 | 515 | // Runs every pixel of aInput through aMatrix using 16-bit fixed-point SIMD math. |
michael@0 | 516 | // Returns a newly created B8G8R8A8 surface, or nullptr if it cannot be created. |
michael@0 | 517 | IntSize dims = aInput->GetSize(); |
michael@0 | 518 | RefPtr<DataSourceSurface> result = Factory::CreateDataSourceSurface(dims, SurfaceFormat::B8G8R8A8); |
michael@0 | 519 | if (!result) { |
michael@0 | 520 | return nullptr; |
michael@0 | 521 | } |
michael@0 | 522 | uint8_t* sourceData = aInput->GetData(); |
michael@0 | 523 | uint8_t* targetData = result->GetData(); |
michael@0 | 524 | int32_t sourceStride = aInput->Stride(); |
michael@0 | 525 | int32_t targetStride = result->Stride(); |
michael@0 | 526 | |
michael@0 | 527 | // Matrix entries are multiplied by 2^7 and stored as int16_t, which limits them to +-255. |
michael@0 | 528 | const int16_t factor = 128; |
michael@0 | 529 | const Float floatElementMax = INT16_MAX / factor; // 255 |
michael@0 | 530 | MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale"); |
michael@0 | 531 | |
michael@0 | 532 | // The matrix is read as 5 rows of 4 consecutive floats, starting at _11. |
michael@0 | 533 | const Float *matrixFloats = &aMatrix._11; |
michael@0 | 534 | |
michael@0 | 535 | ptrdiff_t channelOffset[4] = { |
michael@0 | 536 | B8G8R8A8_COMPONENT_BYTEOFFSET_R, |
michael@0 | 537 | B8G8R8A8_COMPONENT_BYTEOFFSET_G, |
michael@0 | 538 | B8G8R8A8_COMPONENT_BYTEOFFSET_B, |
michael@0 | 539 | B8G8R8A8_COMPONENT_BYTEOFFSET_A |
michael@0 | 540 | }; |
michael@0 | 541 | |
michael@0 | 542 | // Each scaled entry is filed by the byte offsets of its input channel (matrix |
michael@0 | 543 | // row) and output channel (matrix column): offset / 2 selects rows_bgra[0] or |
michael@0 | 544 | // [1], the output offset selects a pair and offset % 2 the slot in that pair. |
michael@0 | 545 | // For byte order B, G, R, A: { bB, gB, bG, gG, bR, gR, bA, gA } and { rB, aB, ... }. |
michael@0 | 546 | // Pairing the entries like this is what allows the use of _mm_madd_epi16. |
michael@0 | 547 | int16_t rows_bgra[2][8]; |
michael@0 | 548 | for (size_t row = 0; row < 4; row++) { |
michael@0 | 549 | const int8_t pairIndex = channelOffset[row] / 2; |
michael@0 | 550 | const int8_t slotInPair = channelOffset[row] % 2; |
michael@0 | 551 | for (size_t col = 0; col < 4; col++) { |
michael@0 | 552 | Float element = matrixFloats[row * 4 + col]; |
michael@0 | 553 | Float clamped = std::min(std::max(element, -floatElementMax), floatElementMax); |
michael@0 | 554 | int16_t fixedPoint = int16_t(clamped * factor + 0.5); |
michael@0 | 555 | int8_t outChannel = channelOffset[col]; |
michael@0 | 556 | rows_bgra[pairIndex][outChannel * 2 + slotInPair] = fixedPoint; |
michael@0 | 557 | } |
michael@0 | 558 | } |
michael@0 | 559 | |
michael@0 | 560 | // The fifth matrix row is the constant term; it is scaled by factor * 255. |
michael@0 | 561 | const size_t biasRow = 4; |
michael@0 | 562 | int32_t bias[4]; |
michael@0 | 563 | Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255); |
michael@0 | 564 | for (size_t col = 0; col < 4; col++) { |
michael@0 | 565 | Float element = matrixFloats[biasRow * 4 + col]; |
michael@0 | 566 | Float clamped = std::min(std::max(element, -biasMax), biasMax); |
michael@0 | 567 | bias[channelOffset[col]] = int32_t(clamped * factor * 255 + 0.5); |
michael@0 | 568 | } |
michael@0 | 569 | |
michael@0 | 570 | int16_t* bg = rows_bgra[0]; |
michael@0 | 571 | i16x8_t row_bg_v = simd::FromI16<i16x8_t>( |
michael@0 | 572 | bg[0], bg[1], bg[2], bg[3], bg[4], bg[5], bg[6], bg[7]); |
michael@0 | 573 | |
michael@0 | 574 | int16_t* ra = rows_bgra[1]; |
michael@0 | 575 | i16x8_t row_ra_v = simd::FromI16<i16x8_t>( |
michael@0 | 576 | ra[0], ra[1], ra[2], ra[3], ra[4], ra[5], ra[6], ra[7]); |
michael@0 | 577 | |
michael@0 | 578 | i32x4_t rowsBias_v = |
michael@0 | 579 | simd::From32<i32x4_t>(bias[0], bias[1], bias[2], bias[3]); |
michael@0 | 580 | |
michael@0 | 581 | for (int32_t y = 0; y < dims.height; y++) { |
michael@0 | 582 | for (int32_t x = 0; x < dims.width; x += 4) { |
michael@0 | 583 | MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position"); |
michael@0 | 584 | MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position"); |
michael@0 | 585 | int32_t srcOffset = y * sourceStride + 4 * x; |
michael@0 | 586 | int32_t dstOffset = y * targetStride + 4 * x; |
michael@0 | 587 | |
michael@0 | 588 | // Four pixels are loaded at once; each one is then expanded and transformed |
michael@0 | 589 | // on its own, and the four results are packed and stored together. |
michael@0 | 590 | u8x16_t quad = simd::Load8<u8x16_t>(&sourceData[srcOffset]); |
michael@0 | 591 | |
michael@0 | 592 | // Splat each pixel so that unpacking places it twice into an i16x8. |
michael@0 | 593 | i16x8_t px1 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(quad)); |
michael@0 | 594 | i16x8_t px2 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(quad)); |
michael@0 | 595 | i16x8_t px3 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(quad)); |
michael@0 | 596 | i16x8_t px4 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(quad)); |
michael@0 | 597 | |
michael@0 | 598 | i32x4_t out1 = ColorMatrixMultiply(px1, row_bg_v, row_ra_v, rowsBias_v); |
michael@0 | 599 | i32x4_t out2 = ColorMatrixMultiply(px2, row_bg_v, row_ra_v, rowsBias_v); |
michael@0 | 600 | i32x4_t out3 = ColorMatrixMultiply(px3, row_bg_v, row_ra_v, rowsBias_v); |
michael@0 | 601 | i32x4_t out4 = ColorMatrixMultiply(px4, row_bg_v, row_ra_v, rowsBias_v); |
michael@0 | 602 | |
michael@0 | 603 | static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor."); |
michael@0 | 604 | u8x16_t packed = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(out1), |
michael@0 | 605 | simd::ShiftRight32<7>(out2), |
michael@0 | 606 | simd::ShiftRight32<7>(out3), |
michael@0 | 607 | simd::ShiftRight32<7>(out4)); |
michael@0 | 608 | simd::Store8(&targetData[dstOffset], packed); |
michael@0 | 609 | } |
michael@0 | 610 | } |
michael@0 | 611 | return result; |
michael@0 | 612 | } |
michael@0 | 613 | |
michael@0 | 614 | // Composites two source pixels with two dest pixels using the operator that |
michael@0 | 615 | // is given as template argument. Every 16-bit lane holds one channel: |
michael@0 | 616 | // source / dest: bgra bgra |
michael@0 | 617 | // sourceAlpha / destAlpha: aaaa aaaa |
michael@0 | 618 | // result: bgra bgra |
michael@0 | 619 | // Sums of products are divided by 255 again before they are returned. An |
michael@0 | 620 | // operator without a branch below produces all zeros. |
michael@0 | 621 | template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator> |
michael@0 | 622 | static inline u16x8_t |
michael@0 | 623 | CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha) |
michael@0 | 624 | { |
michael@0 | 625 | // 255 in every lane; subtracting an alpha from it inverts that alpha. |
michael@0 | 626 | u16x8_t maxAlpha = simd::FromU16<u16x8_t>(255); |
michael@0 | 627 | |
michael@0 | 628 | if (aCompositeOperator == COMPOSITE_OPERATOR_OVER) { |
michael@0 | 629 | // val = dest * (255 - sourceAlpha) + source * 255; |
michael@0 | 630 | u16x8_t inverseSourceAlpha = simd::Sub16(maxAlpha, sourceAlpha); |
michael@0 | 631 | |
michael@0 | 632 | // Pair up dest with source so that one multiply-add computes both terms. |
michael@0 | 633 | u16x8_t pairsLo = simd::InterleaveLo16(dest, source); |
michael@0 | 634 | u16x8_t factorsLo = simd::InterleaveLo16(inverseSourceAlpha, maxAlpha); |
michael@0 | 635 | i32x4_t sumLo = simd::MulAdd16x8x2To32x4(pairsLo, factorsLo); |
michael@0 | 636 | |
michael@0 | 637 | u16x8_t pairsHi = simd::InterleaveHi16(dest, source); |
michael@0 | 638 | u16x8_t factorsHi = simd::InterleaveHi16(inverseSourceAlpha, maxAlpha); |
michael@0 | 639 | i32x4_t sumHi = simd::MulAdd16x8x2To32x4(pairsHi, factorsHi); |
michael@0 | 640 | |
michael@0 | 641 | return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(sumLo), |
michael@0 | 642 | simd::FastDivideBy255(sumHi)); |
michael@0 | 643 | } |
michael@0 | 644 | |
michael@0 | 645 | if (aCompositeOperator == COMPOSITE_OPERATOR_IN) { |
michael@0 | 646 | // val = source * destAlpha; |
michael@0 | 647 | // Only source contributes to the result. |
michael@0 | 648 | return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha)); |
michael@0 | 649 | } |
michael@0 | 650 | |
michael@0 | 651 | if (aCompositeOperator == COMPOSITE_OPERATOR_OUT) { |
michael@0 | 652 | // val = source * (255 - destAlpha); |
michael@0 | 653 | u16x8_t inverseDestAlpha = simd::Sub16(maxAlpha, destAlpha); |
michael@0 | 654 | return simd::FastDivideBy255_16(simd::Mul16(source, inverseDestAlpha)); |
michael@0 | 655 | } |
michael@0 | 656 | |
michael@0 | 657 | if (aCompositeOperator == COMPOSITE_OPERATOR_ATOP) { |
michael@0 | 658 | // val = dest * (255 - sourceAlpha) + source * destAlpha; |
michael@0 | 659 | u16x8_t inverseSourceAlpha = simd::Sub16(maxAlpha, sourceAlpha); |
michael@0 | 660 | |
michael@0 | 661 | // dest is weighted by the inverted source alpha, source by the dest alpha. |
michael@0 | 662 | u16x8_t pairsLo = simd::InterleaveLo16(dest, source); |
michael@0 | 663 | u16x8_t factorsLo = simd::InterleaveLo16(inverseSourceAlpha, destAlpha); |
michael@0 | 664 | i32x4_t sumLo = simd::MulAdd16x8x2To32x4(pairsLo, factorsLo); |
michael@0 | 665 | |
michael@0 | 666 | u16x8_t pairsHi = simd::InterleaveHi16(dest, source); |
michael@0 | 667 | u16x8_t factorsHi = simd::InterleaveHi16(inverseSourceAlpha, destAlpha); |
michael@0 | 668 | i32x4_t sumHi = simd::MulAdd16x8x2To32x4(pairsHi, factorsHi); |
michael@0 | 669 | |
michael@0 | 670 | return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(sumLo), |
michael@0 | 671 | simd::FastDivideBy255(sumHi)); |
michael@0 | 672 | } |
michael@0 | 673 | |
michael@0 | 674 | if (aCompositeOperator == COMPOSITE_OPERATOR_XOR) { |
michael@0 | 675 | // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha); |
michael@0 | 676 | u16x8_t inverseSourceAlpha = simd::Sub16(maxAlpha, sourceAlpha); |
michael@0 | 677 | u16x8_t inverseDestAlpha = simd::Sub16(maxAlpha, destAlpha); |
michael@0 | 678 | |
michael@0 | 679 | // Each pixel is weighted by the inverted alpha of the other one. |
michael@0 | 680 | u16x8_t pairsLo = simd::InterleaveLo16(dest, source); |
michael@0 | 681 | u16x8_t factorsLo = simd::InterleaveLo16(inverseSourceAlpha, |
michael@0 | 682 | inverseDestAlpha); |
michael@0 | 683 | i32x4_t sumLo = simd::MulAdd16x8x2To32x4(pairsLo, factorsLo); |
michael@0 | 684 | |
michael@0 | 685 | u16x8_t pairsHi = simd::InterleaveHi16(dest, source); |
michael@0 | 686 | u16x8_t factorsHi = simd::InterleaveHi16(inverseSourceAlpha, |
michael@0 | 687 | inverseDestAlpha); |
michael@0 | 688 | i32x4_t sumHi = simd::MulAdd16x8x2To32x4(pairsHi, factorsHi); |
michael@0 | 689 | |
michael@0 | 690 | return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(sumLo), |
michael@0 | 691 | simd::FastDivideBy255(sumHi)); |
michael@0 | 692 | } |
michael@0 | 693 | |
michael@0 | 694 | // Any other operator value. |
michael@0 | 695 | return simd::FromU16<u16x8_t>(0); |
michael@0 | 696 | } |
michael@0 | 697 | |
michael@0 | 698 | template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op> |
michael@0 | 699 | static void |
michael@0 | 700 | ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest) |
michael@0 | 701 | { |
michael@0 | 702 | // Composites aSource with aDest using operator op, four pixels per step; the result overwrites aDest. |
michael@0 | 703 | IntSize dims = aDest->GetSize(); |
michael@0 | 704 | uint8_t* srcBytes = aSource->GetData(); |
michael@0 | 705 | uint8_t* dstBytes = aDest->GetData(); |
michael@0 | 706 | uint32_t srcStride = aSource->Stride(); |
michael@0 | 707 | uint32_t dstStride = aDest->Stride(); |
michael@0 | 708 | |
michael@0 | 709 | for (int32_t y = 0; y < dims.height; y++) { |
michael@0 | 710 | for (int32_t x = 0; x < dims.width; x += 4) { |
michael@0 | 711 | uint32_t srcOffset = y * srcStride + 4 * x; |
michael@0 | 712 | uint32_t dstOffset = y * dstStride + 4 * x; |
michael@0 | 713 | u8x16_t srcQuad = simd::Load8<u8x16_t>(&srcBytes[srcOffset]); |
michael@0 | 714 | u8x16_t dstQuad = simd::Load8<u8x16_t>(&dstBytes[dstOffset]); |
michael@0 | 715 | |
michael@0 | 716 | // Pixels 1 and 2: widen to 16 bits per channel and splat each pixel's alpha. |
michael@0 | 717 | u16x8_t srcLo = simd::UnpackLo8x8ToU16x8(srcQuad); |
michael@0 | 718 | u16x8_t dstLo = simd::UnpackLo8x8ToU16x8(dstQuad); |
michael@0 | 719 | u16x8_t srcAlphaLo = simd::Splat16<3,3>(srcLo); |
michael@0 | 720 | u16x8_t dstAlphaLo = simd::Splat16<3,3>(dstLo); |
michael@0 | 721 | u16x8_t blendedLo = CompositeTwoPixels<i32x4_t,u16x8_t,op>(srcLo, srcAlphaLo, dstLo, dstAlphaLo); |
michael@0 | 722 | |
michael@0 | 723 | // Pixels 3 and 4, handled the same way. |
michael@0 | 724 | u16x8_t srcHi = simd::UnpackHi8x8ToU16x8(srcQuad); |
michael@0 | 725 | u16x8_t dstHi = simd::UnpackHi8x8ToU16x8(dstQuad); |
michael@0 | 726 | u16x8_t srcAlphaHi = simd::Splat16<3,3>(srcHi); |
michael@0 | 727 | u16x8_t dstAlphaHi = simd::Splat16<3,3>(dstHi); |
michael@0 | 728 | u16x8_t blendedHi = CompositeTwoPixels<i32x4_t,u16x8_t,op>(srcHi, srcAlphaHi, dstHi, dstAlphaHi); |
michael@0 | 729 | u8x16_t packed = simd::PackAndSaturate16To8(blendedLo, blendedHi); |
michael@0 | 730 | simd::Store8(&dstBytes[dstOffset], packed); |
michael@0 | 731 | } |
michael@0 | 732 | } |
michael@0 | 733 | } |
michael@0 | 734 | |
michael@0 | 735 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
michael@0 | 736 | static void |
michael@0 | 737 | ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest, |
michael@0 | 738 | CompositeOperator aOperator) |
michael@0 | 739 | { |
michael@0 | 740 | switch (aOperator) { |
michael@0 | 741 | case COMPOSITE_OPERATOR_OVER: |
michael@0 | 742 | ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest); |
michael@0 | 743 | break; |
michael@0 | 744 | case COMPOSITE_OPERATOR_IN: |
michael@0 | 745 | ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest); |
michael@0 | 746 | break; |
michael@0 | 747 | case COMPOSITE_OPERATOR_OUT: |
michael@0 | 748 | ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest); |
michael@0 | 749 | break; |
michael@0 | 750 | case COMPOSITE_OPERATOR_ATOP: |
michael@0 | 751 | ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest); |
michael@0 | 752 | break; |
michael@0 | 753 | case COMPOSITE_OPERATOR_XOR: |
michael@0 | 754 | ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest); |
michael@0 | 755 | break; |
michael@0 | 756 | default: |
michael@0 | 757 | MOZ_CRASH(); |
michael@0 | 758 | } |
michael@0 | 759 | } |
michael@0 | 760 | |
michael@0 | 761 | template<typename u8x16_t> |
michael@0 | 762 | static void |
michael@0 | 763 | SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, |
michael@0 | 764 | uint8_t* channel0Data, uint8_t* channel1Data, |
michael@0 | 765 | uint8_t* channel2Data, uint8_t* channel3Data, |
michael@0 | 766 | int32_t channelStride) |
michael@0 | 767 | { |
michael@0 | 768 | // Splits interleaved 4-byte pixels into four byte planes, 16 pixels per step. |
michael@0 | 769 | // Only the first of the four source vectors of a step is always loaded; each |
michael@0 | 770 | // of the others is replaced by zeros if it would start at or beyond |
michael@0 | 771 | // sourceStride within the row. Every step stores 16 bytes to each plane. |
michael@0 | 772 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 773 | for (int32_t x = 0; x < size.width; x += 16) { |
michael@0 | 774 | int32_t srcOffset = y * sourceStride + 4 * x; |
michael@0 | 775 | int32_t planeOffset = y * channelStride + x; |
michael@0 | 776 | |
michael@0 | 777 | u8x16_t px0to3 = simd::Load8<u8x16_t>(&sourceData[srcOffset]); |
michael@0 | 778 | u8x16_t px4to7 = (4 * (x + 4) < sourceStride) |
michael@0 | 779 | ? simd::Load8<u8x16_t>(&sourceData[srcOffset + 4 * 4]) : simd::FromZero8<u8x16_t>(); |
michael@0 | 780 | u8x16_t px8to11 = (4 * (x + 8) < sourceStride) |
michael@0 | 781 | ? simd::Load8<u8x16_t>(&sourceData[srcOffset + 4 * 8]) : simd::FromZero8<u8x16_t>(); |
michael@0 | 782 | u8x16_t px12to15 = (4 * (x + 12) < sourceStride) |
michael@0 | 783 | ? simd::Load8<u8x16_t>(&sourceData[srcOffset + 4 * 12]) : simd::FromZero8<u8x16_t>(); |
michael@0 | 784 | |
michael@0 | 785 | // Four rounds of byte interleaving gather the bytes of each channel. |
michael@0 | 786 | // Round 1: each channel byte appears twice in a row (bbggrraabbggrraa). |
michael@0 | 787 | u8x16_t twice1 = simd::InterleaveLo8(px0to3, px8to11); |
michael@0 | 788 | u8x16_t twice2 = simd::InterleaveHi8(px0to3, px8to11); |
michael@0 | 789 | u8x16_t twice3 = simd::InterleaveLo8(px4to7, px12to15); |
michael@0 | 790 | u8x16_t twice4 = simd::InterleaveHi8(px4to7, px12to15); |
michael@0 | 791 | // Round 2: four times in a row (bbbbggggrrrraaaa). |
michael@0 | 792 | u8x16_t quad1 = simd::InterleaveLo8(twice1, twice3); |
michael@0 | 793 | u8x16_t quad2 = simd::InterleaveHi8(twice1, twice3); |
michael@0 | 794 | u8x16_t quad3 = simd::InterleaveLo8(twice2, twice4); |
michael@0 | 795 | u8x16_t quad4 = simd::InterleaveHi8(twice2, twice4); |
michael@0 | 796 | // Round 3: eight times in a row (bbbbbbbbgggggggg and rrrrrrrraaaaaaaa). |
michael@0 | 797 | u8x16_t bg1 = simd::InterleaveLo8(quad1, quad3); |
michael@0 | 798 | u8x16_t ra1 = simd::InterleaveHi8(quad1, quad3); |
michael@0 | 799 | u8x16_t bg2 = simd::InterleaveLo8(quad2, quad4); |
michael@0 | 800 | u8x16_t ra2 = simd::InterleaveHi8(quad2, quad4); |
michael@0 | 801 | // Round 4: one whole vector per channel. |
michael@0 | 802 | u8x16_t allB = simd::InterleaveLo8(bg1, bg2); |
michael@0 | 803 | u8x16_t allG = simd::InterleaveHi8(bg1, bg2); |
michael@0 | 804 | u8x16_t allR = simd::InterleaveLo8(ra1, ra2); |
michael@0 | 805 | u8x16_t allA = simd::InterleaveHi8(ra1, ra2); |
michael@0 | 806 | |
michael@0 | 807 | simd::Store8(&channel0Data[planeOffset], allB); |
michael@0 | 808 | simd::Store8(&channel1Data[planeOffset], allG); |
michael@0 | 809 | simd::Store8(&channel2Data[planeOffset], allR); |
michael@0 | 810 | simd::Store8(&channel3Data[planeOffset], allA); |
michael@0 | 811 | } |
michael@0 | 812 | } |
michael@0 | 813 | } |
michael@0 | 814 | |
michael@0 | 815 | template<typename u8x16_t> |
michael@0 | 816 | static void |
michael@0 | 817 | CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data) |
michael@0 | 818 | { |
michael@0 | 819 | // Interleaves four byte planes into 4-byte pixels, 16 pixels per step. Only the first result |
michael@0 | 820 | // vector is always stored; the others are skipped if they would start at or beyond resultStride. |
michael@0 | 821 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 822 | for (int32_t x = 0; x < size.width; x += 16) { |
michael@0 | 823 | int32_t planeOffset = y * channelStride + x; |
michael@0 | 824 | uint8_t* out = &resultData[y * resultStride + 4 * x]; |
michael@0 | 825 | u8x16_t plane0 = simd::Load8<u8x16_t>(&channel0Data[planeOffset]); |
michael@0 | 826 | u8x16_t plane1 = simd::Load8<u8x16_t>(&channel1Data[planeOffset]); |
michael@0 | 827 | u8x16_t plane2 = simd::Load8<u8x16_t>(&channel2Data[planeOffset]); |
michael@0 | 828 | u8x16_t plane3 = simd::Load8<u8x16_t>(&channel3Data[planeOffset]); |
michael@0 | 829 | // Round 1: pair plane 0 with plane 2 and plane 1 with plane 3 (brbr... and gaga...). |
michael@0 | 830 | u8x16_t brLo = simd::InterleaveLo8(plane0, plane2); |
michael@0 | 831 | u8x16_t brHi = simd::InterleaveHi8(plane0, plane2); |
michael@0 | 832 | u8x16_t gaLo = simd::InterleaveLo8(plane1, plane3); |
michael@0 | 833 | u8x16_t gaHi = simd::InterleaveHi8(plane1, plane3); |
michael@0 | 834 | // Round 2: merge the pairs into complete bgra pixels, four per vector. |
michael@0 | 835 | u8x16_t px0to3 = simd::InterleaveLo8(brLo, gaLo); |
michael@0 | 836 | u8x16_t px4to7 = simd::InterleaveHi8(brLo, gaLo); |
michael@0 | 837 | u8x16_t px8to11 = simd::InterleaveLo8(brHi, gaHi); |
michael@0 | 838 | u8x16_t px12to15 = simd::InterleaveHi8(brHi, gaHi); |
michael@0 | 839 | |
michael@0 | 840 | simd::Store8(out, px0to3); |
michael@0 | 841 | if (4 * (x + 4) < resultStride) { |
michael@0 | 842 | simd::Store8(out + 4 * 4, px4to7); |
michael@0 | 843 | } |
michael@0 | 844 | if (4 * (x + 8) < resultStride) { |
michael@0 | 845 | simd::Store8(out + 8 * 4, px8to11); |
michael@0 | 846 | } |
michael@0 | 847 | if (4 * (x + 12) < resultStride) { |
michael@0 | 848 | simd::Store8(out + 12 * 4, px12to15); |
michael@0 | 849 | } |
michael@0 | 850 | } |
michael@0 | 851 | } |
michael@0 | 852 | } |
michael@0 | 853 | |
michael@0 | 854 | |
michael@0 | 855 | template<typename i32x4_t, typename u16x8_t, typename u8x16_t> |
michael@0 | 856 | static void |
michael@0 | 857 | DoPremultiplicationCalculation_SIMD(const IntSize& aSize, |
michael@0 | 858 | uint8_t* aTargetData, int32_t aTargetStride, |
michael@0 | 859 | uint8_t* aSourceData, int32_t aSourceStride) |
michael@0 | 860 | { |
michael@0 | 861 | // Multiplies every channel of a pixel by that pixel's fourth channel (alpha) and |
michael@0 | 862 | // divides by 255; the fourth byte of each pixel is then restored from the source. |
michael@0 | 863 | const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff); |
michael@0 | 864 | for (int32_t y = 0; y < aSize.height; y++) { |
michael@0 | 865 | for (int32_t x = 0; x < aSize.width; x += 4) { |
michael@0 | 866 | int32_t srcOffset = y * aSourceStride + 4 * x; |
michael@0 | 867 | int32_t dstOffset = y * aTargetStride + 4 * x; |
michael@0 | 868 | u8x16_t original = simd::Load8<u8x16_t>(&aSourceData[srcOffset]); |
michael@0 | 869 | u16x8_t lo = simd::UnpackLo8x8ToU16x8(original); |
michael@0 | 870 | u16x8_t hi = simd::UnpackHi8x8ToU16x8(original); |
michael@0 | 871 | |
michael@0 | 872 | // Multiply all components with alpha. |
michael@0 | 873 | u16x8_t scaledLo = simd::Mul16(lo, simd::Splat16<3,3>(lo)); |
michael@0 | 874 | u16x8_t scaledHi = simd::Mul16(hi, simd::Splat16<3,3>(hi)); |
michael@0 | 875 | |
michael@0 | 876 | // Divide by 255 and pack. |
michael@0 | 877 | u8x16_t premultiplied = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(scaledLo), |
michael@0 | 878 | simd::FastDivideBy255_16(scaledHi)); |
michael@0 | 879 | |
michael@0 | 880 | // Get the original alpha channel value back from the loaded pixels. |
michael@0 | 881 | u8x16_t withAlpha = simd::Pick(alphaMask, premultiplied, original); |
michael@0 | 882 | simd::Store8(&aTargetData[dstOffset], withAlpha); |
michael@0 | 883 | } |
michael@0 | 884 | } |
michael@0 | 885 | } |
michael@0 | 886 | |
michael@0 | 887 | // Table of precomputed factors, scaled by 1 << 8, that is used for unpremultiplying. |
michael@0 | 888 | // We want to compute round(r / (alpha / 255.0f)) for arbitrary values of |
michael@0 | 889 | // r and alpha in constant time. This table of factors has the property that |
michael@0 | 890 | // (r * sAlphaFactors[alpha] + 128) >> 8 roughly gives the result we want (with |
michael@0 | 891 | // a maximum deviation of 1). |
michael@0 | 892 | // |
michael@0 | 893 | // sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha) for alpha > 0, and |
michael@0 | 894 | // sAlphaFactors[0] == 0, so that the formula above yields 0 when alpha is 0. |
michael@0 | 895 | // This table has been created using the python code |
michael@0 | 896 | // ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256)) |
michael@0 | 897 | static const uint16_t sAlphaFactors[256] = { |
michael@0 | 898 | 0, 65280, 32640, 21760, 16320, 13056, 10880, 9326, 8160, 7253, 6528, 5935, |
michael@0 | 899 | 5440, 5022, 4663, 4352, 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838, 2720, |
michael@0 | 900 | 2611, 2511, 2418, 2331, 2251, 2176, 2106, 2040, 1978, 1920, 1865, 1813, 1764, |
michael@0 | 901 | 1718, 1674, 1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389, 1360, 1332, 1306, |
michael@0 | 902 | 1280, 1255, 1232, 1209, 1187, 1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036, |
michael@0 | 903 | 1020, 1004, 989, 974, 960, 946, 933, 919, 907, 894, 882, 870, 859, 848, 837, |
michael@0 | 904 | 826, 816, 806, 796, 787, 777, 768, 759, 750, 742, 733, 725, 717, 710, 702, |
michael@0 | 905 | 694, 687, 680, 673, 666, 659, 653, 646, 640, 634, 628, 622, 616, 610, 604, |
michael@0 | 906 | 599, 593, 588, 583, 578, 573, 568, 563, 558, 553, 549, 544, 540, 535, 531, |
michael@0 | 907 | 526, 522, 518, 514, 510, 506, 502, 498, 495, 491, 487, 484, 480, 476, 473, |
michael@0 | 908 | 470, 466, 463, 460, 457, 453, 450, 447, 444, 441, 438, 435, 432, 429, 427, |
michael@0 | 909 | 424, 421, 418, 416, 413, 411, 408, 405, 403, 400, 398, 396, 393, 391, 389, |
michael@0 | 910 | 386, 384, 382, 380, 377, 375, 373, 371, 369, 367, 365, 363, 361, 359, 357, |
michael@0 | 911 | 355, 353, 351, 349, 347, 345, 344, 342, 340, 338, 336, 335, 333, 331, 330, |
michael@0 | 912 | 328, 326, 325, 323, 322, 320, 318, 317, 315, 314, 312, 311, 309, 308, 306, |
michael@0 | 913 | 305, 304, 302, 301, 299, 298, 297, 295, 294, 293, 291, 290, 289, 288, 286, |
michael@0 | 914 | 285, 284, 283, 281, 280, 279, 278, 277, 275, 274, 273, 272, 271, 270, 269, |
michael@0 | 915 | 268, 266, 265, 264, 263, 262, 261, 260, 259, 258, 257, 256 |
michael@0 | 916 | }; |
michael@0 | 917 | |
michael@0 | 918 | template<typename u16x8_t, typename u8x16_t> |
michael@0 | 919 | static void |
michael@0 | 920 | DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize, |
michael@0 | 921 | uint8_t* aTargetData, int32_t aTargetStride, |
michael@0 | 922 | uint8_t* aSourceData, int32_t aSourceStride) |
michael@0 | 923 | { |
michael@0 | 924 | // Undoes premultiplication: the first three channels of every pixel are multiplied by the |
michael@0 | 925 | // sAlphaFactors entry for that pixel's alpha byte, the fourth channel by 1 << 8. |
michael@0 | 926 | const uint16_t unity = 1 << 8; |
michael@0 | 927 | const u16x8_t roundingBias = simd::FromU16<u16x8_t>(128); |
michael@0 | 928 | for (int32_t y = 0; y < aSize.height; y++) { |
michael@0 | 929 | for (int32_t x = 0; x < aSize.width; x += 4) { |
michael@0 | 930 | int32_t srcOffset = y * aSourceStride + 4 * x; |
michael@0 | 931 | int32_t dstOffset = y * aTargetStride + 4 * x; |
michael@0 | 932 | union { |
michael@0 | 933 | u8x16_t quad; |
michael@0 | 934 | uint8_t bytes[4][4]; |
michael@0 | 935 | }; |
michael@0 | 936 | quad = simd::Load8<u8x16_t>(&aSourceData[srcOffset]); |
michael@0 | 937 | |
michael@0 | 938 | // Look up one factor per pixel from its alpha byte and spread it over the pixel's lanes. |
michael@0 | 939 | uint16_t f1 = sAlphaFactors[bytes[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
michael@0 | 940 | uint16_t f2 = sAlphaFactors[bytes[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
michael@0 | 941 | uint16_t f3 = sAlphaFactors[bytes[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
michael@0 | 942 | uint16_t f4 = sAlphaFactors[bytes[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
michael@0 | 943 | u16x8_t factorsLo = simd::FromU16<u16x8_t>(f1, f1, f1, unity, f2, f2, f2, unity); |
michael@0 | 944 | u16x8_t factorsHi = simd::FromU16<u16x8_t>(f3, f3, f3, unity, f4, f4, f4, unity); |
michael@0 | 945 | |
michael@0 | 946 | // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits. |
michael@0 | 947 | u16x8_t lo = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(simd::UnpackLo8x8ToU16x8(quad), factorsLo), roundingBias)); |
michael@0 | 948 | u16x8_t hi = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(simd::UnpackHi8x8ToU16x8(quad), factorsHi), roundingBias)); |
michael@0 | 949 | |
michael@0 | 950 | simd::Store8(&aTargetData[dstOffset], simd::PackAndSaturate16To8(lo, hi)); |
michael@0 | 951 | } |
michael@0 | 952 | } |
michael@0 | 953 | } |
michael@0 | 954 | |
michael@0 | 955 | template<typename f32x4_t, typename i32x4_t, typename u8x16_t> |
michael@0 | 956 | static TemporaryRef<DataSourceSurface> |
michael@0 | 957 | RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency, |
michael@0 | 958 | int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect) |
michael@0 | 959 | { |
michael@0 | 960 | // Type and stitching are template arguments of SVGTurbulenceRenderer, so aType and aStitch |
michael@0 | 961 | // select one of four instantiations. A type that matches neither branch yields nullptr. |
michael@0 | 962 | if (aType == TURBULENCE_TYPE_TURBULENCE) { |
michael@0 | 963 | if (aStitch) { |
michael@0 | 964 | SVGTurbulenceRenderer<TURBULENCE_TYPE_TURBULENCE,true,f32x4_t,i32x4_t,u8x16_t> |
michael@0 | 965 | renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); |
michael@0 | 966 | return renderer.Render(aSize, aOffset); |
michael@0 | 967 | } |
michael@0 | 968 | SVGTurbulenceRenderer<TURBULENCE_TYPE_TURBULENCE,false,f32x4_t,i32x4_t,u8x16_t> |
michael@0 | 969 | renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); |
michael@0 | 970 | return renderer.Render(aSize, aOffset); |
michael@0 | 971 | } |
michael@0 | 972 | if (aType == TURBULENCE_TYPE_FRACTAL_NOISE) { |
michael@0 | 973 | if (aStitch) { |
michael@0 | 974 | SVGTurbulenceRenderer<TURBULENCE_TYPE_FRACTAL_NOISE,true,f32x4_t,i32x4_t,u8x16_t> |
michael@0 | 975 | renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); |
michael@0 | 976 | return renderer.Render(aSize, aOffset); |
michael@0 | 977 | } |
michael@0 | 978 | SVGTurbulenceRenderer<TURBULENCE_TYPE_FRACTAL_NOISE,false,f32x4_t,i32x4_t,u8x16_t> |
michael@0 | 979 | renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); |
michael@0 | 980 | return renderer.Render(aSize, aOffset); |
michael@0 | 981 | } |
michael@0 | 982 | return nullptr; |
michael@0 | 983 | } |
michael@0 | 984 | |
michael@0 | 985 | // Evaluates k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 for two pixels at once. |
michael@0 | 986 | template<typename i32x4_t, typename i16x8_t> |
michael@0 | 987 | static MOZ_ALWAYS_INLINE i16x8_t |
michael@0 | 988 | ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2, |
michael@0 | 989 | const i16x8_t &k1And4, const i16x8_t &k2And3) |
michael@0 | 990 | { |
michael@0 | 991 | // Input product: product = (in1 * in2) / 255. |
michael@0 | 992 | i32x4_t productLo, productHi; |
michael@0 | 993 | simd::Mul16x4x2x2To32x4x2(in1, in2, productLo, productHi); |
michael@0 | 994 | i16x8_t product = simd::PackAndSaturate32To16(simd::FastDivideBy255(productLo), simd::FastDivideBy255(productHi)); |
michael@0 | 995 | |
michael@0 | 996 | // k1 * product + k4 * 128: pairing product with the constant 128 lets one multiply-add do both. |
michael@0 | 997 | i16x8_t constant128 = simd::FromI16<i16x8_t>(128); |
michael@0 | 998 | i32x4_t k1k4TermLo = simd::MulAdd16x8x2To32x4(k1And4, simd::InterleaveLo16(product, constant128)); |
michael@0 | 999 | i32x4_t k1k4TermHi = simd::MulAdd16x8x2To32x4(k1And4, simd::InterleaveHi16(product, constant128)); |
michael@0 | 1000 | |
michael@0 | 1001 | // k2 * in1 + k3 * in2, again with one multiply-add per half. |
michael@0 | 1002 | i32x4_t k2k3TermLo = simd::MulAdd16x8x2To32x4(k2And3, simd::InterleaveLo16(in1, in2)); |
michael@0 | 1003 | i32x4_t k2k3TermHi = simd::MulAdd16x8x2To32x4(k2And3, simd::InterleaveHi16(in1, in2)); |
michael@0 | 1004 | |
michael@0 | 1005 | // Sum everything up. |
michael@0 | 1006 | i32x4_t sumLo = simd::Add32(k1k4TermLo, k2k3TermLo); |
michael@0 | 1007 | i32x4_t sumHi = simd::Add32(k1k4TermHi, k2k3TermHi); |
michael@0 | 1008 | |
michael@0 | 1009 | // Truncate the fractional part (7 bits) and pack with saturation. |
michael@0 | 1010 | i32x4_t resultLo = simd::ShiftRight32<7>(sumLo); |
michael@0 | 1011 | i32x4_t resultHi = simd::ShiftRight32<7>(sumHi); |
michael@0 | 1012 | return simd::PackAndSaturate32To16(resultLo, resultHi); |
michael@0 | 1013 | } |
michael@0 | 1014 | |
michael@0 | 1015 | template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
michael@0 | 1016 | static TemporaryRef<DataSourceSurface> |
michael@0 | 1017 | ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, |
michael@0 | 1018 | Float aK1, Float aK2, Float aK3, Float aK4) |
michael@0 | 1019 | { |
michael@0 | 1020 | IntSize size = aInput1->GetSize(); |
michael@0 | 1021 | RefPtr<DataSourceSurface> target = |
michael@0 | 1022 | Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
michael@0 | 1023 | if (!target) { |
michael@0 | 1024 | return nullptr; |
michael@0 | 1025 | } |
michael@0 | 1026 | |
michael@0 | 1027 | uint8_t* source1Data = aInput1->GetData(); |
michael@0 | 1028 | uint8_t* source2Data = aInput2->GetData(); |
michael@0 | 1029 | uint8_t* targetData = target->GetData(); |
michael@0 | 1030 | uint32_t source1Stride = aInput1->Stride(); |
michael@0 | 1031 | uint32_t source2Stride = aInput2->Stride(); |
michael@0 | 1032 | uint32_t targetStride = target->Stride(); |
michael@0 | 1033 | |
michael@0 | 1034 | // The arithmetic combine filter does the following calculation: |
michael@0 | 1035 | // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 |
michael@0 | 1036 | // |
michael@0 | 1037 | // Or, with in1/2 integers between 0 and 255: |
michael@0 | 1038 | // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255 |
michael@0 | 1039 | // |
michael@0 | 1040 | // We want the whole calculation to happen in integer, with 16-bit factors. |
michael@0 | 1041 | // So we convert our factors to fixed-point with precision 1.8.7. |
michael@0 | 1042 | // K4 is premultiplied with 255, and it will be multiplied with 128 later |
michael@0 | 1043 | // during the actual calculation, because premultiplying it with 255 * 128 |
michael@0 | 1044 | // would overflow int16. |
michael@0 | 1045 | |
michael@0 | 1046 | i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f))); |
michael@0 | 1047 | i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f))); |
michael@0 | 1048 | i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f))); |
michael@0 | 1049 | i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f))); |
michael@0 | 1050 | |
michael@0 | 1051 | i16x8_t k1And4 = simd::InterleaveLo16(k1, k4); |
michael@0 | 1052 | i16x8_t k2And3 = simd::InterleaveLo16(k2, k3); |
michael@0 | 1053 | |
michael@0 | 1054 | for (int32_t y = 0; y < size.height; y++) { |
michael@0 | 1055 | for (int32_t x = 0; x < size.width; x += 4) { |
michael@0 | 1056 | uint32_t source1Index = y * source1Stride + 4 * x; |
michael@0 | 1057 | uint32_t source2Index = y * source2Stride + 4 * x; |
michael@0 | 1058 | uint32_t targetIndex = y * targetStride + 4 * x; |
michael@0 | 1059 | |
michael@0 | 1060 | // Load and unpack. |
michael@0 | 1061 | u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]); |
michael@0 | 1062 | u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]); |
michael@0 | 1063 | i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1); |
michael@0 | 1064 | i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1); |
michael@0 | 1065 | i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2); |
michael@0 | 1066 | i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2); |
michael@0 | 1067 | |
michael@0 | 1068 | // Multiply and add. |
michael@0 | 1069 | i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3); |
michael@0 | 1070 | i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3); |
michael@0 | 1071 | |
michael@0 | 1072 | // Pack and store. |
michael@0 | 1073 | simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34)); |
michael@0 | 1074 | } |
michael@0 | 1075 | } |
michael@0 | 1076 | |
michael@0 | 1077 | return target; |
michael@0 | 1078 | } |
michael@0 | 1079 | |
michael@0 | 1080 | } // namespace gfx |
michael@0 | 1081 | } // namespace mozilla |