Tue, 06 Jan 2015 21:39:09 +0100
Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "FilterProcessing.h"
#include "SIMD.h"
#include "SVGTurbulenceRenderer-inl.h"

#include <cmath>
11 namespace mozilla {
12 namespace gfx {
14 template<typename u8x16_t>
15 inline TemporaryRef<DataSourceSurface>
16 ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface)
17 {
18 IntSize size = aSurface->GetSize();
19 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
20 RefPtr<DataSourceSurface> output =
21 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
22 uint8_t *inputData = input->GetData();
23 uint8_t *outputData = output->GetData();
24 int32_t inputStride = input->Stride();
25 int32_t outputStride = output->Stride();
26 switch (input->GetFormat()) {
27 case SurfaceFormat::B8G8R8A8:
28 output = input;
29 break;
30 case SurfaceFormat::B8G8R8X8:
31 for (int32_t y = 0; y < size.height; y++) {
32 for (int32_t x = 0; x < size.width; x++) {
33 int32_t inputIndex = y * inputStride + 4 * x;
34 int32_t outputIndex = y * outputStride + 4 * x;
35 outputData[outputIndex + 0] = inputData[inputIndex + 0];
36 outputData[outputIndex + 1] = inputData[inputIndex + 1];
37 outputData[outputIndex + 2] = inputData[inputIndex + 2];
38 outputData[outputIndex + 3] = 255;
39 }
40 }
41 break;
42 case SurfaceFormat::R8G8B8A8:
43 for (int32_t y = 0; y < size.height; y++) {
44 for (int32_t x = 0; x < size.width; x++) {
45 int32_t inputIndex = y * inputStride + 4 * x;
46 int32_t outputIndex = y * outputStride + 4 * x;
47 outputData[outputIndex + 2] = inputData[inputIndex + 0];
48 outputData[outputIndex + 1] = inputData[inputIndex + 1];
49 outputData[outputIndex + 0] = inputData[inputIndex + 2];
50 outputData[outputIndex + 3] = inputData[inputIndex + 3];
51 }
52 }
53 break;
54 case SurfaceFormat::R8G8B8X8:
55 for (int32_t y = 0; y < size.height; y++) {
56 for (int32_t x = 0; x < size.width; x++) {
57 int32_t inputIndex = y * inputStride + 4 * x;
58 int32_t outputIndex = y * outputStride + 4 * x;
59 outputData[outputIndex + 2] = inputData[inputIndex + 0];
60 outputData[outputIndex + 1] = inputData[inputIndex + 1];
61 outputData[outputIndex + 0] = inputData[inputIndex + 2];
62 outputData[outputIndex + 3] = 255;
63 }
64 }
65 break;
66 case SurfaceFormat::A8:
67 for (int32_t y = 0; y < size.height; y++) {
68 for (int32_t x = 0; x < size.width; x += 16) {
69 int32_t inputIndex = y * inputStride + x;
70 int32_t outputIndex = y * outputStride + 4 * x;
71 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
72 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
73 // interleaving with 0000000000000000 twice.
74 u8x16_t zero = simd::FromZero8<u8x16_t>();
75 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
76 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
77 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
78 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
79 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
80 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
81 simd::Store8(&outputData[outputIndex], p1To4);
82 if ((x + 4) * 4 < outputStride) {
83 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
84 }
85 if ((x + 8) * 4 < outputStride) {
86 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
87 }
88 if ((x + 12) * 4 < outputStride) {
89 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
90 }
91 }
92 }
93 break;
94 default:
95 output = nullptr;
96 break;
97 }
98 return output;
99 }
101 template<typename u8x16_t>
102 inline void
103 ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride)
104 {
105 for (int32_t y = 0; y < size.height; y++) {
106 for (int32_t x = 0; x < size.width; x += 16) {
107 // Process 16 pixels at a time.
108 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA.
109 int32_t sourceIndex = y * sourceStride + 4 * x;
110 int32_t targetIndex = y * alphaStride + x;
112 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
113 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
114 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
115 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
117 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
118 if (4 * (x + 4) < sourceStride) {
119 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
120 }
121 if (4 * (x + 8) < sourceStride) {
122 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
123 }
124 if (4 * (x + 12) < sourceStride) {
125 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
126 }
128 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
129 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
130 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
131 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
132 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
133 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
134 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
135 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
136 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
137 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
138 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
140 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
141 }
142 }
143 }
145 // This function calculates the result color values for four pixels, but for
146 // only two color channels - either b & r or g & a. However, the a result will
147 // not be used.
148 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
149 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
150 // alpha of all four pixels (and both aaaa's are the same).
151 // blendendComponent1 and blendedComponent2 are the out parameters.
152 template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
153 inline void
154 BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
155 i16x8_t dest, const i16x8_t& destAlpha,
156 i32x4_t& blendedComponent1, i32x4_t& blendedComponent2)
157 {
158 i16x8_t x255 = simd::FromI16<i16x8_t>(255);
160 switch (aBlendMode) {
162 case BLEND_MODE_MULTIPLY:
163 {
164 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest);
165 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
166 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
167 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
169 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
170 i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
171 blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
172 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
174 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
175 i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
176 blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
177 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
179 break;
180 }
182 case BLEND_MODE_SCREEN:
183 {
184 // val = 255 * (source + dest) + (0 - dest) * source;
185 i16x8_t sourcePlusDest = simd::Add16(source, dest);
186 i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
188 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest);
189 i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source);
190 blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1);
191 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
193 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest);
194 i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source);
195 blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2);
196 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
198 break;
199 }
201 case BLEND_MODE_DARKEN:
202 case BLEND_MODE_LIGHTEN:
203 {
204 // Darken:
205 // val = min((255 - destAlpha) * source + 255 * dest,
206 // 255 * source + (255 - sourceAlpha) * dest);
207 //
208 // Lighten:
209 // val = max((255 - destAlpha) * source + 255 * dest,
210 // 255 * source + (255 - sourceAlpha) * dest);
212 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
213 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
215 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
216 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
217 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
218 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1);
219 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1);
220 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2);
221 blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
223 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
224 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
225 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
226 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2);
227 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2);
228 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2);
229 blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
231 break;
232 }
234 }
235 }
237 // The alpha channel is subject to a different calculation than the RGB
238 // channels, and this calculation is the same for all blend modes:
239 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
240 template<typename i16x8_t, typename i32x4_t>
241 inline i32x4_t
242 BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234)
243 {
244 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
245 // appropriately. The calculation is rewritten as follows:
246 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
247 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
248 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
249 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
250 i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
251 i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
252 i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
253 i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
254 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
255 }
257 template<typename u8x16_t, typename i16x8_t>
258 inline void
259 UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
260 i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234)
261 {
262 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
263 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
264 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
265 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
266 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
267 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
268 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
269 }
271 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
272 inline u8x16_t
273 ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
274 i32x4_t rrrr1234, const i32x4_t& aaaa1234)
275 {
276 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
277 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
278 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
279 i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
280 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
281 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
282 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
283 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
284 }
286 template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
287 inline TemporaryRef<DataSourceSurface>
288 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2)
289 {
290 IntSize size = aInput1->GetSize();
291 RefPtr<DataSourceSurface> target =
292 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
293 if (!target) {
294 return nullptr;
295 }
297 uint8_t* source1Data = aInput1->GetData();
298 uint8_t* source2Data = aInput2->GetData();
299 uint8_t* targetData = target->GetData();
300 int32_t targetStride = target->Stride();
301 int32_t source1Stride = aInput1->Stride();
302 int32_t source2Stride = aInput2->Stride();
304 for (int32_t y = 0; y < size.height; y++) {
305 for (int32_t x = 0; x < size.width; x += 4) {
306 int32_t targetIndex = y * targetStride + 4 * x;
307 int32_t source1Index = y * source1Stride + 4 * x;
308 int32_t source2Index = y * source2Stride + 4 * x;
310 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
311 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
313 // The blending calculation for the RGB channels all need access to the
314 // alpha channel of their pixel, and the alpha calculation is different,
315 // so it makes sense to separate by channel.
317 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
318 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
319 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
320 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
321 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234);
322 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234);
324 // We only use blendedB, blendedG and blendedR.
325 i32x4_t blendedB, blendedG, blendedR, blendedA;
326 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG);
327 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA);
329 // Throw away blendedA and overwrite it with the correct blended alpha.
330 blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234);
332 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA);
333 simd::Store8(&targetData[targetIndex], result1234);
334 }
335 }
337 return target;
338 }
340 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
341 static TemporaryRef<DataSourceSurface>
342 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
343 BlendMode aBlendMode)
344 {
345 switch (aBlendMode) {
346 case BLEND_MODE_MULTIPLY:
347 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2);
348 case BLEND_MODE_SCREEN:
349 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2);
350 case BLEND_MODE_DARKEN:
351 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2);
352 case BLEND_MODE_LIGHTEN:
353 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2);
354 default:
355 return nullptr;
356 }
357 }
359 template<MorphologyOperator Operator, typename u8x16_t>
360 static u8x16_t
361 Morph8(u8x16_t a, u8x16_t b)
362 {
363 return Operator == MORPHOLOGY_OPERATOR_ERODE ?
364 simd::Min8(a, b) : simd::Max8(a, b);
365 }
367 // Set every pixel to the per-component minimum or maximum of the pixels around
368 // it that are up to aRadius pixels away from it (horizontally).
369 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
370 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
371 uint8_t* aDestData, int32_t aDestStride,
372 const IntRect& aDestRect, int32_t aRadius)
373 {
374 static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
375 op == MORPHOLOGY_OPERATOR_DILATE,
376 "unexpected morphology operator");
378 int32_t kernelSize = aRadius + 1 + aRadius;
379 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
380 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
381 int32_t completeKernelSizeForFourPixels = kernelSize + 3;
382 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
383 completeKernelSizeForFourPixels % 4 == 2);
385 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
386 // the way we need them to be.
388 IntRect sourceRect = aDestRect;
389 sourceRect.Inflate(aRadius, 0);
391 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) {
392 int32_t kernelStartX = aDestRect.x - aRadius;
393 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) {
394 // We process four pixels (16 color values) at a time.
395 // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
396 // source values can be read beyond that because the source is extended
397 // by aRadius pixels.
399 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
400 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
401 u8x16_t m1234 = p1234;
403 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
404 u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ?
405 simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) :
406 simd::FromZero8<u8x16_t>();
407 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
408 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
409 m1234 = Morph8<op,u8x16_t>(m1234, p2345);
410 m1234 = Morph8<op,u8x16_t>(m1234, p3456);
411 if (i + 2 < completeKernelSizeForFourPixels) {
412 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
413 m1234 = Morph8<op,u8x16_t>(m1234, p4567);
414 m1234 = Morph8<op,u8x16_t>(m1234, p5678);
415 }
416 p1234 = p5678;
417 }
419 int32_t destIndex = y * aDestStride + 4 * x;
420 simd::Store8(&aDestData[destIndex], m1234);
421 }
422 }
423 }
425 template<typename i16x8_t, typename u8x16_t>
426 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
427 uint8_t* aDestData, int32_t aDestStride,
428 const IntRect& aDestRect, int32_t aRadius,
429 MorphologyOperator aOp)
430 {
431 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
432 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
433 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
434 } else {
435 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
436 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
437 }
438 }
440 // Set every pixel to the per-component minimum or maximum of the pixels around
441 // it that are up to aRadius pixels away from it (vertically).
442 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t>
443 static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
444 uint8_t* aDestData, int32_t aDestStride,
445 const IntRect& aDestRect, int32_t aRadius)
446 {
447 static_assert(op == MORPHOLOGY_OPERATOR_ERODE ||
448 op == MORPHOLOGY_OPERATOR_DILATE,
449 "unexpected morphology operator");
451 int32_t startY = aDestRect.y - aRadius;
452 int32_t endY = aDestRect.y + aRadius;
453 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) {
454 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) {
455 int32_t sourceIndex = startY * aSourceStride + 4 * x;
456 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
457 sourceIndex += aSourceStride;
458 for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) {
459 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
460 u = Morph8<op,u8x16_t>(u, u2);
461 }
463 int32_t destIndex = y * aDestStride + 4 * x;
464 simd::Store8(&aDestData[destIndex], u);
465 }
466 }
467 }
469 template<typename i16x8_t, typename u8x16_t>
470 inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride,
471 uint8_t* aDestData, int32_t aDestStride,
472 const IntRect& aDestRect, int32_t aRadius,
473 MorphologyOperator aOp)
474 {
475 if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
476 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>(
477 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
478 } else {
479 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>(
480 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
481 }
482 }
484 template<typename i32x4_t, typename i16x8_t>
485 static i32x4_t
486 ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias)
487 {
488 // int16_t p[8] == { b, g, r, a, b, g, r, a }.
489 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
490 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
491 // int32_t bias[4] == { _B, _G, _R, _A }.
493 i32x4_t sum = bias;
495 // int16_t bg[8] = { b, g, b, g, b, g, b, g };
496 i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p));
497 // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
498 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
499 sum = simd::Add32(sum, prodsum_bg);
501 // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
502 i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p));
503 // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
504 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
505 sum = simd::Add32(sum, prodsum_ra);
507 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
508 return sum;
509 }
511 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
512 static TemporaryRef<DataSourceSurface>
513 ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix)
514 {
515 IntSize size = aInput->GetSize();
516 RefPtr<DataSourceSurface> target =
517 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
518 if (!target) {
519 return nullptr;
520 }
522 uint8_t* sourceData = aInput->GetData();
523 uint8_t* targetData = target->GetData();
524 int32_t sourceStride = aInput->Stride();
525 int32_t targetStride = target->Stride();
527 const int16_t factor = 128;
528 const Float floatElementMax = INT16_MAX / factor; // 255
529 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale");
531 const Float *floats = &aMatrix._11;
533 ptrdiff_t componentOffsets[4] = {
534 B8G8R8A8_COMPONENT_BYTEOFFSET_R,
535 B8G8R8A8_COMPONENT_BYTEOFFSET_G,
536 B8G8R8A8_COMPONENT_BYTEOFFSET_B,
537 B8G8R8A8_COMPONENT_BYTEOFFSET_A
538 };
540 // We store the color matrix in rows_bgra in the following format:
541 // { bB, bG, bR, bA, gB, gG, gR, gA }.
542 // { bB, gB, bG, gG, bR, gR, bA, gA }
543 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
544 // which works especially well for our use case.
545 int16_t rows_bgra[2][8];
546 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
547 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
548 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
549 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax);
550 int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5);
551 int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
552 int8_t g_or_a = componentOffsets[rowIndex] % 2;
553 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
554 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement;
555 }
556 }
558 int32_t rowBias[4];
559 Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255);
560 for (size_t colIndex = 0; colIndex < 4; colIndex++) {
561 size_t rowIndex = 4;
562 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
563 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax);
564 int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5);
565 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
566 }
568 i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
569 rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
570 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
572 i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
573 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
574 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
576 i32x4_t rowsBias_v =
577 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
579 for (int32_t y = 0; y < size.height; y++) {
580 for (int32_t x = 0; x < size.width; x += 4) {
581 MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position");
582 MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position");
583 int32_t sourceIndex = y * sourceStride + 4 * x;
584 int32_t targetIndex = y * targetStride + 4 * x;
586 // We load 4 pixels, unpack them, process them 1 pixel at a time, and
587 // finally pack and store the 4 result pixels.
589 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
591 // Splat needed to get each pixel twice into i16x8
592 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
593 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
594 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
595 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
597 i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
598 i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
599 i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
600 i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
602 static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor.");
603 u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1),
604 simd::ShiftRight32<7>(result_p2),
605 simd::ShiftRight32<7>(result_p3),
606 simd::ShiftRight32<7>(result_p4));
607 simd::Store8(&targetData[targetIndex], result_p1234);
608 }
609 }
611 return target;
612 }
614 // source / dest: bgra bgra
615 // sourceAlpha / destAlpha: aaaa aaaa
616 // result: bgra bgra
617 template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
618 static inline u16x8_t
619 CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha)
620 {
621 u16x8_t x255 = simd::FromU16<u16x8_t>(255);
623 switch (aCompositeOperator) {
625 case COMPOSITE_OPERATOR_OVER:
626 {
627 // val = dest * (255 - sourceAlpha) + source * 255;
628 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
630 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
631 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
632 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
634 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
635 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
636 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
638 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
639 simd::FastDivideBy255(result2));
640 }
642 case COMPOSITE_OPERATOR_IN:
643 {
644 // val = source * destAlpha;
645 return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
646 }
648 case COMPOSITE_OPERATOR_OUT:
649 {
650 // val = source * (255 - destAlpha);
651 u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
652 return simd::FastDivideBy255_16(prod);
653 }
655 case COMPOSITE_OPERATOR_ATOP:
656 {
657 // val = dest * (255 - sourceAlpha) + source * destAlpha;
658 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
660 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
661 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
662 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
664 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
665 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
666 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
668 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
669 simd::FastDivideBy255(result2));
670 }
672 case COMPOSITE_OPERATOR_XOR:
673 {
674 // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
675 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
676 u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
678 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
679 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
680 twoFiftyFiveMinusDestAlpha);
681 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
683 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
684 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
685 twoFiftyFiveMinusDestAlpha);
686 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
688 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
689 simd::FastDivideBy255(result2));
690 }
692 default:
693 return simd::FromU16<u16x8_t>(0);
695 }
696 }
698 template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
699 static void
700 ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest)
701 {
702 IntSize size = aDest->GetSize();
704 uint8_t* sourceData = aSource->GetData();
705 uint8_t* destData = aDest->GetData();
706 uint32_t sourceStride = aSource->Stride();
707 uint32_t destStride = aDest->Stride();
709 for (int32_t y = 0; y < size.height; y++) {
710 for (int32_t x = 0; x < size.width; x += 4) {
711 uint32_t sourceIndex = y * sourceStride + 4 * x;
712 uint32_t destIndex = y * destStride + 4 * x;
714 u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
715 u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
717 u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
718 u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
719 u16x8_t sa12 = simd::Splat16<3,3>(s12);
720 u16x8_t da12 = simd::Splat16<3,3>(d12);
721 u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12);
723 u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
724 u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
725 u16x8_t sa34 = simd::Splat16<3,3>(s34);
726 u16x8_t da34 = simd::Splat16<3,3>(d34);
727 u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34);
729 u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
730 simd::Store8(&destData[destIndex], result1234);
731 }
732 }
733 }
735 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
736 static void
737 ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest,
738 CompositeOperator aOperator)
739 {
740 switch (aOperator) {
741 case COMPOSITE_OPERATOR_OVER:
742 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest);
743 break;
744 case COMPOSITE_OPERATOR_IN:
745 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest);
746 break;
747 case COMPOSITE_OPERATOR_OUT:
748 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest);
749 break;
750 case COMPOSITE_OPERATOR_ATOP:
751 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest);
752 break;
753 case COMPOSITE_OPERATOR_XOR:
754 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest);
755 break;
756 default:
757 MOZ_CRASH();
758 }
759 }
761 template<typename u8x16_t>
762 static void
763 SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride,
764 uint8_t* channel0Data, uint8_t* channel1Data,
765 uint8_t* channel2Data, uint8_t* channel3Data,
766 int32_t channelStride)
767 {
768 for (int32_t y = 0; y < size.height; y++) {
769 for (int32_t x = 0; x < size.width; x += 16) {
770 // Process 16 pixels at a time.
771 int32_t sourceIndex = y * sourceStride + 4 * x;
772 int32_t targetIndex = y * channelStride + x;
774 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>();
775 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
776 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
777 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
779 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
780 if (4 * (x + 4) < sourceStride) {
781 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
782 }
783 if (4 * (x + 8) < sourceStride) {
784 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
785 }
786 if (4 * (x + 12) < sourceStride) {
787 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
788 }
790 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
791 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
792 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
793 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
794 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
795 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
796 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
797 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
798 u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
799 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
800 u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
801 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
802 u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
803 u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
804 u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
805 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
807 simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
808 simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
809 simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
810 simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
811 }
812 }
813 }
815 template<typename u8x16_t>
816 static void
817 CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data)
818 {
819 for (int32_t y = 0; y < size.height; y++) {
820 for (int32_t x = 0; x < size.width; x += 16) {
821 // Process 16 pixels at a time.
822 int32_t resultIndex = y * resultStride + 4 * x;
823 int32_t channelIndex = y * channelStride + x;
825 u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
826 u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
827 u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
828 u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
830 u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
831 u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
832 u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
833 u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
835 u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
836 u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
837 u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
838 u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
840 simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
841 if (4 * (x + 4) < resultStride) {
842 simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
843 }
844 if (4 * (x + 8) < resultStride) {
845 simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
846 }
847 if (4 * (x + 12) < resultStride) {
848 simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
849 }
850 }
851 }
852 }
855 template<typename i32x4_t, typename u16x8_t, typename u8x16_t>
856 static void
857 DoPremultiplicationCalculation_SIMD(const IntSize& aSize,
858 uint8_t* aTargetData, int32_t aTargetStride,
859 uint8_t* aSourceData, int32_t aSourceStride)
860 {
861 const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff);
862 for (int32_t y = 0; y < aSize.height; y++) {
863 for (int32_t x = 0; x < aSize.width; x += 4) {
864 int32_t inputIndex = y * aSourceStride + 4 * x;
865 int32_t targetIndex = y * aTargetStride + 4 * x;
867 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
868 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
869 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
871 // Multiply all components with alpha.
872 p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12));
873 p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34));
875 // Divide by 255 and pack.
876 u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12),
877 simd::FastDivideBy255_16(p34));
879 // Get the original alpha channel value back from p1234.
880 result = simd::Pick(alphaMask, result, p1234);
882 simd::Store8(&aTargetData[targetIndex], result);
883 }
884 }
885 }
// Lookup table of precomputed factors used for unpremultiplying.
//
// The goal is to evaluate round(r / (alpha / 255.0f)) for any r and alpha in
// constant time. The factors are chosen such that
//   (r * sAlphaFactors[alpha] + 128) >> 8
// roughly gives that result (with a maximum deviation of 1), where
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha)
// and the entry for alpha == 0 is 0.
//
// The values were generated with this Python expression:
// ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
//
// Layout: eight entries per row; the marker at the start of each row is the
// alpha value (index) of the row's first entry.
static const uint16_t sAlphaFactors[256] = {
  /*   0 */     0, 65280, 32640, 21760, 16320, 13056, 10880,  9326,
  /*   8 */  8160,  7253,  6528,  5935,  5440,  5022,  4663,  4352,
  /*  16 */  4080,  3840,  3627,  3436,  3264,  3109,  2967,  2838,
  /*  24 */  2720,  2611,  2511,  2418,  2331,  2251,  2176,  2106,
  /*  32 */  2040,  1978,  1920,  1865,  1813,  1764,  1718,  1674,
  /*  40 */  1632,  1592,  1554,  1518,  1484,  1451,  1419,  1389,
  /*  48 */  1360,  1332,  1306,  1280,  1255,  1232,  1209,  1187,
  /*  56 */  1166,  1145,  1126,  1106,  1088,  1070,  1053,  1036,
  /*  64 */  1020,  1004,   989,   974,   960,   946,   933,   919,
  /*  72 */   907,   894,   882,   870,   859,   848,   837,   826,
  /*  80 */   816,   806,   796,   787,   777,   768,   759,   750,
  /*  88 */   742,   733,   725,   717,   710,   702,   694,   687,
  /*  96 */   680,   673,   666,   659,   653,   646,   640,   634,
  /* 104 */   628,   622,   616,   610,   604,   599,   593,   588,
  /* 112 */   583,   578,   573,   568,   563,   558,   553,   549,
  /* 120 */   544,   540,   535,   531,   526,   522,   518,   514,
  /* 128 */   510,   506,   502,   498,   495,   491,   487,   484,
  /* 136 */   480,   476,   473,   470,   466,   463,   460,   457,
  /* 144 */   453,   450,   447,   444,   441,   438,   435,   432,
  /* 152 */   429,   427,   424,   421,   418,   416,   413,   411,
  /* 160 */   408,   405,   403,   400,   398,   396,   393,   391,
  /* 168 */   389,   386,   384,   382,   380,   377,   375,   373,
  /* 176 */   371,   369,   367,   365,   363,   361,   359,   357,
  /* 184 */   355,   353,   351,   349,   347,   345,   344,   342,
  /* 192 */   340,   338,   336,   335,   333,   331,   330,   328,
  /* 200 */   326,   325,   323,   322,   320,   318,   317,   315,
  /* 208 */   314,   312,   311,   309,   308,   306,   305,   304,
  /* 216 */   302,   301,   299,   298,   297,   295,   294,   293,
  /* 224 */   291,   290,   289,   288,   286,   285,   284,   283,
  /* 232 */   281,   280,   279,   278,   277,   275,   274,   273,
  /* 240 */   272,   271,   270,   269,   268,   266,   265,   264,
  /* 248 */   263,   262,   261,   260,   259,   258,   257,   256
};
918 template<typename u16x8_t, typename u8x16_t>
919 static void
920 DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize,
921 uint8_t* aTargetData, int32_t aTargetStride,
922 uint8_t* aSourceData, int32_t aSourceStride)
923 {
924 for (int32_t y = 0; y < aSize.height; y++) {
925 for (int32_t x = 0; x < aSize.width; x += 4) {
926 int32_t inputIndex = y * aSourceStride + 4 * x;
927 int32_t targetIndex = y * aTargetStride + 4 * x;
928 union {
929 u8x16_t p1234;
930 uint8_t u8[4][4];
931 };
932 p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
934 // Prepare the alpha factors.
935 uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
936 uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
937 uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
938 uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]];
939 u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8);
940 u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8);
942 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
943 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
945 // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits.
946 p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128)));
947 p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128)));
949 u8x16_t result = simd::PackAndSaturate16To8(p12, p34);
950 simd::Store8(&aTargetData[targetIndex], result);
951 }
952 }
953 }
955 template<typename f32x4_t, typename i32x4_t, typename u8x16_t>
956 static TemporaryRef<DataSourceSurface>
957 RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency,
958 int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect)
959 {
960 #define RETURN_TURBULENCE(Type, Stitch) \
961 SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \
962 renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \
963 return renderer.Render(aSize, aOffset);
965 switch (aType) {
966 case TURBULENCE_TYPE_TURBULENCE:
967 {
968 if (aStitch) {
969 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
970 }
971 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
972 }
973 case TURBULENCE_TYPE_FRACTAL_NOISE:
974 {
975 if (aStitch) {
976 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
977 }
978 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
979 }
980 }
981 return nullptr;
982 #undef RETURN_TURBULENCE
983 }
985 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
986 template<typename i32x4_t, typename i16x8_t>
987 static MOZ_ALWAYS_INLINE i16x8_t
988 ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2,
989 const i16x8_t &k1And4, const i16x8_t &k2And3)
990 {
991 // Calculate input product: inProd = (in1 * in2) / 255.
992 i32x4_t inProd_1, inProd_2;
993 simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
994 i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2));
996 // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
997 i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
998 i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight);
999 i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight);
1000 i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
1001 i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
1003 // Calculate k2 * in1 + k3 * in2
1004 i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
1005 i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
1006 i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
1007 i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
1009 // Sum everything up and truncate the fractional part.
1010 i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
1011 i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
1012 return simd::PackAndSaturate32To16(result_1, result_2);
1013 }
1015 template<typename i32x4_t, typename i16x8_t, typename u8x16_t>
1016 static TemporaryRef<DataSourceSurface>
1017 ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2,
1018 Float aK1, Float aK2, Float aK3, Float aK4)
1019 {
1020 IntSize size = aInput1->GetSize();
1021 RefPtr<DataSourceSurface> target =
1022 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
1023 if (!target) {
1024 return nullptr;
1025 }
1027 uint8_t* source1Data = aInput1->GetData();
1028 uint8_t* source2Data = aInput2->GetData();
1029 uint8_t* targetData = target->GetData();
1030 uint32_t source1Stride = aInput1->Stride();
1031 uint32_t source2Stride = aInput2->Stride();
1032 uint32_t targetStride = target->Stride();
1034 // The arithmetic combine filter does the following calculation:
1035 // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
1036 //
1037 // Or, with in1/2 integers between 0 and 255:
1038 // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
1039 //
1040 // We want the whole calculation to happen in integer, with 16-bit factors.
1041 // So we convert our factors to fixed-point with precision 1.8.7.
1042 // K4 is premultiplied with 255, and it will be multiplied with 128 later
1043 // during the actual calculation, because premultiplying it with 255 * 128
1044 // would overflow int16.
1046 i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f)));
1047 i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f)));
1048 i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f)));
1049 i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f)));
1051 i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
1052 i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
1054 for (int32_t y = 0; y < size.height; y++) {
1055 for (int32_t x = 0; x < size.width; x += 4) {
1056 uint32_t source1Index = y * source1Stride + 4 * x;
1057 uint32_t source2Index = y * source2Stride + 4 * x;
1058 uint32_t targetIndex = y * targetStride + 4 * x;
1060 // Load and unpack.
1061 u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
1062 u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
1063 i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
1064 i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
1065 i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
1066 i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
1068 // Multiply and add.
1069 i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3);
1070 i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3);
1072 // Pack and store.
1073 simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34));
1074 }
1075 }
1077 return target;
1078 }
} // namespace gfx
} // namespace mozilla