|
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- |
|
2 * This Source Code Form is subject to the terms of the Mozilla Public |
|
3 * License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
|
5 |
|
6 #include "FilterProcessing.h" |
|
7 |
|
8 #include "SIMD.h" |
|
9 #include "SVGTurbulenceRenderer-inl.h" |
|
10 |
|
11 namespace mozilla { |
|
12 namespace gfx { |
|
13 |
|
14 template<typename u8x16_t> |
|
15 inline TemporaryRef<DataSourceSurface> |
|
16 ConvertToB8G8R8A8_SIMD(SourceSurface* aSurface) |
|
17 { |
|
18 IntSize size = aSurface->GetSize(); |
|
19 RefPtr<DataSourceSurface> input = aSurface->GetDataSurface(); |
|
20 RefPtr<DataSourceSurface> output = |
|
21 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
|
22 uint8_t *inputData = input->GetData(); |
|
23 uint8_t *outputData = output->GetData(); |
|
24 int32_t inputStride = input->Stride(); |
|
25 int32_t outputStride = output->Stride(); |
|
26 switch (input->GetFormat()) { |
|
27 case SurfaceFormat::B8G8R8A8: |
|
28 output = input; |
|
29 break; |
|
30 case SurfaceFormat::B8G8R8X8: |
|
31 for (int32_t y = 0; y < size.height; y++) { |
|
32 for (int32_t x = 0; x < size.width; x++) { |
|
33 int32_t inputIndex = y * inputStride + 4 * x; |
|
34 int32_t outputIndex = y * outputStride + 4 * x; |
|
35 outputData[outputIndex + 0] = inputData[inputIndex + 0]; |
|
36 outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
|
37 outputData[outputIndex + 2] = inputData[inputIndex + 2]; |
|
38 outputData[outputIndex + 3] = 255; |
|
39 } |
|
40 } |
|
41 break; |
|
42 case SurfaceFormat::R8G8B8A8: |
|
43 for (int32_t y = 0; y < size.height; y++) { |
|
44 for (int32_t x = 0; x < size.width; x++) { |
|
45 int32_t inputIndex = y * inputStride + 4 * x; |
|
46 int32_t outputIndex = y * outputStride + 4 * x; |
|
47 outputData[outputIndex + 2] = inputData[inputIndex + 0]; |
|
48 outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
|
49 outputData[outputIndex + 0] = inputData[inputIndex + 2]; |
|
50 outputData[outputIndex + 3] = inputData[inputIndex + 3]; |
|
51 } |
|
52 } |
|
53 break; |
|
54 case SurfaceFormat::R8G8B8X8: |
|
55 for (int32_t y = 0; y < size.height; y++) { |
|
56 for (int32_t x = 0; x < size.width; x++) { |
|
57 int32_t inputIndex = y * inputStride + 4 * x; |
|
58 int32_t outputIndex = y * outputStride + 4 * x; |
|
59 outputData[outputIndex + 2] = inputData[inputIndex + 0]; |
|
60 outputData[outputIndex + 1] = inputData[inputIndex + 1]; |
|
61 outputData[outputIndex + 0] = inputData[inputIndex + 2]; |
|
62 outputData[outputIndex + 3] = 255; |
|
63 } |
|
64 } |
|
65 break; |
|
66 case SurfaceFormat::A8: |
|
67 for (int32_t y = 0; y < size.height; y++) { |
|
68 for (int32_t x = 0; x < size.width; x += 16) { |
|
69 int32_t inputIndex = y * inputStride + x; |
|
70 int32_t outputIndex = y * outputStride + 4 * x; |
|
71 u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]); |
|
72 // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by |
|
73 // interleaving with 0000000000000000 twice. |
|
74 u8x16_t zero = simd::FromZero8<u8x16_t>(); |
|
75 u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16); |
|
76 u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16); |
|
77 u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8); |
|
78 u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8); |
|
79 u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16); |
|
80 u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16); |
|
81 simd::Store8(&outputData[outputIndex], p1To4); |
|
82 if ((x + 4) * 4 < outputStride) { |
|
83 simd::Store8(&outputData[outputIndex + 4 * 4], p5To8); |
|
84 } |
|
85 if ((x + 8) * 4 < outputStride) { |
|
86 simd::Store8(&outputData[outputIndex + 4 * 8], p9To12); |
|
87 } |
|
88 if ((x + 12) * 4 < outputStride) { |
|
89 simd::Store8(&outputData[outputIndex + 4 * 12], p13To16); |
|
90 } |
|
91 } |
|
92 } |
|
93 break; |
|
94 default: |
|
95 output = nullptr; |
|
96 break; |
|
97 } |
|
98 return output; |
|
99 } |
|
100 |
|
101 template<typename u8x16_t> |
|
102 inline void |
|
103 ExtractAlpha_SIMD(const IntSize& size, uint8_t* sourceData, int32_t sourceStride, uint8_t* alphaData, int32_t alphaStride) |
|
104 { |
|
105 for (int32_t y = 0; y < size.height; y++) { |
|
106 for (int32_t x = 0; x < size.width; x += 16) { |
|
107 // Process 16 pixels at a time. |
|
108 // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of AAAAAAAAAAAAAAAA. |
|
109 int32_t sourceIndex = y * sourceStride + 4 * x; |
|
110 int32_t targetIndex = y * alphaStride + x; |
|
111 |
|
112 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>(); |
|
113 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); |
|
114 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); |
|
115 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); |
|
116 |
|
117 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); |
|
118 if (4 * (x + 4) < sourceStride) { |
|
119 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); |
|
120 } |
|
121 if (4 * (x + 8) < sourceStride) { |
|
122 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); |
|
123 } |
|
124 if (4 * (x + 12) < sourceStride) { |
|
125 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); |
|
126 } |
|
127 |
|
128 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
|
129 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
|
130 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
|
131 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
|
132 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
|
133 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
|
134 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
|
135 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
|
136 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); |
|
137 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); |
|
138 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); |
|
139 |
|
140 simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa); |
|
141 } |
|
142 } |
|
143 } |
|
144 |
|
145 // This function calculates the result color values for four pixels, but for |
|
146 // only two color channels - either b & r or g & a. However, the a result will |
|
147 // not be used. |
|
148 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa. |
|
149 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the |
|
150 // alpha of all four pixels (and both aaaa's are the same). |
|
151 // blendendComponent1 and blendedComponent2 are the out parameters. |
|
152 template<typename i16x8_t, typename i32x4_t, uint32_t aBlendMode> |
|
153 inline void |
|
154 BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha, |
|
155 i16x8_t dest, const i16x8_t& destAlpha, |
|
156 i32x4_t& blendedComponent1, i32x4_t& blendedComponent2) |
|
157 { |
|
158 i16x8_t x255 = simd::FromI16<i16x8_t>(255); |
|
159 |
|
160 switch (aBlendMode) { |
|
161 |
|
162 case BLEND_MODE_MULTIPLY: |
|
163 { |
|
164 // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) * dest); |
|
165 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); |
|
166 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
|
167 i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource = simd::Add16(twoFiftyFiveMinusSourceAlpha, source); |
|
168 |
|
169 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); |
|
170 i16x8_t leftFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); |
|
171 blendedComponent1 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1); |
|
172 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
|
173 |
|
174 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); |
|
175 i16x8_t leftFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource); |
|
176 blendedComponent2 = simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2); |
|
177 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
|
178 |
|
179 break; |
|
180 } |
|
181 |
|
182 case BLEND_MODE_SCREEN: |
|
183 { |
|
184 // val = 255 * (source + dest) + (0 - dest) * source; |
|
185 i16x8_t sourcePlusDest = simd::Add16(source, dest); |
|
186 i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest); |
|
187 |
|
188 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 = simd::InterleaveLo16(x255, zeroMinusDest); |
|
189 i16x8_t sourcePlusDestInterleavedWithSource1 = simd::InterleaveLo16(sourcePlusDest, source); |
|
190 blendedComponent1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1, sourcePlusDestInterleavedWithSource1); |
|
191 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
|
192 |
|
193 i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 = simd::InterleaveHi16(x255, zeroMinusDest); |
|
194 i16x8_t sourcePlusDestInterleavedWithSource2 = simd::InterleaveHi16(sourcePlusDest, source); |
|
195 blendedComponent2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2, sourcePlusDestInterleavedWithSource2); |
|
196 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
|
197 |
|
198 break; |
|
199 } |
|
200 |
|
201 case BLEND_MODE_DARKEN: |
|
202 case BLEND_MODE_LIGHTEN: |
|
203 { |
|
204 // Darken: |
|
205 // val = min((255 - destAlpha) * source + 255 * dest, |
|
206 // 255 * source + (255 - sourceAlpha) * dest); |
|
207 // |
|
208 // Lighten: |
|
209 // val = max((255 - destAlpha) * source + 255 * dest, |
|
210 // 255 * source + (255 - sourceAlpha) * dest); |
|
211 |
|
212 i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); |
|
213 i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
|
214 |
|
215 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 = simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255); |
|
216 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 = simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha); |
|
217 i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest); |
|
218 i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1, sourceInterleavedWithDest1); |
|
219 i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1, sourceInterleavedWithDest1); |
|
220 blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product1_1, product1_2) : simd::Max32(product1_1, product1_2); |
|
221 blendedComponent1 = simd::FastDivideBy255(blendedComponent1); |
|
222 |
|
223 i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 = simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255); |
|
224 i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 = simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha); |
|
225 i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest); |
|
226 i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2, sourceInterleavedWithDest2); |
|
227 i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2, sourceInterleavedWithDest2); |
|
228 blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN ? simd::Min32(product2_1, product2_2) : simd::Max32(product2_1, product2_2); |
|
229 blendedComponent2 = simd::FastDivideBy255(blendedComponent2); |
|
230 |
|
231 break; |
|
232 } |
|
233 |
|
234 } |
|
235 } |
|
236 |
|
237 // The alpha channel is subject to a different calculation than the RGB |
|
238 // channels, and this calculation is the same for all blend modes: |
|
239 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha) |
|
240 template<typename i16x8_t, typename i32x4_t> |
|
241 inline i32x4_t |
|
242 BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234, i16x8_t d_rrrraaaa1234) |
|
243 { |
|
244 // We're using MulAdd16x8x2To32x4, so we need to interleave our factors |
|
245 // appropriately. The calculation is rewritten as follows: |
|
246 // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0]) |
|
247 // = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) |
|
248 // = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255) |
|
249 // = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0] |
|
250 i16x8_t zeroInterleavedWithSourceAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234); |
|
251 i16x8_t fiveTenInterleavedWithDestAlpha = simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234); |
|
252 i16x8_t f1 = simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha); |
|
253 i16x8_t f2 = simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255)); |
|
254 return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2)); |
|
255 } |
|
256 |
|
257 template<typename u8x16_t, typename i16x8_t> |
|
258 inline void |
|
259 UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234, |
|
260 i16x8_t& bbbbgggg1234, i16x8_t& rrrraaaa1234) |
|
261 { |
|
262 // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234 |
|
263 i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234); |
|
264 i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234); |
|
265 i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34); |
|
266 i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34); |
|
267 bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24); |
|
268 rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24); |
|
269 } |
|
270 |
|
271 template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
|
272 inline u8x16_t |
|
273 ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234, |
|
274 i32x4_t rrrr1234, const i32x4_t& aaaa1234) |
|
275 { |
|
276 // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234 |
|
277 i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234); |
|
278 i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234); |
|
279 i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234); |
|
280 i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234); |
|
281 i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234); |
|
282 i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234); |
|
283 return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34); |
|
284 } |
|
285 |
|
286 template<typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode> |
|
287 inline TemporaryRef<DataSourceSurface> |
|
288 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2) |
|
289 { |
|
290 IntSize size = aInput1->GetSize(); |
|
291 RefPtr<DataSourceSurface> target = |
|
292 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
|
293 if (!target) { |
|
294 return nullptr; |
|
295 } |
|
296 |
|
297 uint8_t* source1Data = aInput1->GetData(); |
|
298 uint8_t* source2Data = aInput2->GetData(); |
|
299 uint8_t* targetData = target->GetData(); |
|
300 int32_t targetStride = target->Stride(); |
|
301 int32_t source1Stride = aInput1->Stride(); |
|
302 int32_t source2Stride = aInput2->Stride(); |
|
303 |
|
304 for (int32_t y = 0; y < size.height; y++) { |
|
305 for (int32_t x = 0; x < size.width; x += 4) { |
|
306 int32_t targetIndex = y * targetStride + 4 * x; |
|
307 int32_t source1Index = y * source1Stride + 4 * x; |
|
308 int32_t source2Index = y * source2Stride + 4 * x; |
|
309 |
|
310 u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]); |
|
311 u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]); |
|
312 |
|
313 // The blending calculation for the RGB channels all need access to the |
|
314 // alpha channel of their pixel, and the alpha calculation is different, |
|
315 // so it makes sense to separate by channel. |
|
316 |
|
317 i16x8_t s_bbbbgggg1234, s_rrrraaaa1234; |
|
318 i16x8_t d_bbbbgggg1234, d_rrrraaaa1234; |
|
319 UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234); |
|
320 UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234); |
|
321 i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(s_rrrraaaa1234); |
|
322 i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3,2,3,2>(d_rrrraaaa1234); |
|
323 |
|
324 // We only use blendedB, blendedG and blendedR. |
|
325 i32x4_t blendedB, blendedG, blendedR, blendedA; |
|
326 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234, blendedB, blendedG); |
|
327 BlendTwoComponentsOfFourPixels<i16x8_t,i32x4_t,mode>(s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234, blendedR, blendedA); |
|
328 |
|
329 // Throw away blendedA and overwrite it with the correct blended alpha. |
|
330 blendedA = BlendAlphaOfFourPixels<i16x8_t,i32x4_t>(s_rrrraaaa1234, d_rrrraaaa1234); |
|
331 |
|
332 u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t,i16x8_t,u8x16_t>(blendedB, blendedG, blendedR, blendedA); |
|
333 simd::Store8(&targetData[targetIndex], result1234); |
|
334 } |
|
335 } |
|
336 |
|
337 return target; |
|
338 } |
|
339 |
|
340 template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
|
341 static TemporaryRef<DataSourceSurface> |
|
342 ApplyBlending_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, |
|
343 BlendMode aBlendMode) |
|
344 { |
|
345 switch (aBlendMode) { |
|
346 case BLEND_MODE_MULTIPLY: |
|
347 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_MULTIPLY>(aInput1, aInput2); |
|
348 case BLEND_MODE_SCREEN: |
|
349 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_SCREEN>(aInput1, aInput2); |
|
350 case BLEND_MODE_DARKEN: |
|
351 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_DARKEN>(aInput1, aInput2); |
|
352 case BLEND_MODE_LIGHTEN: |
|
353 return ApplyBlending_SIMD<i32x4_t,i16x8_t,u8x16_t, BLEND_MODE_LIGHTEN>(aInput1, aInput2); |
|
354 default: |
|
355 return nullptr; |
|
356 } |
|
357 } |
|
358 |
|
359 template<MorphologyOperator Operator, typename u8x16_t> |
|
360 static u8x16_t |
|
361 Morph8(u8x16_t a, u8x16_t b) |
|
362 { |
|
363 return Operator == MORPHOLOGY_OPERATOR_ERODE ? |
|
364 simd::Min8(a, b) : simd::Max8(a, b); |
|
365 } |
|
366 |
|
367 // Set every pixel to the per-component minimum or maximum of the pixels around |
|
368 // it that are up to aRadius pixels away from it (horizontally). |
|
369 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> |
|
370 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
|
371 uint8_t* aDestData, int32_t aDestStride, |
|
372 const IntRect& aDestRect, int32_t aRadius) |
|
373 { |
|
374 static_assert(op == MORPHOLOGY_OPERATOR_ERODE || |
|
375 op == MORPHOLOGY_OPERATOR_DILATE, |
|
376 "unexpected morphology operator"); |
|
377 |
|
378 int32_t kernelSize = aRadius + 1 + aRadius; |
|
379 MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0"); |
|
380 MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3); |
|
381 int32_t completeKernelSizeForFourPixels = kernelSize + 3; |
|
382 MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 || |
|
383 completeKernelSizeForFourPixels % 4 == 2); |
|
384 |
|
385 // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just |
|
386 // the way we need them to be. |
|
387 |
|
388 IntRect sourceRect = aDestRect; |
|
389 sourceRect.Inflate(aRadius, 0); |
|
390 |
|
391 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++) { |
|
392 int32_t kernelStartX = aDestRect.x - aRadius; |
|
393 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4, kernelStartX += 4) { |
|
394 // We process four pixels (16 color values) at a time. |
|
395 // aSourceData[0] points to the pixel located at aDestRect.TopLeft(); |
|
396 // source values can be read beyond that because the source is extended |
|
397 // by aRadius pixels. |
|
398 |
|
399 int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX; |
|
400 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
|
401 u8x16_t m1234 = p1234; |
|
402 |
|
403 for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) { |
|
404 u8x16_t p5678 = (kernelStartX + i < sourceRect.XMost()) ? |
|
405 simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i]) : |
|
406 simd::FromZero8<u8x16_t>(); |
|
407 u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678); |
|
408 u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678); |
|
409 m1234 = Morph8<op,u8x16_t>(m1234, p2345); |
|
410 m1234 = Morph8<op,u8x16_t>(m1234, p3456); |
|
411 if (i + 2 < completeKernelSizeForFourPixels) { |
|
412 u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678); |
|
413 m1234 = Morph8<op,u8x16_t>(m1234, p4567); |
|
414 m1234 = Morph8<op,u8x16_t>(m1234, p5678); |
|
415 } |
|
416 p1234 = p5678; |
|
417 } |
|
418 |
|
419 int32_t destIndex = y * aDestStride + 4 * x; |
|
420 simd::Store8(&aDestData[destIndex], m1234); |
|
421 } |
|
422 } |
|
423 } |
|
424 |
|
425 template<typename i16x8_t, typename u8x16_t> |
|
426 inline void ApplyMorphologyHorizontal_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
|
427 uint8_t* aDestData, int32_t aDestStride, |
|
428 const IntRect& aDestRect, int32_t aRadius, |
|
429 MorphologyOperator aOp) |
|
430 { |
|
431 if (aOp == MORPHOLOGY_OPERATOR_ERODE) { |
|
432 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( |
|
433 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
|
434 } else { |
|
435 ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( |
|
436 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
|
437 } |
|
438 } |
|
439 |
|
440 // Set every pixel to the per-component minimum or maximum of the pixels around |
|
441 // it that are up to aRadius pixels away from it (vertically). |
|
442 template<MorphologyOperator op, typename i16x8_t, typename u8x16_t> |
|
443 static void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
|
444 uint8_t* aDestData, int32_t aDestStride, |
|
445 const IntRect& aDestRect, int32_t aRadius) |
|
446 { |
|
447 static_assert(op == MORPHOLOGY_OPERATOR_ERODE || |
|
448 op == MORPHOLOGY_OPERATOR_DILATE, |
|
449 "unexpected morphology operator"); |
|
450 |
|
451 int32_t startY = aDestRect.y - aRadius; |
|
452 int32_t endY = aDestRect.y + aRadius; |
|
453 for (int32_t y = aDestRect.y; y < aDestRect.YMost(); y++, startY++, endY++) { |
|
454 for (int32_t x = aDestRect.x; x < aDestRect.XMost(); x += 4) { |
|
455 int32_t sourceIndex = startY * aSourceStride + 4 * x; |
|
456 u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
|
457 sourceIndex += aSourceStride; |
|
458 for (int32_t iy = startY + 1; iy <= endY; iy++, sourceIndex += aSourceStride) { |
|
459 u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]); |
|
460 u = Morph8<op,u8x16_t>(u, u2); |
|
461 } |
|
462 |
|
463 int32_t destIndex = y * aDestStride + 4 * x; |
|
464 simd::Store8(&aDestData[destIndex], u); |
|
465 } |
|
466 } |
|
467 } |
|
468 |
|
469 template<typename i16x8_t, typename u8x16_t> |
|
470 inline void ApplyMorphologyVertical_SIMD(uint8_t* aSourceData, int32_t aSourceStride, |
|
471 uint8_t* aDestData, int32_t aDestStride, |
|
472 const IntRect& aDestRect, int32_t aRadius, |
|
473 MorphologyOperator aOp) |
|
474 { |
|
475 if (aOp == MORPHOLOGY_OPERATOR_ERODE) { |
|
476 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE,i16x8_t,u8x16_t>( |
|
477 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
|
478 } else { |
|
479 ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE,i16x8_t,u8x16_t>( |
|
480 aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius); |
|
481 } |
|
482 } |
|
483 |
|
484 template<typename i32x4_t, typename i16x8_t> |
|
485 static i32x4_t |
|
486 ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra, const i32x4_t& bias) |
|
487 { |
|
488 // int16_t p[8] == { b, g, r, a, b, g, r, a }. |
|
489 // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }. |
|
490 // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }. |
|
491 // int32_t bias[4] == { _B, _G, _R, _A }. |
|
492 |
|
493 i32x4_t sum = bias; |
|
494 |
|
495 // int16_t bg[8] = { b, g, b, g, b, g, b, g }; |
|
496 i16x8_t bg = simd::ShuffleHi16<1,0,1,0>(simd::ShuffleLo16<1,0,1,0>(p)); |
|
497 // int32_t prodsum_bg[4] = { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA } |
|
498 i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg); |
|
499 sum = simd::Add32(sum, prodsum_bg); |
|
500 |
|
501 // uint16_t ra[8] = { r, a, r, a, r, a, r, a }; |
|
502 i16x8_t ra = simd::ShuffleHi16<3,2,3,2>(simd::ShuffleLo16<3,2,3,2>(p)); |
|
503 // int32_t prodsum_ra[4] = { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA } |
|
504 i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra); |
|
505 sum = simd::Add32(sum, prodsum_ra); |
|
506 |
|
507 // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }. |
|
508 return sum; |
|
509 } |
|
510 |
|
511 template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
|
512 static TemporaryRef<DataSourceSurface> |
|
513 ApplyColorMatrix_SIMD(DataSourceSurface* aInput, const Matrix5x4 &aMatrix) |
|
514 { |
|
515 IntSize size = aInput->GetSize(); |
|
516 RefPtr<DataSourceSurface> target = |
|
517 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
|
518 if (!target) { |
|
519 return nullptr; |
|
520 } |
|
521 |
|
522 uint8_t* sourceData = aInput->GetData(); |
|
523 uint8_t* targetData = target->GetData(); |
|
524 int32_t sourceStride = aInput->Stride(); |
|
525 int32_t targetStride = target->Stride(); |
|
526 |
|
527 const int16_t factor = 128; |
|
528 const Float floatElementMax = INT16_MAX / factor; // 255 |
|
529 MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX, "badly chosen float-to-int scale"); |
|
530 |
|
531 const Float *floats = &aMatrix._11; |
|
532 |
|
533 ptrdiff_t componentOffsets[4] = { |
|
534 B8G8R8A8_COMPONENT_BYTEOFFSET_R, |
|
535 B8G8R8A8_COMPONENT_BYTEOFFSET_G, |
|
536 B8G8R8A8_COMPONENT_BYTEOFFSET_B, |
|
537 B8G8R8A8_COMPONENT_BYTEOFFSET_A |
|
538 }; |
|
539 |
|
540 // We store the color matrix in rows_bgra in the following format: |
|
541 // { bB, bG, bR, bA, gB, gG, gR, gA }. |
|
542 // { bB, gB, bG, gG, bR, gR, bA, gA } |
|
543 // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16 |
|
544 // which works especially well for our use case. |
|
545 int16_t rows_bgra[2][8]; |
|
546 for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) { |
|
547 for (size_t colIndex = 0; colIndex < 4; colIndex++) { |
|
548 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; |
|
549 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -floatElementMax), floatElementMax); |
|
550 int16_t scaledIntMatrixElement = int16_t(clampedFloatMatrixElement * factor + 0.5); |
|
551 int8_t bg_or_ra = componentOffsets[rowIndex] / 2; |
|
552 int8_t g_or_a = componentOffsets[rowIndex] % 2; |
|
553 int8_t B_or_G_or_R_or_A = componentOffsets[colIndex]; |
|
554 rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] = scaledIntMatrixElement; |
|
555 } |
|
556 } |
|
557 |
|
558 int32_t rowBias[4]; |
|
559 Float biasMax = (INT32_MAX - 4 * 255 * INT16_MAX) / (factor * 255); |
|
560 for (size_t colIndex = 0; colIndex < 4; colIndex++) { |
|
561 size_t rowIndex = 4; |
|
562 const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex]; |
|
563 Float clampedFloatMatrixElement = std::min(std::max(floatMatrixElement, -biasMax), biasMax); |
|
564 int32_t scaledIntMatrixElement = int32_t(clampedFloatMatrixElement * factor * 255 + 0.5); |
|
565 rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement; |
|
566 } |
|
567 |
|
568 i16x8_t row_bg_v = simd::FromI16<i16x8_t>( |
|
569 rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3], |
|
570 rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]); |
|
571 |
|
572 i16x8_t row_ra_v = simd::FromI16<i16x8_t>( |
|
573 rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3], |
|
574 rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]); |
|
575 |
|
576 i32x4_t rowsBias_v = |
|
577 simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]); |
|
578 |
|
579 for (int32_t y = 0; y < size.height; y++) { |
|
580 for (int32_t x = 0; x < size.width; x += 4) { |
|
581 MOZ_ASSERT(sourceStride >= 4 * (x + 4), "need to be able to read 4 pixels at this position"); |
|
582 MOZ_ASSERT(targetStride >= 4 * (x + 4), "need to be able to write 4 pixels at this position"); |
|
583 int32_t sourceIndex = y * sourceStride + 4 * x; |
|
584 int32_t targetIndex = y * targetStride + 4 * x; |
|
585 |
|
586 // We load 4 pixels, unpack them, process them 1 pixel at a time, and |
|
587 // finally pack and store the 4 result pixels. |
|
588 |
|
589 u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); |
|
590 |
|
591 // Splat needed to get each pixel twice into i16x8 |
|
592 i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234)); |
|
593 i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234)); |
|
594 i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234)); |
|
595 i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234)); |
|
596 |
|
597 i32x4_t result_p1 = ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v); |
|
598 i32x4_t result_p2 = ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v); |
|
599 i32x4_t result_p3 = ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v); |
|
600 i32x4_t result_p4 = ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v); |
|
601 |
|
602 static_assert(factor == 1 << 7, "Please adapt the calculation in the lines below for a different factor."); |
|
603 u8x16_t result_p1234 = simd::PackAndSaturate32To8(simd::ShiftRight32<7>(result_p1), |
|
604 simd::ShiftRight32<7>(result_p2), |
|
605 simd::ShiftRight32<7>(result_p3), |
|
606 simd::ShiftRight32<7>(result_p4)); |
|
607 simd::Store8(&targetData[targetIndex], result_p1234); |
|
608 } |
|
609 } |
|
610 |
|
611 return target; |
|
612 } |
|
613 |
|
614 // source / dest: bgra bgra |
|
615 // sourceAlpha / destAlpha: aaaa aaaa |
|
616 // result: bgra bgra |
|
617 template<typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator> |
|
618 static inline u16x8_t |
|
619 CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha, u16x8_t dest, const u16x8_t& destAlpha) |
|
620 { |
|
621 u16x8_t x255 = simd::FromU16<u16x8_t>(255); |
|
622 |
|
623 switch (aCompositeOperator) { |
|
624 |
|
625 case COMPOSITE_OPERATOR_OVER: |
|
626 { |
|
627 // val = dest * (255 - sourceAlpha) + source * 255; |
|
628 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
|
629 |
|
630 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); |
|
631 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255); |
|
632 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); |
|
633 |
|
634 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); |
|
635 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255); |
|
636 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); |
|
637 |
|
638 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), |
|
639 simd::FastDivideBy255(result2)); |
|
640 } |
|
641 |
|
642 case COMPOSITE_OPERATOR_IN: |
|
643 { |
|
644 // val = source * destAlpha; |
|
645 return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha)); |
|
646 } |
|
647 |
|
648 case COMPOSITE_OPERATOR_OUT: |
|
649 { |
|
650 // val = source * (255 - destAlpha); |
|
651 u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha)); |
|
652 return simd::FastDivideBy255_16(prod); |
|
653 } |
|
654 |
|
655 case COMPOSITE_OPERATOR_ATOP: |
|
656 { |
|
657 // val = dest * (255 - sourceAlpha) + source * destAlpha; |
|
658 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
|
659 |
|
660 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); |
|
661 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha); |
|
662 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); |
|
663 |
|
664 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); |
|
665 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha); |
|
666 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); |
|
667 |
|
668 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), |
|
669 simd::FastDivideBy255(result2)); |
|
670 } |
|
671 |
|
672 case COMPOSITE_OPERATOR_XOR: |
|
673 { |
|
674 // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha); |
|
675 u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha); |
|
676 u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha); |
|
677 |
|
678 u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source); |
|
679 u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, |
|
680 twoFiftyFiveMinusDestAlpha); |
|
681 i32x4_t result1 = simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1); |
|
682 |
|
683 u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source); |
|
684 u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, |
|
685 twoFiftyFiveMinusDestAlpha); |
|
686 i32x4_t result2 = simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2); |
|
687 |
|
688 return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1), |
|
689 simd::FastDivideBy255(result2)); |
|
690 } |
|
691 |
|
692 default: |
|
693 return simd::FromU16<u16x8_t>(0); |
|
694 |
|
695 } |
|
696 } |
|
697 |
|
698 template<typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op> |
|
699 static void |
|
700 ApplyComposition(DataSourceSurface* aSource, DataSourceSurface* aDest) |
|
701 { |
|
702 IntSize size = aDest->GetSize(); |
|
703 |
|
704 uint8_t* sourceData = aSource->GetData(); |
|
705 uint8_t* destData = aDest->GetData(); |
|
706 uint32_t sourceStride = aSource->Stride(); |
|
707 uint32_t destStride = aDest->Stride(); |
|
708 |
|
709 for (int32_t y = 0; y < size.height; y++) { |
|
710 for (int32_t x = 0; x < size.width; x += 4) { |
|
711 uint32_t sourceIndex = y * sourceStride + 4 * x; |
|
712 uint32_t destIndex = y * destStride + 4 * x; |
|
713 |
|
714 u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); |
|
715 u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]); |
|
716 |
|
717 u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234); |
|
718 u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234); |
|
719 u16x8_t sa12 = simd::Splat16<3,3>(s12); |
|
720 u16x8_t da12 = simd::Splat16<3,3>(d12); |
|
721 u16x8_t result12 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s12, sa12, d12, da12); |
|
722 |
|
723 u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234); |
|
724 u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234); |
|
725 u16x8_t sa34 = simd::Splat16<3,3>(s34); |
|
726 u16x8_t da34 = simd::Splat16<3,3>(d34); |
|
727 u16x8_t result34 = CompositeTwoPixels<i32x4_t,u16x8_t,op>(s34, sa34, d34, da34); |
|
728 |
|
729 u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34); |
|
730 simd::Store8(&destData[destIndex], result1234); |
|
731 } |
|
732 } |
|
733 } |
|
734 |
|
735 template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
|
736 static void |
|
737 ApplyComposition_SIMD(DataSourceSurface* aSource, DataSourceSurface* aDest, |
|
738 CompositeOperator aOperator) |
|
739 { |
|
740 switch (aOperator) { |
|
741 case COMPOSITE_OPERATOR_OVER: |
|
742 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OVER>(aSource, aDest); |
|
743 break; |
|
744 case COMPOSITE_OPERATOR_IN: |
|
745 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_IN>(aSource, aDest); |
|
746 break; |
|
747 case COMPOSITE_OPERATOR_OUT: |
|
748 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_OUT>(aSource, aDest); |
|
749 break; |
|
750 case COMPOSITE_OPERATOR_ATOP: |
|
751 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_ATOP>(aSource, aDest); |
|
752 break; |
|
753 case COMPOSITE_OPERATOR_XOR: |
|
754 ApplyComposition<i32x4_t,i16x8_t,u8x16_t, COMPOSITE_OPERATOR_XOR>(aSource, aDest); |
|
755 break; |
|
756 default: |
|
757 MOZ_CRASH(); |
|
758 } |
|
759 } |
|
760 |
|
761 template<typename u8x16_t> |
|
762 static void |
|
763 SeparateColorChannels_SIMD(const IntSize &size, uint8_t* sourceData, int32_t sourceStride, |
|
764 uint8_t* channel0Data, uint8_t* channel1Data, |
|
765 uint8_t* channel2Data, uint8_t* channel3Data, |
|
766 int32_t channelStride) |
|
767 { |
|
768 for (int32_t y = 0; y < size.height; y++) { |
|
769 for (int32_t x = 0; x < size.width; x += 16) { |
|
770 // Process 16 pixels at a time. |
|
771 int32_t sourceIndex = y * sourceStride + 4 * x; |
|
772 int32_t targetIndex = y * channelStride + x; |
|
773 |
|
774 u8x16_t bgrabgrabgrabgra1 = simd::FromZero8<u8x16_t>(); |
|
775 u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>(); |
|
776 u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>(); |
|
777 u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>(); |
|
778 |
|
779 bgrabgrabgrabgra1 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]); |
|
780 if (4 * (x + 4) < sourceStride) { |
|
781 bgrabgrabgrabgra2 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]); |
|
782 } |
|
783 if (4 * (x + 8) < sourceStride) { |
|
784 bgrabgrabgrabgra3 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]); |
|
785 } |
|
786 if (4 * (x + 12) < sourceStride) { |
|
787 bgrabgrabgrabgra4 = simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]); |
|
788 } |
|
789 |
|
790 u8x16_t bbggrraabbggrraa1 = simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
|
791 u8x16_t bbggrraabbggrraa2 = simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3); |
|
792 u8x16_t bbggrraabbggrraa3 = simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
|
793 u8x16_t bbggrraabbggrraa4 = simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4); |
|
794 u8x16_t bbbbggggrrrraaaa1 = simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
|
795 u8x16_t bbbbggggrrrraaaa2 = simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3); |
|
796 u8x16_t bbbbggggrrrraaaa3 = simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
|
797 u8x16_t bbbbggggrrrraaaa4 = simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4); |
|
798 u8x16_t bbbbbbbbgggggggg1 = simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); |
|
799 u8x16_t rrrrrrrraaaaaaaa1 = simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3); |
|
800 u8x16_t bbbbbbbbgggggggg2 = simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); |
|
801 u8x16_t rrrrrrrraaaaaaaa2 = simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4); |
|
802 u8x16_t bbbbbbbbbbbbbbbb = simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2); |
|
803 u8x16_t gggggggggggggggg = simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2); |
|
804 u8x16_t rrrrrrrrrrrrrrrr = simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); |
|
805 u8x16_t aaaaaaaaaaaaaaaa = simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2); |
|
806 |
|
807 simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb); |
|
808 simd::Store8(&channel1Data[targetIndex], gggggggggggggggg); |
|
809 simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr); |
|
810 simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa); |
|
811 } |
|
812 } |
|
813 } |
|
814 |
|
815 template<typename u8x16_t> |
|
816 static void |
|
817 CombineColorChannels_SIMD(const IntSize &size, int32_t resultStride, uint8_t* resultData, int32_t channelStride, uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data, uint8_t* channel3Data) |
|
818 { |
|
819 for (int32_t y = 0; y < size.height; y++) { |
|
820 for (int32_t x = 0; x < size.width; x += 16) { |
|
821 // Process 16 pixels at a time. |
|
822 int32_t resultIndex = y * resultStride + 4 * x; |
|
823 int32_t channelIndex = y * channelStride + x; |
|
824 |
|
825 u8x16_t bbbbbbbbbbbbbbbb = simd::Load8<u8x16_t>(&channel0Data[channelIndex]); |
|
826 u8x16_t gggggggggggggggg = simd::Load8<u8x16_t>(&channel1Data[channelIndex]); |
|
827 u8x16_t rrrrrrrrrrrrrrrr = simd::Load8<u8x16_t>(&channel2Data[channelIndex]); |
|
828 u8x16_t aaaaaaaaaaaaaaaa = simd::Load8<u8x16_t>(&channel3Data[channelIndex]); |
|
829 |
|
830 u8x16_t brbrbrbrbrbrbrbr1 = simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr); |
|
831 u8x16_t brbrbrbrbrbrbrbr2 = simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr); |
|
832 u8x16_t gagagagagagagaga1 = simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa); |
|
833 u8x16_t gagagagagagagaga2 = simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa); |
|
834 |
|
835 u8x16_t bgrabgrabgrabgra1 = simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1); |
|
836 u8x16_t bgrabgrabgrabgra2 = simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1); |
|
837 u8x16_t bgrabgrabgrabgra3 = simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2); |
|
838 u8x16_t bgrabgrabgrabgra4 = simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2); |
|
839 |
|
840 simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1); |
|
841 if (4 * (x + 4) < resultStride) { |
|
842 simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2); |
|
843 } |
|
844 if (4 * (x + 8) < resultStride) { |
|
845 simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3); |
|
846 } |
|
847 if (4 * (x + 12) < resultStride) { |
|
848 simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4); |
|
849 } |
|
850 } |
|
851 } |
|
852 } |
|
853 |
|
854 |
|
855 template<typename i32x4_t, typename u16x8_t, typename u8x16_t> |
|
856 static void |
|
857 DoPremultiplicationCalculation_SIMD(const IntSize& aSize, |
|
858 uint8_t* aTargetData, int32_t aTargetStride, |
|
859 uint8_t* aSourceData, int32_t aSourceStride) |
|
860 { |
|
861 const u8x16_t alphaMask = simd::From8<u8x16_t>(0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff); |
|
862 for (int32_t y = 0; y < aSize.height; y++) { |
|
863 for (int32_t x = 0; x < aSize.width; x += 4) { |
|
864 int32_t inputIndex = y * aSourceStride + 4 * x; |
|
865 int32_t targetIndex = y * aTargetStride + 4 * x; |
|
866 |
|
867 u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]); |
|
868 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234); |
|
869 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234); |
|
870 |
|
871 // Multiply all components with alpha. |
|
872 p12 = simd::Mul16(p12, simd::Splat16<3,3>(p12)); |
|
873 p34 = simd::Mul16(p34, simd::Splat16<3,3>(p34)); |
|
874 |
|
875 // Divide by 255 and pack. |
|
876 u8x16_t result = simd::PackAndSaturate16To8(simd::FastDivideBy255_16(p12), |
|
877 simd::FastDivideBy255_16(p34)); |
|
878 |
|
879 // Get the original alpha channel value back from p1234. |
|
880 result = simd::Pick(alphaMask, result, p1234); |
|
881 |
|
882 simd::Store8(&aTargetData[targetIndex], result); |
|
883 } |
|
884 } |
|
885 } |
|
886 |
|
// Lookup table of precomputed factors used for unpremultiplying.
//
// The goal is to evaluate round(r / (alpha / 255.0f)) for arbitrary r and
// alpha in constant time. With this table,
//   (r * sAlphaFactors[alpha] + 128) >> 8
// gives roughly that value (it deviates by at most 1).
//
//   sAlphaFactors[alpha] == round(255.0 * (1 << 8) / alpha), and 0 for alpha == 0
//
// The values were generated with the Python expression
//   ", ".join("%d" % (round(255.0 * 256 / alpha) if alpha > 0 else 0) for alpha in range(256))
static const uint16_t sAlphaFactors[256] = {
  /*   0 */ 0, 65280, 32640, 21760, 16320, 13056, 10880, 9326,
            8160, 7253, 6528, 5935, 5440, 5022, 4663, 4352,
  /*  16 */ 4080, 3840, 3627, 3436, 3264, 3109, 2967, 2838,
            2720, 2611, 2511, 2418, 2331, 2251, 2176, 2106,
  /*  32 */ 2040, 1978, 1920, 1865, 1813, 1764, 1718, 1674,
            1632, 1592, 1554, 1518, 1484, 1451, 1419, 1389,
  /*  48 */ 1360, 1332, 1306, 1280, 1255, 1232, 1209, 1187,
            1166, 1145, 1126, 1106, 1088, 1070, 1053, 1036,
  /*  64 */ 1020, 1004, 989, 974, 960, 946, 933, 919,
            907, 894, 882, 870, 859, 848, 837, 826,
  /*  80 */ 816, 806, 796, 787, 777, 768, 759, 750,
            742, 733, 725, 717, 710, 702, 694, 687,
  /*  96 */ 680, 673, 666, 659, 653, 646, 640, 634,
            628, 622, 616, 610, 604, 599, 593, 588,
  /* 112 */ 583, 578, 573, 568, 563, 558, 553, 549,
            544, 540, 535, 531, 526, 522, 518, 514,
  /* 128 */ 510, 506, 502, 498, 495, 491, 487, 484,
            480, 476, 473, 470, 466, 463, 460, 457,
  /* 144 */ 453, 450, 447, 444, 441, 438, 435, 432,
            429, 427, 424, 421, 418, 416, 413, 411,
  /* 160 */ 408, 405, 403, 400, 398, 396, 393, 391,
            389, 386, 384, 382, 380, 377, 375, 373,
  /* 176 */ 371, 369, 367, 365, 363, 361, 359, 357,
            355, 353, 351, 349, 347, 345, 344, 342,
  /* 192 */ 340, 338, 336, 335, 333, 331, 330, 328,
            326, 325, 323, 322, 320, 318, 317, 315,
  /* 208 */ 314, 312, 311, 309, 308, 306, 305, 304,
            302, 301, 299, 298, 297, 295, 294, 293,
  /* 224 */ 291, 290, 289, 288, 286, 285, 284, 283,
            281, 280, 279, 278, 277, 275, 274, 273,
  /* 240 */ 272, 271, 270, 269, 268, 266, 265, 264,
            263, 262, 261, 260, 259, 258, 257, 256
};
|
917 |
|
918 template<typename u16x8_t, typename u8x16_t> |
|
919 static void |
|
920 DoUnpremultiplicationCalculation_SIMD(const IntSize& aSize, |
|
921 uint8_t* aTargetData, int32_t aTargetStride, |
|
922 uint8_t* aSourceData, int32_t aSourceStride) |
|
923 { |
|
924 for (int32_t y = 0; y < aSize.height; y++) { |
|
925 for (int32_t x = 0; x < aSize.width; x += 4) { |
|
926 int32_t inputIndex = y * aSourceStride + 4 * x; |
|
927 int32_t targetIndex = y * aTargetStride + 4 * x; |
|
928 union { |
|
929 u8x16_t p1234; |
|
930 uint8_t u8[4][4]; |
|
931 }; |
|
932 p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]); |
|
933 |
|
934 // Prepare the alpha factors. |
|
935 uint16_t aF1 = sAlphaFactors[u8[0][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
|
936 uint16_t aF2 = sAlphaFactors[u8[1][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
|
937 uint16_t aF3 = sAlphaFactors[u8[2][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
|
938 uint16_t aF4 = sAlphaFactors[u8[3][B8G8R8A8_COMPONENT_BYTEOFFSET_A]]; |
|
939 u16x8_t aF12 = simd::FromU16<u16x8_t>(aF1, aF1, aF1, 1 << 8, aF2, aF2, aF2, 1 << 8); |
|
940 u16x8_t aF34 = simd::FromU16<u16x8_t>(aF3, aF3, aF3, 1 << 8, aF4, aF4, aF4, 1 << 8); |
|
941 |
|
942 u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234); |
|
943 u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234); |
|
944 |
|
945 // Multiply with the alpha factors, add 128 for rounding, and shift right by 8 bits. |
|
946 p12 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p12, aF12), simd::FromU16<u16x8_t>(128))); |
|
947 p34 = simd::ShiftRight16<8>(simd::Add16(simd::Mul16(p34, aF34), simd::FromU16<u16x8_t>(128))); |
|
948 |
|
949 u8x16_t result = simd::PackAndSaturate16To8(p12, p34); |
|
950 simd::Store8(&aTargetData[targetIndex], result); |
|
951 } |
|
952 } |
|
953 } |
|
954 |
|
955 template<typename f32x4_t, typename i32x4_t, typename u8x16_t> |
|
956 static TemporaryRef<DataSourceSurface> |
|
957 RenderTurbulence_SIMD(const IntSize &aSize, const Point &aOffset, const Size &aBaseFrequency, |
|
958 int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch, const Rect &aTileRect) |
|
959 { |
|
960 #define RETURN_TURBULENCE(Type, Stitch) \ |
|
961 SVGTurbulenceRenderer<Type,Stitch,f32x4_t,i32x4_t,u8x16_t> \ |
|
962 renderer(aBaseFrequency, aSeed, aNumOctaves, aTileRect); \ |
|
963 return renderer.Render(aSize, aOffset); |
|
964 |
|
965 switch (aType) { |
|
966 case TURBULENCE_TYPE_TURBULENCE: |
|
967 { |
|
968 if (aStitch) { |
|
969 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true); |
|
970 } |
|
971 RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false); |
|
972 } |
|
973 case TURBULENCE_TYPE_FRACTAL_NOISE: |
|
974 { |
|
975 if (aStitch) { |
|
976 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true); |
|
977 } |
|
978 RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false); |
|
979 } |
|
980 } |
|
981 return nullptr; |
|
982 #undef RETURN_TURBULENCE |
|
983 } |
|
984 |
|
985 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 |
|
986 template<typename i32x4_t, typename i16x8_t> |
|
987 static MOZ_ALWAYS_INLINE i16x8_t |
|
988 ArithmeticCombineTwoPixels(i16x8_t in1, i16x8_t in2, |
|
989 const i16x8_t &k1And4, const i16x8_t &k2And3) |
|
990 { |
|
991 // Calculate input product: inProd = (in1 * in2) / 255. |
|
992 i32x4_t inProd_1, inProd_2; |
|
993 simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2); |
|
994 i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1), simd::FastDivideBy255(inProd_2)); |
|
995 |
|
996 // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128 |
|
997 i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128); |
|
998 i16x8_t inProd1AndOneTwentyEight = simd::InterleaveLo16(inProd, oneTwentyEight); |
|
999 i16x8_t inProd2AndOneTwentyEight = simd::InterleaveHi16(inProd, oneTwentyEight); |
|
1000 i32x4_t inProdTimesK1PlusK4_1 = simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight); |
|
1001 i32x4_t inProdTimesK1PlusK4_2 = simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight); |
|
1002 |
|
1003 // Calculate k2 * in1 + k3 * in2 |
|
1004 i16x8_t in12_1 = simd::InterleaveLo16(in1, in2); |
|
1005 i16x8_t in12_2 = simd::InterleaveHi16(in1, in2); |
|
1006 i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1); |
|
1007 i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2); |
|
1008 |
|
1009 // Sum everything up and truncate the fractional part. |
|
1010 i32x4_t result_1 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1)); |
|
1011 i32x4_t result_2 = simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2)); |
|
1012 return simd::PackAndSaturate32To16(result_1, result_2); |
|
1013 } |
|
1014 |
|
1015 template<typename i32x4_t, typename i16x8_t, typename u8x16_t> |
|
1016 static TemporaryRef<DataSourceSurface> |
|
1017 ApplyArithmeticCombine_SIMD(DataSourceSurface* aInput1, DataSourceSurface* aInput2, |
|
1018 Float aK1, Float aK2, Float aK3, Float aK4) |
|
1019 { |
|
1020 IntSize size = aInput1->GetSize(); |
|
1021 RefPtr<DataSourceSurface> target = |
|
1022 Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8); |
|
1023 if (!target) { |
|
1024 return nullptr; |
|
1025 } |
|
1026 |
|
1027 uint8_t* source1Data = aInput1->GetData(); |
|
1028 uint8_t* source2Data = aInput2->GetData(); |
|
1029 uint8_t* targetData = target->GetData(); |
|
1030 uint32_t source1Stride = aInput1->Stride(); |
|
1031 uint32_t source2Stride = aInput2->Stride(); |
|
1032 uint32_t targetStride = target->Stride(); |
|
1033 |
|
1034 // The arithmetic combine filter does the following calculation: |
|
1035 // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4 |
|
1036 // |
|
1037 // Or, with in1/2 integers between 0 and 255: |
|
1038 // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255 |
|
1039 // |
|
1040 // We want the whole calculation to happen in integer, with 16-bit factors. |
|
1041 // So we convert our factors to fixed-point with precision 1.8.7. |
|
1042 // K4 is premultiplied with 255, and it will be multiplied with 128 later |
|
1043 // during the actual calculation, because premultiplying it with 255 * 128 |
|
1044 // would overflow int16. |
|
1045 |
|
1046 i16x8_t k1 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK1, -255.0f), 255.0f) * 128 + 0.5f))); |
|
1047 i16x8_t k2 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK2, -255.0f), 255.0f) * 128 + 0.5f))); |
|
1048 i16x8_t k3 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK3, -255.0f), 255.0f) * 128 + 0.5f))); |
|
1049 i16x8_t k4 = simd::FromI16<i16x8_t>(int16_t(floorf(std::min(std::max(aK4, -128.0f), 128.0f) * 255 + 0.5f))); |
|
1050 |
|
1051 i16x8_t k1And4 = simd::InterleaveLo16(k1, k4); |
|
1052 i16x8_t k2And3 = simd::InterleaveLo16(k2, k3); |
|
1053 |
|
1054 for (int32_t y = 0; y < size.height; y++) { |
|
1055 for (int32_t x = 0; x < size.width; x += 4) { |
|
1056 uint32_t source1Index = y * source1Stride + 4 * x; |
|
1057 uint32_t source2Index = y * source2Stride + 4 * x; |
|
1058 uint32_t targetIndex = y * targetStride + 4 * x; |
|
1059 |
|
1060 // Load and unpack. |
|
1061 u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]); |
|
1062 u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]); |
|
1063 i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1); |
|
1064 i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1); |
|
1065 i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2); |
|
1066 i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2); |
|
1067 |
|
1068 // Multiply and add. |
|
1069 i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_12, in2_12, k1And4, k2And3); |
|
1070 i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t,i16x8_t>(in1_34, in2_34, k1And4, k2And3); |
|
1071 |
|
1072 // Pack and store. |
|
1073 simd::Store8(&targetData[targetIndex], simd::PackAndSaturate16To8(result_12, result_34)); |
|
1074 } |
|
1075 } |
|
1076 |
|
1077 return target; |
|
1078 } |
|
1079 |
|
} // namespace gfx
} // namespace mozilla