/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Blur.h"

#include "SSEHelpers.h"

#include <string.h>

namespace mozilla {
namespace gfx {

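// Divide four packed 32-bit values by the box area. aDivisor holds the
// reciprocal of the divisor scaled by 2^32 (see BoxBlur_SSE2), so the
// division becomes two 32x32 -> 64-bit multiplies followed by taking the
// rounded high 32 bits of each product.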
MOZ_ALWAYS_INLINE
__m128i Divide(__m128i aValues, __m128i aDivisor)
{
  const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
  static const union {
    int64_t i64[2];
    __m128i m;
  } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } };

  __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor);
  __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor);

  // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
  // result is rounded.
  __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32);
  __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask);
  __m128i p4321 = _mm_or_si128(p_3_1, p4_2_);
  return p4321;
}

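// Compute the blurred value for four adjacent pixels from the four corner
// sums of the integral image: bottomRight - topRight - bottomLeft + topLeft
// is the sum over each pixel's box, which is then divided by the box area.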
MOZ_ALWAYS_INLINE
__m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight,
                       const __m128i& aBottomRight, const __m128i& aBottomLeft,
                       const __m128i& aDivisor)
{
  __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
  return Divide(values, aDivisor);
}

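// Write the running horizontal sum of a source row into a row of the
// integral image. The left and right inflation areas are filled by
// repeatedly adding the first and last source pixel respectively.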
MOZ_ALWAYS_INLINE
void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
                            int32_t aSourceWidth, int32_t aLeftInflation,
                            int32_t aRightInflation)
{
  int32_t currentRowSum = 0;

  for (int x = 0; x < aLeftInflation; x++) {
    currentRowSum += aSource[0];
    aDest[x] = currentRowSum;
  }
  for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
    currentRowSum += aSource[(x - aLeftInflation)];
    aDest[x] = currentRowSum;
  }
  for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
    currentRowSum += aSource[aSourceWidth - 1];
    aDest[x] = currentRowSum;
  }
}

// This function calculates an integral of four pixels stored in the 4
// 32-bit integers in aPixels. i.e. for { 30, 50, 80, 100 } this returns
// { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
// much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
  __m128i sumPixels = aPixels;
  __m128i currentPixels = _mm_slli_si128(aPixels, 4);
  sumPixels = _mm_add_epi32(sumPixels, currentPixels);
  currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);

  return _mm_add_epi32(sumPixels, currentPixels);
}

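// Build the integral image for the inflated source. Each entry holds the sum
// of all edge-extended source pixels above and to the left of it (inclusive).
// Rows are produced by adding the horizontal prefix sum of the current source
// row to the integral row directly above it.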
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
                           int32_t aTopInflation, int32_t aBottomInflation,
                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  MOZ_ASSERT(!(aLeftInflation & 3));

  uint32_t stride32bit = aIntegralImageStride / 4;

  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);

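  // Rows in the top inflation area replicate the first source row, so each
  // integral row there is simply the first integral row added to the row
  // directly above it.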
  for (int y = 1; y < aTopInflation + 1; y++) {
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint32_t *intFirstRow = aIntegralImage;

    for (int x = 0; x < integralImageSize.width; x += 4) {
      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
    }
  }

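  // For each source row, accumulate a horizontal prefix sum four pixels at a
  // time (AccumulatePixelSums) and add it to the integral row above.
  // currentRowSum carries the running sum of the row across 4-pixel groups.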
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    __m128i currentRowSum = _mm_setzero_si128();
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);

    uint32_t pixel = sourceRow[0];
    for (int x = 0; x < aLeftInflation; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));

      // It's important to shuffle here. When we exit this loop currentRowSum
      // has to be set to sumPixels, so that the following loop can get the
      // correct pixel for the currentRowSum. The highest order pixel in
      // currentRowSum could've originated from accumulation in the stride.
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));

      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = sumPixels;

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }

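    // The right inflation area replicates the last pixel of the row. If the
    // source width is not a multiple of four, handle pixels with scalar code
    // until x is 16-byte aligned again, then continue with SIMD stores.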
    pixel = sourceRow[aSize.width - 1];
    int x = (aSize.width + aLeftInflation);
    if ((aSize.width & 3)) {
      // Deal with unaligned portion. Get the correct pixel from currentRowSum,
      // see explanation above.
      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
      for (; x < integralImageSize.width; x++) {
        // We could be unaligned here!
        if (!(x & 3)) {
          // aligned!
          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
          break;
        }
        intCurrentRowSum += pixel;
        intRow[x] = intPrevRow[x] + intCurrentRowSum;
      }
    } else {
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
    }
    for (; x < integralImageSize.width; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
  }

  if (aBottomInflation) {
    // Store the last valid row of our source image in the last row of
    // our integral image. This will be overwritten with the correct values
    // in the upcoming loop.
    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);

    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);

      for (int x = 0; x < integralImageSize.width; x += 4) {
        _mm_store_si128(intRow + (x / 4),
                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
                                      _mm_load_si128(intPrevRow + (x / 4))));
      }
    }
  }
}

/**
 * Attempt to do an in-place box blur using an integral image.
 */
void
AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData,
                           int32_t aLeftLobe,
                           int32_t aRightLobe,
                           int32_t aTopLobe,
                           int32_t aBottomLobe,
                           uint32_t *aIntegralImage,
                           size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image, the value of a pixel at 'x,y' is calculated
  // using the values of the integral image above/below that pixel.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  if (boxSize == 1) {
    return;
  }

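  // Precompute the reciprocal of the box area scaled by 2^32 so that Divide()
  // can turn the per-pixel division into a multiply and shift.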
  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);

  uint32_t stride32bit = aIntegralImageStride / 4;
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, aData,
                             mStride, size);

  __m128i divisor = _mm_set1_epi32(reciprocal);

  // This points to the start of the rectangle within the IntegralImage that overlaps
  // the surface being blurred.
  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

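  // mSkipRect marks an area of the destination whose blurred output will not
  // be used, so the loops below can jump across it.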
  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t *data = aData;
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);

    int32_t x = 0;
    // Process 16 pixels at a time for as long as possible.
    for (; x <= size.width - 16; x += 16) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 16;
        // Trigger an early jump on the upcoming loop iterations; this will be
        // reset on the next row anyway.
        inSkipRectY = false;
        continue;
      }

      __m128i topLeft;
      __m128i topRight;
      __m128i bottomRight;
      __m128i bottomLeft;

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
      __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4));
      __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8));
      __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12));
      __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

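      // Pack the sixteen 32-bit results down to sixteen bytes. The blurred
      // values already fit in a byte, so the saturating packs do not clamp
      // in-range values.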
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4));

      _mm_storeu_si128((__m128i*)(data + stride * y + x), final);
    }

    // Process the remaining pixels 4 bytes at a time.
    for (; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 4;
        // Trigger an early jump on the upcoming loop iterations; this will be
        // reset on the next row anyway.
        inSkipRectY = false;
        continue;
      }
      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));

      __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128());

      *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final);
    }
  }
}

} // namespace gfx
} // namespace mozilla