/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * ignoring overflow and underflow, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = (((sum >> 1) + carry) + (d >> 1)) >> 1
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is the concept that we can
 * trivially convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
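
/* For example, with byte values a = 1, b = 2, c = 3, d = 4:
 * sum = a ^ b ^ c = 0 and carry = (a & b) | (a & c) | (b & c) = 3, so
 * a + b + c = sum + 2 * carry = 6. Averaging in d gives (sum + d) >> 1 = 2,
 * and (2 + carry) >> 1 = 2 == (1 + 2 + 3 + 4) >> 2. For the rounded-to-
 * truncated trick, take bytes a = 1, b = 2: ~a = 254, ~b = 253,
 * f(254, 253) = (254 + 253 + 1) >> 1 = 254, and ~254 = 1 == (1 + 2) >> 1
 * == g(1, 2).
 */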

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack, and it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
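// Given two rows of 8 pixels each (the upper row in *a and *b, the lower row
// in *c and *d), this returns the truncated per-channel average of each 2x2
// block of pixels, i.e. 4 output pixels packed into one __m128i.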
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile-time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))

  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;
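
  // At this point *a and *c hold the even-indexed pixels and *b and *d the
  // odd-indexed pixels of the upper and lower source rows respectively, so
  // lane i of (a, b, c, d) holds the four pixels of the i-th 2x2 block.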

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

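// Truncated per-channel average of two vectors of 4 pixels each; used for the
// vertical-only 2:1 downscale where a holds 4 upper-row pixels and b the 4
// pixels directly below them.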
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

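// Truncated per-channel average of each horizontally adjacent pixel pair in a
// row of 8 pixels (split across a and b), producing 4 output pixels; used for
// the horizontal-only 2:1 downscale.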
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-wise average instruction we mask off the low bit of each
  // byte before shifting, so that no bit leaks into the byte below it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}
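
// Both Avg2x2 above and Avg2 below rely, per byte, on the identity
// (x + y) >> 1 == (x & y) + ((x ^ y) >> 1); for example, for the byte values
// 3 and 6: (3 & 6) + ((3 ^ 6) >> 1) == 2 + (5 >> 1) == 4 == (3 + 6) >> 1.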

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}

namespace mozilla {
namespace gfx {

void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Pick a loop variant depending on the 16-byte alignment of the two
    // source rows.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use the scalar Avg2x2 implementation for
    // this.
    //
    // Potentially we only have to do this in the last row, since overflowing
    // by 8 pixels in an earlier row appears to be harmless: it doesn't touch
    // invalid memory, even when reading and writing to the same surface. In
    // practice we only do this when doing an additional downscale pass, and in
    // that situation we have unused stride to write into harmlessly. I do not
    // believe the additional code complexity would be worth it, though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Pick a loop variant depending on the 16-byte alignment of the two
    // source rows.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row is not 16-byte aligned.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous function apply.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Pick a loop variant depending on the 16-byte alignment of the source
    // row.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous functions apply.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

}
}