--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/2d/ImageScalingSSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,330 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "ImageScaling.h"
+#include "mozilla/Attributes.h"
+
+#include "SSEHelpers.h"
+
+/* The functions below use the following system for averaging 4 pixels:
+ *
+ * The first observation is that a half-adder is implemented as follows:
+ * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
+ *
+ * This can be trivially extended to three pixels by observing that when
+ * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
+ * carries of the individual numbers, since the sum of 3 bits can only ever
+ * have a carry of one.
+ *
+ * We then observe that the average is ((carry << 1) + sum) >> 1, or,
+ * assuming overflows and underflows are avoided, carry + (sum >> 1).
+ *
+ * We now average our existing sum with the fourth number, so we get:
+ * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
+ *
+ * We now observe that our sum has been moved into place relative to the
+ * carry, so we can now average with the carry to get the final 4-input
+ * average: avg = (sum2 + carry) >> 1;
+ *
+ * Or to reverse the proof:
+ * avg = (((sum >> 1) + (d >> 1)) + carry) >> 1
+ * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
+ * avg = (a + b + c + d) >> 2
+ *
+ * An additional fact used in the SSE versions is that we can trivially
+ * convert a rounded average into a truncated average:
+ *
+ * We have:
+ * f(a, b) = (a + b + 1) >> 1
+ *
+ * And want:
+ * g(a, b) = (a + b) >> 1
+ *
+ * Observe:
+ * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
+ *            == ~((-a - 1 + -b - 1 + 1) >> 1)
+ *            == ~((-a - 1 + -b) >> 1)
+ *            == ~((-(a + b) - 1) >> 1)
+ *            == ~((~(a + b)) >> 1)
+ *            == (a + b) >> 1
+ *            == g(a, b)
+ */
+
+MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
+{
+  __m128i minusone = _mm_set1_epi32(0xffffffff);
+  return _mm_xor_si128(arg, minusone);
+}
+
+/* We have to pass pointers here; MSVC does not allow passing more than 3
+ * __m128i arguments on the stack, and it does not allow 16-byte aligned
+ * stack variables. This inlines properly on MSVC 2010. It does -not- inline
+ * with just the inline directive.
+ */
+MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
+{
+#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
+#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)
+
+// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
+// needs to be a compile-time constant.
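+// shuf1 (_MM_SHUFFLE(2, 0, 2, 0)) picks the even-indexed pixels of its two
+// inputs and shuf2 (_MM_SHUFFLE(3, 1, 3, 1)) the odd-indexed ones, so after
+// the shuffles below each lane of *a/*b (and of *c/*d) holds a horizontally
+// adjacent pixel pair, ready for the 4-way average that follows.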
+#define shuffle_si128(arga, argb, imm) \
+  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))
+
+  __m128i t = shuffle_si128(*a, *b, shuf1);
+  *b = shuffle_si128(*a, *b, shuf2);
+  *a = t;
+  t = shuffle_si128(*c, *d, shuf1);
+  *d = shuffle_si128(*c, *d, shuf2);
+  *c = t;
+
+#undef shuf1
+#undef shuf2
+#undef shuffle_si128
+
+  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
+
+  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));
+
+  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));
+
+  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
+}
+
+MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
+{
+  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
+}
+
+MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
+{
+  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
+  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
+  a = t;
+
+  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
+}
+
+MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+  uint32_t sum = a ^ b ^ c;
+  uint32_t carry = (a & b) | (a & c) | (b & c);
+
+  uint32_t mask = 0xfefefefe;
+
+  // Without a byte-wise average instruction we have to mask off the low bit
+  // of every byte before shifting, so that bits do not bleed from one byte
+  // into the byte below it.
+  sum = (((sum ^ d) & mask) >> 1) + (sum & d);
+
+  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
+}
+
+// Simple 2-pixel average version of the function above.
+MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
+{
+  uint32_t sum = a ^ b;
+  uint32_t carry = (a & b);
+
+  uint32_t mask = 0xfefefefe;
+
+  return ((sum & mask) >> 1) + carry;
+}
+
+namespace mozilla {
+namespace gfx {
+
+void
+ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
+                                  const IntSize &aSourceSize, uint8_t *aDest,
+                                  uint32_t aDestStride)
+{
+  const int Bpp = 4;
+
+  for (int y = 0; y < aSourceSize.height; y += 2) {
+    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
+    int x = 0;
+    // Run a loop variant depending on the alignment of the two source rows.
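+    // Rows that are 16-byte aligned can be read with _mm_load_si128; rows that
+    // are not fall back to loadUnaligned128, so each combination of upper/lower
+    // row alignment gets its own copy of the loop.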
+    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
+        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
+        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
+
+        __m128i a = _mm_load_si128(upperRow);
+        __m128i b = _mm_load_si128(upperRow + 1);
+        __m128i c = _mm_load_si128(lowerRow);
+        __m128i d = _mm_load_si128(lowerRow + 1);
+
+        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
+      }
+    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
+        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
+
+        __m128i a = _mm_load_si128(upperRow);
+        __m128i b = _mm_load_si128(upperRow + 1);
+        __m128i c = loadUnaligned128(lowerRow);
+        __m128i d = loadUnaligned128(lowerRow + 1);
+
+        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
+      }
+    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
+        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
+
+        __m128i a = loadUnaligned128(upperRow);
+        __m128i b = loadUnaligned128(upperRow + 1);
+        __m128i c = _mm_load_si128(lowerRow);
+        __m128i d = _mm_load_si128(lowerRow + 1);
+
+        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
+      }
+    } else {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
+        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
+
+        __m128i a = loadUnaligned128(upperRow);
+        __m128i b = loadUnaligned128(upperRow + 1);
+        __m128i c = loadUnaligned128(lowerRow);
+        __m128i d = loadUnaligned128(lowerRow + 1);
+
+        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
+      }
+    }
+
+    uint32_t *unalignedStorage = (uint32_t*)storage;
+    // Take care of the final pixels; we know there's an even number of pixels
+    // in the source rectangle. We use a 2x2 'simd' implementation for this.
+    //
+    // Potentially we only have to do this in the last row, since overflowing by
+    // 8 pixels in an earlier row would appear to be harmless: it doesn't touch
+    // invalid memory, even when reading and writing to the same surface. In
+    // practice we only do this when doing an additional downscale pass, and in
+    // that situation we have unused stride to write into harmlessly. I do not
+    // believe the additional code complexity would be worth it though.
+    for (; x < aSourceSize.width; x += 2) {
+      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
+      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);
+
+      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
+                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
+    }
+  }
+}
+
+void
+ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
+                                        const IntSize &aSourceSize, uint8_t *aDest,
+                                        uint32_t aDestStride)
+{
+  for (int y = 0; y < aSourceSize.height; y += 2) {
+    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
+    int x = 0;
+    // Run a loop variant depending on the alignment of the two source rows.
+    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
+        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
+      for (; x < (aSourceSize.width - 3); x += 4) {
+        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
+        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
+
+        __m128i a = _mm_load_si128((__m128i*)upperRow);
+        __m128i b = _mm_load_si128((__m128i*)lowerRow);
+
+        *storage++ = avg_sse2_4x2_4x1(a, b);
+      }
+    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
+      // Only the upper row is 16-byte aligned.
+      for (; x < (aSourceSize.width - 3); x += 4) {
+        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
+        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
+
+        __m128i a = _mm_load_si128((__m128i*)upperRow);
+        __m128i b = loadUnaligned128((__m128i*)lowerRow);
+
+        *storage++ = avg_sse2_4x2_4x1(a, b);
+      }
+    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
+      // Only the lower row is 16-byte aligned.
+      for (; x < (aSourceSize.width - 3); x += 4) {
+        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
+        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
+
+        __m128i a = loadUnaligned128((__m128i*)upperRow);
+        __m128i b = _mm_load_si128((__m128i*)lowerRow);
+
+        *storage++ = avg_sse2_4x2_4x1(a, b);
+      }
+    } else {
+      for (; x < (aSourceSize.width - 3); x += 4) {
+        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
+        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
+
+        __m128i a = loadUnaligned128((__m128i*)upperRow);
+        __m128i b = loadUnaligned128((__m128i*)lowerRow);
+
+        *storage++ = avg_sse2_4x2_4x1(a, b);
+      }
+    }
+
+    uint32_t *unalignedStorage = (uint32_t*)storage;
+    // Take care of the final pixels; we know there's an even number of pixels
+    // in the source rectangle.
+    //
+    // Similar overflow considerations apply as in the previous function.
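+    //
+    // The tail averages one vertically adjacent pixel pair per iteration with
+    // Avg2, producing a single output pixel each time.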
+    for (; x < aSourceSize.width; x++) {
+      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
+      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
+
+      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
+    }
+  }
+}
+
+void
+ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
+                                          const IntSize &aSourceSize, uint8_t *aDest,
+                                          uint32_t aDestStride)
+{
+  for (int y = 0; y < aSourceSize.height; y++) {
+    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
+    int x = 0;
+    // Run a loop variant depending on the alignment of the source row.
+    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
+
+        __m128i a = _mm_load_si128(pixels);
+        __m128i b = _mm_load_si128(pixels + 1);
+
+        *storage++ = avg_sse2_8x1_4x1(a, b);
+      }
+    } else {
+      for (; x < (aSourceSize.width - 7); x += 8) {
+        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
+
+        __m128i a = loadUnaligned128(pixels);
+        __m128i b = loadUnaligned128(pixels + 1);
+
+        *storage++ = avg_sse2_8x1_4x1(a, b);
+      }
+    }
+
+    uint32_t *unalignedStorage = (uint32_t*)storage;
+    // Take care of the final pixels; we know there's an even number of pixels
+    // in the source rectangle.
+    //
+    // Similar overflow considerations apply as in the previous function.
+    for (; x < aSourceSize.width; x += 2) {
+      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));
+
+      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
+    }
+  }
+}
+
+}
+}