--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/2d/BlurSSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,315 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "Blur.h"
+
+#include "SSEHelpers.h"
+
+#include <string.h>
+
+namespace mozilla {
+namespace gfx {
+
+MOZ_ALWAYS_INLINE
+__m128i Divide(__m128i aValues, __m128i aDivisor)
+{
+  const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
+  static const union {
+    int64_t i64[2];
+    __m128i m;
+  } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } };
+
+  __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor);
+  __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor);
+
+  // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
+  // result is rounded.
+  __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32);
+  __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask);
+  __m128i p4321 = _mm_or_si128(p_3_1, p4_2_);
+  return p4321;
+}
+
+MOZ_ALWAYS_INLINE
+__m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight,
+                       const __m128i& aBottomRight, const __m128i& aBottomLeft,
+                       const __m128i& aDivisor)
+{
+  __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
+  return Divide(values, aDivisor);
+}
+
+MOZ_ALWAYS_INLINE
+void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
+                            int32_t aSourceWidth, int32_t aLeftInflation,
+                            int32_t aRightInflation)
+{
+  int32_t currentRowSum = 0;
+
+  for (int x = 0; x < aLeftInflation; x++) {
+    currentRowSum += aSource[0];
+    aDest[x] = currentRowSum;
+  }
+  for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
+    currentRowSum += aSource[(x - aLeftInflation)];
+    aDest[x] = currentRowSum;
+  }
+  for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
+    currentRowSum += aSource[aSourceWidth - 1];
+    aDest[x] = currentRowSum;
+  }
+}
+
+// This function calculates an integral (running sum) of the four pixels
+// stored in the 4 32-bit integers of aPixels, e.g. for { 30, 50, 80, 100 }
+// this returns { 30, 80, 160, 260 }. This seems to be the fastest way to do
+// this after much testing.
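+// In effect it is a two-step inclusive prefix sum across the four 32-bit
+// lanes: the first shift/add gives every lane the sum of itself and its left
+// neighbour, and the second step (an 8-byte shift done with unpacklo against
+// zero) adds the running total of the low two lanes into the high two.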
+MOZ_ALWAYS_INLINE
+__m128i AccumulatePixelSums(__m128i aPixels)
+{
+  __m128i sumPixels = aPixels;
+  __m128i currentPixels = _mm_slli_si128(aPixels, 4);
+  sumPixels = _mm_add_epi32(sumPixels, currentPixels);
+  currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);
+
+  return _mm_add_epi32(sumPixels, currentPixels);
+}
+
+MOZ_ALWAYS_INLINE void
+GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
+                           int32_t aTopInflation, int32_t aBottomInflation,
+                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
+                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
+{
+  MOZ_ASSERT(!(aLeftInflation & 3));
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+
+  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
+                            aSize.height + aTopInflation + aBottomInflation);
+
+  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);
+
+  for (int y = 1; y < aTopInflation + 1; y++) {
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+    uint32_t *intFirstRow = aIntegralImage;
+
+    for (int x = 0; x < integralImageSize.width; x += 4) {
+      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
+      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
+    }
+  }
+
+  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
+    __m128i currentRowSum = _mm_setzero_si128();
+    uint32_t *intRow = aIntegralImage + (y * stride32bit);
+    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
+    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);
+
+    uint32_t pixel = sourceRow[0];
+    for (int x = 0; x < aLeftInflation; x += 4) {
+      __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));
+
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
+      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));
+
+      // It's important to shuffle here: when we exit this loop, currentRowSum
+      // has to be set to sumPixels so that the following loop can pick the
+      // correct pixel out of currentRowSum. The highest-order pixel in
+      // currentRowSum could have originated from accumulation in the stride.
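+      // Lane 3 of currentRowSum always holds the running total of the row so
+      // far, so broadcasting it with the shuffle below re-seeds all four
+      // lanes for the next group of four pixels.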
+      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
+
+      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = sumPixels;
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+
+    pixel = sourceRow[aSize.width - 1];
+    int x = (aSize.width + aLeftInflation);
+    if ((aSize.width & 3)) {
+      // Deal with the unaligned portion. Get the correct pixel from
+      // currentRowSum; see the explanation above.
+      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
+      for (; x < integralImageSize.width; x++) {
+        // We could be unaligned here!
+        if (!(x & 3)) {
+          // Aligned again; switch back to the vector loop below.
+          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
+          break;
+        }
+        intCurrentRowSum += pixel;
+        intRow[x] = intPrevRow[x] + intCurrentRowSum;
+      }
+    } else {
+      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
+    }
+    for (; x < integralImageSize.width; x += 4) {
+      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));
+
+      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);
+
+      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));
+
+      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
+    }
+  }
+
+  if (aBottomInflation) {
+    // Store the last valid row of our source image in the last row of
+    // our integral image. This will be overwritten with the correct values
+    // in the upcoming loop.
+    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
+                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);
+
+    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
+      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
+      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
+      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);
+
+      for (int x = 0; x < integralImageSize.width; x += 4) {
+        _mm_store_si128(intRow + (x / 4),
+                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
+                                      _mm_load_si128(intPrevRow + (x / 4))));
+      }
+    }
+  }
+}
+
+/**
+ * Attempt to do an in-place box blur using an integral image.
+ */
+void
+AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData,
+                           int32_t aLeftLobe,
+                           int32_t aRightLobe,
+                           int32_t aTopLobe,
+                           int32_t aBottomLobe,
+                           uint32_t *aIntegralImage,
+                           size_t aIntegralImageStride)
+{
+  IntSize size = GetSize();
+
+  MOZ_ASSERT(size.height > 0);
+
+  // Our 'left' or 'top' lobe will include the current pixel, i.e. when
+  // looking at an integral image the value of a pixel at 'x,y' is calculated
+  // using the integral image values above/below that.
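+  // Hence the increments below. Afterwards boxSize is the full pixel area of
+  // the box filter, and dividing by it is done in Divide() by multiplying
+  // with a 2^32 / boxSize fixed-point reciprocal rather than with a
+  // per-pixel integer division.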
+  aLeftLobe++;
+  aTopLobe++;
+  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);
+
+  MOZ_ASSERT(boxSize > 0);
+
+  if (boxSize == 1) {
+    return;
+  }
+
+  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
+
+  uint32_t stride32bit = aIntegralImageStride / 4;
+  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();
+
+  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
+                             aIntegralImage, aIntegralImageStride, aData,
+                             mStride, size);
+
+  __m128i divisor = _mm_set1_epi32(reciprocal);
+
+  // This points to the start of the rectangle within the IntegralImage that
+  // overlaps the surface being blurred.
+  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;
+
+  IntRect skipRect = mSkipRect;
+  int32_t stride = mStride;
+  uint8_t *data = aData;
+  for (int32_t y = 0; y < size.height; y++) {
+    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();
+
+    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
+    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
+    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
+    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
+
+    int32_t x = 0;
+    // Process 16 pixels at a time for as long as possible.
+    for (; x <= size.width - 16; x += 16) {
+      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
+        x = skipRect.XMost() - 16;
+        // Trigger the early jump on the coming loop iterations; this will be
+        // reset on the next scanline anyway.
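+        // After the jump, the loop increment leaves x at skipRect.XMost(),
+        // the first pixel past the skip rect. Clearing inSkipRectY prevents
+        // re-entering this branch for the rest of this scanline.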
+        inSkipRectY = false;
+        continue;
+      }
+
+      __m128i topLeft;
+      __m128i topRight;
+      __m128i bottomRight;
+      __m128i bottomLeft;
+
+      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
+      topRight = loadUnaligned128((__m128i*)(topRightBase + x));
+      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
+      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
+      __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
+
+      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4));
+      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4));
+      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4));
+      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4));
+      __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
+
+      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8));
+      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8));
+      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8));
+      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8));
+      __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
+
+      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12));
+      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12));
+      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12));
+      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12));
+      __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
+
+      // Pack the sixteen 32-bit results down to sixteen 8-bit alpha values
+      // with saturation.
+      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4));
+
+      _mm_storeu_si128((__m128i*)(data + stride * y + x), final);
+    }
+
+    // Process the remaining pixels 4 bytes at a time.
+    for (; x < size.width; x += 4) {
+      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
+        x = skipRect.XMost() - 4;
+        // Trigger the early jump on the coming loop iterations; this will be
+        // reset on the next scanline anyway.
+        inSkipRectY = false;
+        continue;
+      }
+      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
+      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
+      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
+      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
+
+      __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
+      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128());
+
+      *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final);
+    }
+  }
+}
+
+} // namespace gfx
+} // namespace mozilla