/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Blur.h"

#include "SSEHelpers.h"

#include <string.h>

namespace mozilla {
namespace gfx {

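// Divides four packed 32-bit values by multiplying with a fixed-point
// reciprocal. aDivisor is expected to hold 2^32 / divisor in every lane (see
// BoxBlur_SSE2 below), so each quotient is the rounded high 32 bits of the
// corresponding 64-bit product.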
MOZ_ALWAYS_INLINE
__m128i Divide(__m128i aValues, __m128i aDivisor)
{
  const __m128i mask = _mm_setr_epi32(0x0, 0xffffffff, 0x0, 0xffffffff);
  static const union {
    int64_t i64[2];
    __m128i m;
  } roundingAddition = { { int64_t(1) << 31, int64_t(1) << 31 } };

  __m128i multiplied31 = _mm_mul_epu32(aValues, aDivisor);
  __m128i multiplied42 = _mm_mul_epu32(_mm_srli_epi64(aValues, 32), aDivisor);

  // Add 1 << 31 before shifting or masking the lower 32 bits away, so that the
  // result is rounded.
  __m128i p_3_1 = _mm_srli_epi64(_mm_add_epi64(multiplied31, roundingAddition.m), 32);
  __m128i p4_2_ = _mm_and_si128(_mm_add_epi64(multiplied42, roundingAddition.m), mask);
  __m128i p4321 = _mm_or_si128(p_3_1, p4_2_);
  return p4321;
}

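// Computes four blurred pixels from integral-image corner values: each box sum
// is bottomRight - topRight - bottomLeft + topLeft, which is then divided by
// the box area held (as a reciprocal) in aDivisor.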
MOZ_ALWAYS_INLINE
__m128i BlurFourPixels(const __m128i& aTopLeft, const __m128i& aTopRight,
                       const __m128i& aBottomRight, const __m128i& aBottomLeft,
                       const __m128i& aDivisor)
{
  __m128i values = _mm_add_epi32(_mm_sub_epi32(_mm_sub_epi32(aBottomRight, aTopRight), aBottomLeft), aTopLeft);
  return Divide(values, aDivisor);
}

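// Builds one row of horizontal running sums from a source row, clamping to the
// first source pixel across the left inflation and to the last source pixel
// across the right inflation.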
MOZ_ALWAYS_INLINE
void LoadIntegralRowFromRow(uint32_t *aDest, const uint8_t *aSource,
                            int32_t aSourceWidth, int32_t aLeftInflation,
                            int32_t aRightInflation)
{
  int32_t currentRowSum = 0;

  for (int x = 0; x < aLeftInflation; x++) {
    currentRowSum += aSource[0];
    aDest[x] = currentRowSum;
  }
  for (int x = aLeftInflation; x < (aSourceWidth + aLeftInflation); x++) {
    currentRowSum += aSource[(x - aLeftInflation)];
    aDest[x] = currentRowSum;
  }
  for (int x = (aSourceWidth + aLeftInflation); x < (aSourceWidth + aLeftInflation + aRightInflation); x++) {
    currentRowSum += aSource[aSourceWidth - 1];
    aDest[x] = currentRowSum;
  }
}

// This function calculates an integral of the four pixels stored in the 4
// 32-bit integers of aPixels, i.e. for { 30, 50, 80, 100 } this returns
// { 30, 80, 160, 260 }. This seems to be the fastest way to do this after
// much testing.
MOZ_ALWAYS_INLINE
__m128i AccumulatePixelSums(__m128i aPixels)
{
  __m128i sumPixels = aPixels;
  __m128i currentPixels = _mm_slli_si128(aPixels, 4);
  sumPixels = _mm_add_epi32(sumPixels, currentPixels);
  currentPixels = _mm_unpacklo_epi64(_mm_setzero_si128(), sumPixels);

  return _mm_add_epi32(sumPixels, currentPixels);
}

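// Builds a summed-area table (integral image) for the source surface,
// edge-extended by the requested inflation on every side, so that any box sum
// over the source can later be read back with four corner lookups.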
MOZ_ALWAYS_INLINE void
GenerateIntegralImage_SSE2(int32_t aLeftInflation, int32_t aRightInflation,
                           int32_t aTopInflation, int32_t aBottomInflation,
                           uint32_t *aIntegralImage, size_t aIntegralImageStride,
                           uint8_t *aSource, int32_t aSourceStride, const IntSize &aSize)
{
  MOZ_ASSERT(!(aLeftInflation & 3));

  uint32_t stride32bit = aIntegralImageStride / 4;

  IntSize integralImageSize(aSize.width + aLeftInflation + aRightInflation,
                            aSize.height + aTopInflation + aBottomInflation);

  LoadIntegralRowFromRow(aIntegralImage, aSource, aSize.width, aLeftInflation, aRightInflation);

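  // Every row in the top inflation area repeats the first source row, so each
  // integral row there is simply the first row's sums added to the row above.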
  for (int y = 1; y < aTopInflation + 1; y++) {
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint32_t *intFirstRow = aIntegralImage;

    for (int x = 0; x < integralImageSize.width; x += 4) {
      __m128i firstRow = _mm_load_si128((__m128i*)(intFirstRow + x));
      __m128i previousRow = _mm_load_si128((__m128i*)(intPrevRow + x));
      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(firstRow, previousRow));
    }
  }

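  // Main rows: build the horizontal prefix sum of the current source row
  // (clamped to the first pixel across the left inflation) four pixels at a
  // time, then add the previous integral row to extend the sums vertically.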
  for (int y = aTopInflation + 1; y < (aSize.height + aTopInflation); y++) {
    __m128i currentRowSum = _mm_setzero_si128();
    uint32_t *intRow = aIntegralImage + (y * stride32bit);
    uint32_t *intPrevRow = aIntegralImage + (y - 1) * stride32bit;
    uint8_t *sourceRow = aSource + aSourceStride * (y - aTopInflation);

    uint32_t pixel = sourceRow[0];
    for (int x = 0; x < aLeftInflation; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_shuffle_epi32(_mm_set1_epi32(pixel), _MM_SHUFFLE(0, 0, 0, 0)));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
    for (int x = aLeftInflation; x < (aSize.width + aLeftInflation); x += 4) {
      uint32_t pixels = *(uint32_t*)(sourceRow + (x - aLeftInflation));

      // It's important to shuffle here. When we exit this loop currentRowSum
      // has to be set to sumPixels, so that the following loop can get the
      // correct pixel for the currentRowSum. The highest order pixel in
      // currentRowSum could've originated from accumulation in the stride.
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));

      __m128i sumPixels = AccumulatePixelSums(_mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_set1_epi32(pixels), _mm_setzero_si128()), _mm_setzero_si128()));
      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = sumPixels;

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }

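    // Right inflation: the remaining pixels all repeat the last source column.
    // When aSize.width is not a multiple of 4, the running sum is first carried
    // forward one pixel at a time until x is 16-byte aligned again, after which
    // the SIMD loop below takes over.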
    pixel = sourceRow[aSize.width - 1];
    int x = (aSize.width + aLeftInflation);
    if ((aSize.width & 3)) {
      // Deal with unaligned portion. Get the correct pixel from currentRowSum,
      // see explanation above.
      uint32_t intCurrentRowSum = ((uint32_t*)&currentRowSum)[(aSize.width % 4) - 1];
      for (; x < integralImageSize.width; x++) {
        // We could be unaligned here!
        if (!(x & 3)) {
          // aligned!
          currentRowSum = _mm_set1_epi32(intCurrentRowSum);
          break;
        }
        intCurrentRowSum += pixel;
        intRow[x] = intPrevRow[x] + intCurrentRowSum;
      }
    } else {
      currentRowSum = _mm_shuffle_epi32(currentRowSum, _MM_SHUFFLE(3, 3, 3, 3));
    }
    for (; x < integralImageSize.width; x += 4) {
      __m128i sumPixels = AccumulatePixelSums(_mm_set1_epi32(pixel));

      sumPixels = _mm_add_epi32(sumPixels, currentRowSum);

      currentRowSum = _mm_shuffle_epi32(sumPixels, _MM_SHUFFLE(3, 3, 3, 3));

      _mm_store_si128((__m128i*)(intRow + x), _mm_add_epi32(sumPixels, _mm_load_si128((__m128i*)(intPrevRow + x))));
    }
  }

  if (aBottomInflation) {
    // Store the last valid row of our source image in the last row of
    // our integral image. This will be overwritten with the correct values
    // in the upcoming loop.
    LoadIntegralRowFromRow(aIntegralImage + (integralImageSize.height - 1) * stride32bit,
                           aSource + (aSize.height - 1) * aSourceStride, aSize.width, aLeftInflation, aRightInflation);

    for (int y = aSize.height + aTopInflation; y < integralImageSize.height; y++) {
      __m128i *intRow = (__m128i*)(aIntegralImage + (y * stride32bit));
      __m128i *intPrevRow = (__m128i*)(aIntegralImage + (y - 1) * stride32bit);
      __m128i *intLastRow = (__m128i*)(aIntegralImage + (integralImageSize.height - 1) * stride32bit);

      for (int x = 0; x < integralImageSize.width; x += 4) {
        _mm_store_si128(intRow + (x / 4),
                        _mm_add_epi32(_mm_load_si128(intLastRow + (x / 4)),
                                      _mm_load_si128(intPrevRow + (x / 4))));
      }
    }
  }
}

/**
 * Attempt to do an in-place box blur using an integral image.
 */
void
AlphaBoxBlur::BoxBlur_SSE2(uint8_t* aData,
                           int32_t aLeftLobe,
                           int32_t aRightLobe,
                           int32_t aTopLobe,
                           int32_t aBottomLobe,
                           uint32_t *aIntegralImage,
                           size_t aIntegralImageStride)
{
  IntSize size = GetSize();

  MOZ_ASSERT(size.height > 0);

  // Our 'left' or 'top' lobe will include the current pixel. i.e. when
  // looking at an integral image the value of a pixel at 'x,y' is calculated
  // using the integral image values above/below that.
  aLeftLobe++;
  aTopLobe++;
  int32_t boxSize = (aLeftLobe + aRightLobe) * (aTopLobe + aBottomLobe);

  MOZ_ASSERT(boxSize > 0);

  if (boxSize == 1) {
    return;
  }

  uint32_t reciprocal = uint32_t((uint64_t(1) << 32) / boxSize);
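  // Multiplying by this 2^32 / boxSize reciprocal in Divide() and keeping the
  // rounded high 32 bits replaces a per-pixel integer division by the box
  // area.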

  uint32_t stride32bit = aIntegralImageStride / 4;
  int32_t leftInflation = RoundUpToMultipleOf4(aLeftLobe).value();

  GenerateIntegralImage_SSE2(leftInflation, aRightLobe, aTopLobe, aBottomLobe,
                             aIntegralImage, aIntegralImageStride, aData,
                             mStride, size);

  __m128i divisor = _mm_set1_epi32(reciprocal);

  // This points to the start of the rectangle within the IntegralImage that overlaps
  // the surface being blurred.
  uint32_t *innerIntegral = aIntegralImage + (aTopLobe * stride32bit) + leftInflation;

  IntRect skipRect = mSkipRect;
  int32_t stride = mStride;
  uint8_t *data = aData;
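  // For every output pixel the four base pointers set up below give the
  // corners of its box in the integral image; the blurred value is
  // (bottomRight - topRight - bottomLeft + topLeft) / boxSize.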
  for (int32_t y = 0; y < size.height; y++) {
    bool inSkipRectY = y > skipRect.y && y < skipRect.YMost();

    uint32_t *topLeftBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) - aLeftLobe);
    uint32_t *topRightBase = innerIntegral + ((y - aTopLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomRightBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) + aRightLobe);
    uint32_t *bottomLeftBase = innerIntegral + ((y + aBottomLobe) * ptrdiff_t(stride32bit) - aLeftLobe);

    int32_t x = 0;
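    // Once a block on this row lies strictly inside the skip rect, x jumps
    // straight to the far edge of the rect, so that span is never blurred.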
    // Process 16 pixels at a time for as long as possible.
    for (; x <= size.width - 16; x += 16) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 16;
        // Trigger an early jump out of the skip test on the coming loop
        // iterations; inSkipRectY will be reset on the next scan line anyway.
        inSkipRectY = false;
        continue;
      }

      __m128i topLeft;
      __m128i topRight;
      __m128i bottomRight;
      __m128i bottomLeft;

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));
      __m128i result1 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 4));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 4));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 4));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 4));
      __m128i result2 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 8));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 8));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 8));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 8));
      __m128i result3 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

      topLeft = loadUnaligned128((__m128i*)(topLeftBase + x + 12));
      topRight = loadUnaligned128((__m128i*)(topRightBase + x + 12));
      bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x + 12));
      bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x + 12));
      __m128i result4 = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);

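      // Pack the sixteen 32-bit averages down to sixteen 8-bit alpha values
      // and store them with a single unaligned write.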
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result1, result2), _mm_packs_epi32(result3, result4));

      _mm_storeu_si128((__m128i*)(data + stride * y + x), final);
    }

    // Process the remaining pixels four at a time.
    for (; x < size.width; x += 4) {
      if (inSkipRectY && x > skipRect.x && x < skipRect.XMost()) {
        x = skipRect.XMost() - 4;
        // Trigger an early jump out of the skip test on the coming loop
        // iterations; inSkipRectY will be reset on the next scan line anyway.
        inSkipRectY = false;
        continue;
      }
      __m128i topLeft = loadUnaligned128((__m128i*)(topLeftBase + x));
      __m128i topRight = loadUnaligned128((__m128i*)(topRightBase + x));
      __m128i bottomRight = loadUnaligned128((__m128i*)(bottomRightBase + x));
      __m128i bottomLeft = loadUnaligned128((__m128i*)(bottomLeftBase + x));

      __m128i result = BlurFourPixels(topLeft, topRight, bottomRight, bottomLeft, divisor);
      __m128i final = _mm_packus_epi16(_mm_packs_epi32(result, _mm_setzero_si128()), _mm_setzero_si128());

      *(uint32_t*)(data + stride * y + x) = _mm_cvtsi128_si32(final);
    }
  }

}

}
}