/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or, in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * ignoring overflow and underflow, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = (((sum >> 1) + carry + (d >> 1)) >> 1)
 * avg = ((((a + b + c) >> 1) + (d >> 1)) >> 1)
 * avg = ((a + b + c + d) >> 2)
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
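
/* A minimal scalar sketch of the identities above for a single 8-bit lane,
 * kept out of the build with #if 0; the helper name CheckAverageIdentities
 * is purely illustrative and not part of this file's API.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void CheckAverageIdentities(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
{
  // Carry-save addition: a + b + c == sum + 2 * carry.
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);
  assert(uint32_t(a) + b + c == sum + 2 * carry);

  // Two truncating averages give the exact four-input truncated average.
  uint32_t avg = (((sum + d) >> 1) + carry) >> 1;
  assert(avg == (uint32_t(a) + b + c + d) >> 2);

  // Complementing turns the rounded average into a truncated one:
  // ~((~a + ~b + 1) >> 1) == (a + b) >> 1 for 8-bit operands.
  uint8_t na = uint8_t(~a), nb = uint8_t(~b);
  assert(uint8_t(~((na + nb + 1) >> 1)) == uint8_t((uint32_t(a) + b) >> 1));
}
#endif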

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
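// Averages each 2x2 block of pixels: on entry *a and *b hold eight
// consecutive 32-bit pixels of the upper row, *c and *d the eight pixels
// directly below them. The return value holds the four averaged output
// pixels.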
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))

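  // shuf1 gathers the even-indexed 32-bit pixels of its two operands, shuf2
  // the odd-indexed ones. After the shuffles *a/*c hold the left pixel and
  // *b/*d the right pixel of each horizontal pair in the upper and lower
  // source row respectively.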
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

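  // Carry-save addition of three of the four pixels in each block (now in
  // *a, *b and *c): their sum equals sum + 2 * carry. The two byte averages
  // below fold in *d and then the carry, using the complement trick from the
  // header comment so the averages truncate instead of round.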
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

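// Truncated per-byte average of two rows: a holds four 32-bit pixels, b the
// four pixels directly below them. Used by the vertical halving loop below.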
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

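// Averages horizontally adjacent pixels: a and b together hold eight
// consecutive 32-bit pixels. The shuffles separate the odd- and even-indexed
// pixels, which are then averaged with truncation, giving four output
// pixels. Used by the horizontal halving loop below.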
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

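// Scalar version that averages a 2x2 block of 32-bit pixels with
// SIMD-within-a-register arithmetic. It relies on the identity
// x + y == (x ^ y) + 2 * (x & y), so (x + y) >> 1 == ((x ^ y) >> 1) + (x & y).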
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we mask off each byte's low bit
  // before shifting, so that no bit leaks into the byte below it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}

namespace mozilla {
namespace gfx {

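// Halves an image in both dimensions: every output pixel is the average of a
// 2x2 block of source pixels. The four SSE2 loop variants differ only in
// whether the upper and lower source rows are 16-byte aligned; the scalar
// tail loop handles the remaining (fewer than eight) columns of each row
// pair.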
void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Pick a loop variant based on the alignment of the two source rows.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of
    // pixels in the source rectangle. We use a 2x2 SIMD-within-a-register
    // implementation for this.
    //
    // Potentially we only have to do this in the last row, since overrunning
    // by up to 8 pixels in an earlier row appears to be harmless: it doesn't
    // touch invalid memory, even when reading and writing to the same
    // surface. In practice we only do this when doing an additional
    // downscale pass, and in that situation we have unused stride to write
    // into harmlessly. I do not believe the additional code complexity would
    // be worth it, though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

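// Halves the image vertically only: every output pixel is the average of a
// source pixel and the pixel directly below it. The loop variants again
// differ only in the alignment of the two source rows.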
void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Pick a loop variant based on the alignment of the two source rows.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // The lower row doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of
    // pixels in the source rectangle.
    //
    // Similar overrun considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

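// Halves the image horizontally only: every output pixel is the average of
// two horizontally adjacent source pixels.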
void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Pick a loop variant based on the alignment of the source row.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of
    // pixels in the source rectangle.
    //
    // Similar overrun considerations apply as in the previous functions.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

} // namespace gfx
} // namespace mozilla