gfx/2d/ImageScalingSSE2.cpp

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Tue, 06 Jan 2015 21:39:09 +0100
branch       TOR_BUG_9701
changeset    8:97036ab72558
permissions  -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 *   R = S + 2C or in the case of a and b (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is then ((carry << 1) + sum) >> 1, or,
 * assuming overflows and underflows are avoided, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 *   sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4 input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 *   avg = ((sum >> 1) + carry + d >> 1) >> 1
 *   avg = ((a + b + c) >> 1 + d >> 1) >> 1
 *   avg = ((a + b + c + d) >> 2)
 *
 * An additional fact used in the SSE versions is the concept that we can
 * trivially convert a rounded average to a truncated average:
 *
 * We have:
 *   f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 *   g(a, b) = (a + b) >> 1
 *
 * Observe:
 *   ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *              == ~((-a - 1 + -b - 1 + 1) >> 1)
 *              == ~((-a - 1 + -b) >> 1)
 *              == ~((-(a + b) - 1) >> 1)
 *              == ~((~(a + b)) >> 1)
 *              == (a + b) >> 1
 *              == g(a, b)
 */
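
/* Editor's note: the block below is an illustrative sketch, not part of the
 * original file, and is disabled from the build. It spot-checks the facts
 * above with plain scalar arithmetic: the carry-save form of a three-input
 * sum, the structure of the four-input average, and the complement trick that
 * turns a rounding byte average into a truncating one. The helper names are
 * hypothetical.
 */
#if 0
#include <cassert>
#include <cstdint>

// Per-byte rounding average, i.e. what _mm_avg_epu8 computes in each lane.
static uint8_t RoundedAvg(uint8_t a, uint8_t b)
{
  return uint8_t((a + b + 1) >> 1);
}

static void CheckAveragingFacts(uint8_t a, uint8_t b, uint8_t c, uint8_t d)
{
  // Carry-save form of a + b + c: sum keeps each bit column's sum, carry the
  // bits that would carry out of it, so sum + 2 * carry == a + b + c.
  unsigned sum = a ^ b ^ c;
  unsigned carry = (a & b) | (a & c) | (b & c);
  assert(sum + 2 * carry == unsigned(a) + b + c);

  // Fold in d, then the carry, the way avg_sse2_8x2 does below; the result is
  // the exact truncated four-input average.
  unsigned sum2 = (sum + d) >> 1;
  assert(((sum2 + carry) >> 1) == (unsigned(a) + b + c + d) >> 2);

  // Complement trick: a rounding byte average becomes a truncating one.
  assert(uint8_t(~RoundedAvg(uint8_t(~a), uint8_t(~b))) == uint8_t((a + b) >> 1));
}
#endif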

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
{
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here, MSVC does not allow passing more than 3
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
{
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm) \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)));

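  // Deinterleave the two row fragments: after these shuffles *a/*c hold the
  // even-column pixels and *b/*d the odd-column pixels of the upper and lower
  // rows, so each 2x2 source block lines up across the four registers.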
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

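  // Carry-save addition of three of the four inputs (see the block comment at
  // the top of the file): sum + 2 * carry == *a + *b + *c, per byte.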
  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

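  // Average in the fourth input, then the carry. The complements turn
  // _mm_avg_epu8's rounding average into the truncating average derived above.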
  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

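// Truncating per-byte average of two registers, used for the vertical 2:1
// reduction: each output pixel is the average of the pixel above and below it.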
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
{
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

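// Horizontal 2:1 reduction: deinterleave eight consecutive pixels into their
// odd and even columns, then take the truncating per-byte average of the two.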
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
{
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Not having a byte based average instruction means we should mask to avoid
  // underflow.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2 pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
{
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
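
/* Editor's note: the block below is an illustrative sketch, not part of the
 * original file, and is disabled from the build. It demonstrates the packed
 * behaviour of Avg2: masking with 0xfefefefe clears the low bit of every byte
 * before the shift, so no bit leaks into the neighbouring byte and the single
 * 32-bit expression yields four independent truncating byte averages. The
 * helper name is hypothetical.
 */
#if 0
#include <cassert>

static void CheckPackedAvg2()
{
  // Per-byte expectations: (0xff + 0x01) >> 1 = 0x80, (0x10 + 0x30) >> 1 = 0x20,
  // (0x02 + 0x04) >> 1 = 0x03, (0x03 + 0x05) >> 1 = 0x04.
  assert(Avg2(0xff100203u, 0x01300405u) == 0x80200304u);
}
#endif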

namespace mozilla {
namespace gfx {

void
ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                  const IntSize &aSourceSize, uint8_t *aDest,
                                  uint32_t aDestStride)
{
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)upperRow + 1);
        __m128i c = _mm_load_si128((__m128i*)lowerRow);
        __m128i d = _mm_load_si128((__m128i*)lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overrunning
    // by up to 8 pixels in an earlier row appears to be harmless: it doesn't
    // touch invalid memory, even when reading and writing to the same surface.
    // In practice we only do this when doing an additional downscale pass, and
    // in that situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}
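
/* Editor's note: the block below is an illustrative sketch, not part of the
 * original file, and is disabled from the build. It shows the buffer and
 * stride bookkeeping for halving a 16x16 BGRA surface. In the real class this
 * routine is an internal helper reached through ImageHalfScaler's public
 * scaling API; the direct construction and call here are assumptions made for
 * illustration only.
 */
#if 0
static void HalfScale2DExample()
{
  const int width = 16, height = 16, bytesPerPixel = 4;
  const int32_t srcStride = width * bytesPerPixel;        // tightly packed rows
  const uint32_t dstStride = (width / 2) * bytesPerPixel; // half-width rows

  alignas(16) uint8_t src[16 * 16 * 4] = {};  // sample pixel data
  // The main loops store through an aligned __m128i pointer, so destination
  // rows must be 16-byte aligned; a 32-byte stride keeps every row aligned.
  alignas(16) uint8_t dst[8 * 8 * 4];

  ImageHalfScaler scaler(src, srcStride, IntSize(width, height));
  scaler.HalfImage2D_SSE2(src, srcStride, IntSize(width, height),
                          dst, dstStride);
}
#endif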

void
ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                        const IntSize &aSourceSize, uint8_t *aDest,
                                        uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // This line doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous function apply.
    for (; x < aSourceSize.width; x++) {
      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

void
ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
                                          const IntSize &aSourceSize, uint8_t *aDest,
                                          uint32_t aDestStride)
{
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t *unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // The same overflow considerations as in the previous function apply.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

}
}
