gfx/2d/ImageScalingSSE2.cpp

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/2d/ImageScalingSSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,330 @@
     1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
     1.5 + * This Source Code Form is subject to the terms of the Mozilla Public
     1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this
     1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
     1.8 +
     1.9 +#include "ImageScaling.h"
    1.10 +#include "mozilla/Attributes.h"
    1.11 +
    1.12 +#include "SSEHelpers.h"
    1.13 +
    1.14 +/* The functions below use the following system for averaging 4 pixels:
    1.15 + *
     1.16 + * The first observation is that a half-adder decomposes an addition as
     1.17 + * R = S + 2C, or, for two values a and b: a + b = (a ^ b) + ((a & b) << 1).
     1.18 + *
     1.19 + * This can be trivially extended to three pixels by observing that when
     1.20 + * taking (a ^ b ^ c) as the sum, the carry is simply the bitwise OR of the
     1.21 + * pairwise carries (a & b), (a & c) and (b & c), since the sum of 3 bits can
     1.22 + * only ever produce a carry of one.
    1.23 + *
     1.24 + * We then observe that the average is ((carry << 1) + sum) >> 1, or,
     1.25 + * provided overflow and underflow are avoided, carry + (sum >> 1).
    1.26 + *
     1.27 + * We now average our existing sum with the fourth number, so we get:
     1.28 + * sum2 = (sum + d) >> 1, or approximately (sum >> 1) + (d >> 1).
     1.29 + *
     1.30 + * We now observe that the shifted sum has the same weight as the carry, so
     1.31 + * we can average it with the carry to get the final 4-input average:
     1.32 + * avg = (sum2 + carry) >> 1;
    1.33 + *
     1.34 + * Or, reading the proof backwards:
     1.35 + * avg = ((sum >> 1) + (d >> 1) + carry) >> 1
     1.36 + * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
     1.37 + * avg = (a + b + c + d) >> 2
    1.38 + *
     1.39 + * An additional trick used in the SSE versions is that a rounded average can
     1.40 + * be trivially converted into a truncated average:
    1.41 + *
    1.42 + * We have:
    1.43 + * f(a, b) = (a + b + 1) >> 1
    1.44 + *
    1.45 + * And want:
    1.46 + * g(a, b) = (a + b) >> 1
    1.47 + *
    1.48 + * Observe:
    1.49 + * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
    1.50 + *            == ~((-a - 1 + -b - 1 + 1) >> 1)
    1.51 + *            == ~((-a - 1 + -b) >> 1)
    1.52 + *            == ~((-(a + b) - 1) >> 1)
    1.53 + *            == ~((~(a + b)) >> 1)
    1.54 + *            == (a + b) >> 1
    1.55 + *            == g(a, b)
    1.56 + */
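          +
          +/* The identities above can be illustrated with plain scalar code. The two
          + * helpers below are a minimal sketch for exposition only (hypothetical
          + * names, not used elsewhere in this file): the first shows how the rounding
          + * byte average provided by _mm_avg_epu8 is turned into a truncating one by
          + * complementing the inputs and the result; the second is the exact per-byte
          + * four-sample average that the sum/carry formulation approximates (because
          + * the fast path truncates intermediate results, it can come out at most one
          + * lower).
          + */
          +MOZ_ALWAYS_INLINE uint8_t ScalarTruncatedAvg(uint8_t a, uint8_t b)
          +{
          +  // Rounded average of the complements, with a 9-bit intermediate, exactly
          +  // as _mm_avg_epu8 computes per byte: (x + y + 1) >> 1.
          +  uint32_t roundedAvgOfComplements = ((0xffu - a) + (0xffu - b) + 1) >> 1;
          +  // Complementing the result yields the truncated average (a + b) >> 1.
          +  return static_cast<uint8_t>(0xff - roundedAvgOfComplements);
          +}
          +
          +MOZ_ALWAYS_INLINE uint8_t ScalarExactAvg4(uint8_t a, uint8_t b,
          +                                          uint8_t c, uint8_t d)
          +{
          +  return static_cast<uint8_t>((static_cast<uint32_t>(a) + b + c + d) >> 2);
          +}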
    1.57 +
    1.58 +MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg)
    1.59 +{
    1.60 +  __m128i minusone = _mm_set1_epi32(0xffffffff);
    1.61 +  return _mm_xor_si128(arg, minusone);
    1.62 +}
    1.63 +
     1.64 +/* We have to pass pointers here; MSVC does not allow passing more than 3
     1.65 + * __m128i arguments on the stack, nor does it allow 16-byte aligned stack
     1.66 + * variables. This inlines properly on MSVC 2010; it does -not- inline with
     1.67 + * just the inline directive.
     1.68 + */
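          +/* For each of the four output pixels this computes the truncating average
          + * of one 2x2 block of the 8x2 input pixels held in *a..*d (upper row in
          + * *a and *b, lower row in *c and *d). */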
    1.69 +MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i *a, __m128i *b, __m128i *c, __m128i *d)
    1.70 +{
    1.71 +#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
    1.72 +#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)
    1.73 +
     1.74 +// This cannot be an inline function, as the immediate operand of
     1.75 +// _mm_shuffle_ps needs to be a compile-time constant.
     1.76 +#define shuffle_si128(arga, argb, imm) \
     1.77 +  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), _mm_castsi128_ps((argb)), (imm)))
    1.78 +
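          +  // De-interleave the pixels: afterwards *a/*c hold the even-column pixels
          +  // of the upper/lower row and *b/*d the odd-column pixels, so a per-byte
          +  // average of the four registers yields the four 2x2 block averages.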
    1.79 +  __m128i t = shuffle_si128(*a, *b, shuf1);
    1.80 +  *b = shuffle_si128(*a, *b, shuf2);
    1.81 +  *a = t;
    1.82 +  t = shuffle_si128(*c, *d, shuf1);
    1.83 +  *d = shuffle_si128(*c, *d, shuf2);
    1.84 +  *c = t;
    1.85 +
    1.86 +#undef shuf1
    1.87 +#undef shuf2
    1.88 +#undef shuffle_si128
    1.89 +
    1.90 +  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));
    1.91 +
    1.92 +  __m128i carry = _mm_or_si128(_mm_and_si128(*a, *b), _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));
    1.93 +
    1.94 +  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));
    1.95 +
    1.96 +  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
    1.97 +}
    1.98 +
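          +// Truncating byte-wise average of two registers of 4 pixels each (e.g. the
          +// same 4 columns from two vertically adjacent rows), using the complement
          +// trick to turn the rounding _mm_avg_epu8 into a truncating average.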
    1.99 +MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b)
   1.100 +{
   1.101 +  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
   1.102 +}
   1.103 +
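          +// Average horizontally adjacent pixel pairs: a and b hold 8 consecutive
          +// pixels of one row; the shuffles gather the odd- and even-column pixels
          +// into separate registers, which are then byte-averaged with truncation.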
   1.104 +MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b)
   1.105 +{
   1.106 +  __m128i t = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
   1.107 +  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(2, 0, 2, 0)));
   1.108 +  a = t;
   1.109 +
   1.110 +  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
   1.111 +}
   1.112 +
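          +// Scalar version of avg_sse2_8x2 for a single 2x2 block: a truncating
          +// average of four 32-bit pixels, processing all four colour channels at
          +// once within one 32-bit word (SWAR). Used for trailing columns that do
          +// not fill an SSE register.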
   1.113 +MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
   1.114 +{
   1.115 +  uint32_t sum = a ^ b ^ c;
   1.116 +  uint32_t carry = (a & b) | (a & c) | (b & c);
   1.117 +
   1.118 +  uint32_t mask = 0xfefefefe;
   1.119 +
    1.120 +  // Without a byte-wise average instruction we mask with 0xfefefefe before
    1.121 +  // shifting, so that no bit leaks from one byte into the byte below it.
   1.122 +  sum = (((sum ^ d) & mask) >> 1) + (sum & d);
   1.123 +
   1.124 +  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
   1.125 +}
   1.126 +
    1.127 +// Simple 2-pixel average version of the function above.
   1.128 +MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b)
   1.129 +{
   1.130 +  uint32_t sum = a ^ b;
   1.131 +  uint32_t carry = (a & b);
   1.132 +
   1.133 +  uint32_t mask = 0xfefefefe;
   1.134 +
   1.135 +  return ((sum & mask) >> 1) + carry;
   1.136 +}
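          +
          +/* A worked example of the masked average used above, assuming illustrative
          + * channel values 0x03 and 0x06 in the lowest byte:
          + *
          + *   a & b                  = 0x02  (bits both operands share)
          + *   ((a ^ b) & 0xfe) >> 1  = 0x02  (differing bits, halved; the mask clears
          + *                                   each byte's low bit so the shift cannot
          + *                                   leak it into the byte below)
          + *   total                  = 0x04  == (0x03 + 0x06) >> 1
          + */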
   1.137 +
   1.138 +namespace mozilla {
   1.139 +namespace gfx {
   1.140 +
   1.141 +void
   1.142 +ImageHalfScaler::HalfImage2D_SSE2(uint8_t *aSource, int32_t aSourceStride,
   1.143 +                                  const IntSize &aSourceSize, uint8_t *aDest,
   1.144 +                                  uint32_t aDestStride)
   1.145 +{
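          +  // Halve the image in both dimensions: each output pixel is the truncating
          +  // average of a 2x2 block of source pixels. The outer loop consumes two
          +  // source rows per iteration; the SSE2 inner loops consume eight source
          +  // columns at a time (four output pixels per store) and the scalar tail
          +  // loop handles any remaining columns two at a time.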
   1.146 +  const int Bpp = 4;
   1.147 +
   1.148 +  for (int y = 0; y < aSourceSize.height; y += 2) {
   1.149 +    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
   1.150 +    int x = 0;
   1.151 +    // Run a loop depending on alignment.
   1.152 +    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
   1.153 +        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
   1.154 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.155 +        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
   1.156 +        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
   1.157 +
   1.158 +        __m128i a = _mm_load_si128(upperRow);
   1.159 +        __m128i b = _mm_load_si128(upperRow + 1);
   1.160 +        __m128i c = _mm_load_si128(lowerRow);
   1.161 +        __m128i d = _mm_load_si128(lowerRow + 1);
   1.162 +
   1.163 +        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
   1.164 +      }
   1.165 +    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
   1.166 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.167 +        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
   1.168 +        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
   1.169 +
   1.170 +        __m128i a = _mm_load_si128(upperRow);
   1.171 +        __m128i b = _mm_load_si128(upperRow + 1);
   1.172 +        __m128i c = loadUnaligned128(lowerRow);
   1.173 +        __m128i d = loadUnaligned128(lowerRow + 1);
   1.174 +
   1.175 +        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
   1.176 +      }
   1.177 +    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
   1.178 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.179 +        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
   1.180 +        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
   1.181 +
    1.182 +        __m128i a = loadUnaligned128(upperRow);
    1.183 +        __m128i b = loadUnaligned128(upperRow + 1);
    1.184 +        __m128i c = _mm_load_si128(lowerRow);
    1.185 +        __m128i d = _mm_load_si128(lowerRow + 1);
   1.186 +
   1.187 +        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
   1.188 +      }
   1.189 +    } else {
   1.190 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.191 +        __m128i *upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
   1.192 +        __m128i *lowerRow = (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));
   1.193 +
   1.194 +        __m128i a = loadUnaligned128(upperRow);
   1.195 +        __m128i b = loadUnaligned128(upperRow + 1);
   1.196 +        __m128i c = loadUnaligned128(lowerRow);
   1.197 +        __m128i d = loadUnaligned128(lowerRow + 1);
   1.198 +
   1.199 +        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
   1.200 +      }
   1.201 +    }
   1.202 +
   1.203 +    uint32_t *unalignedStorage = (uint32_t*)storage;
    1.204 +    // Take care of the final pixels; we know there's an even number of
    1.205 +    // pixels in the source rectangle. We use a 2x2 SWAR implementation here.
    1.206 +    //
    1.207 +    // Potentially we would only have to do this for the last row, since
    1.208 +    // overrunning by up to 8 pixels in an earlier row appears harmless: it
    1.209 +    // never touches invalid memory, even when reading and writing the same
    1.210 +    // surface. In practice we only do that during an additional downscale
    1.211 +    // pass, where there is unused stride to write into harmlessly. The extra
    1.212 +    // code complexity does not seem worth it, though.
   1.213 +    for (; x < aSourceSize.width; x += 2) {
   1.214 +      uint8_t *upperRow = aSource + (y * aSourceStride + x * Bpp);
   1.215 +      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);
   1.216 +
   1.217 +      *unalignedStorage++ = Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
   1.218 +                                   *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
   1.219 +    }
   1.220 +  }
   1.221 +}
   1.222 +
   1.223 +void
   1.224 +ImageHalfScaler::HalfImageVertical_SSE2(uint8_t *aSource, int32_t aSourceStride,
   1.225 +                                        const IntSize &aSourceSize, uint8_t *aDest,
   1.226 +                                        uint32_t aDestStride)
   1.227 +{
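          +  // Vertical-only halving: each output row is the truncating average of two
          +  // adjacent source rows, column by column; the width is preserved.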
   1.228 +  for (int y = 0; y < aSourceSize.height; y += 2) {
   1.229 +    __m128i *storage = (__m128i*)(aDest + (y / 2) * aDestStride);
   1.230 +    int x = 0;
   1.231 +    // Run a loop depending on alignment.
   1.232 +    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
   1.233 +        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
   1.234 +      for (; x < (aSourceSize.width - 3); x += 4) {
   1.235 +        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
   1.236 +        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
   1.237 +
   1.238 +        __m128i a = _mm_load_si128((__m128i*)upperRow);
   1.239 +        __m128i b = _mm_load_si128((__m128i*)lowerRow);
   1.240 +
   1.241 +        *storage++ = avg_sse2_4x2_4x1(a, b);
   1.242 +      }
   1.243 +    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
    1.244 +      // The lower row is not 16-byte aligned.
   1.245 +      for (; x < (aSourceSize.width - 3); x += 4) {
   1.246 +        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
   1.247 +        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
   1.248 +
   1.249 +        __m128i a = _mm_load_si128((__m128i*)upperRow);
   1.250 +        __m128i b = loadUnaligned128((__m128i*)lowerRow);
   1.251 +
   1.252 +        *storage++ = avg_sse2_4x2_4x1(a, b);
   1.253 +      }
    1.254 +    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
   1.255 +      for (; x < (aSourceSize.width - 3); x += 4) {
   1.256 +        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
   1.257 +        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
   1.258 +
   1.259 +        __m128i a = loadUnaligned128((__m128i*)upperRow);
   1.260 +        __m128i b = _mm_load_si128((__m128i*)lowerRow);
   1.261 +
   1.262 +        *storage++ = avg_sse2_4x2_4x1(a, b);
   1.263 +      }
   1.264 +    } else {
   1.265 +      for (; x < (aSourceSize.width - 3); x += 4) {
   1.266 +        uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
   1.267 +        uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
   1.268 +
   1.269 +        __m128i a = loadUnaligned128((__m128i*)upperRow);
   1.270 +        __m128i b = loadUnaligned128((__m128i*)lowerRow);
   1.271 +
   1.272 +        *storage++ = avg_sse2_4x2_4x1(a, b);
   1.273 +      }
   1.274 +    }
   1.275 +
   1.276 +    uint32_t *unalignedStorage = (uint32_t*)storage;
    1.277 +    // Take care of the final pixels; we know there's an even number of
    1.278 +    // pixels in the source rectangle.
    1.279 +    //
    1.280 +    // The same overrun considerations as in the previous function apply here.
   1.281 +    for (; x < aSourceSize.width; x++) {
   1.282 +      uint8_t *upperRow = aSource + (y * aSourceStride + x * 4);
   1.283 +      uint8_t *lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);
   1.284 +
   1.285 +      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
   1.286 +    }
   1.287 +  }
   1.288 +}
   1.289 +
   1.290 +void
   1.291 +ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t *aSource, int32_t aSourceStride,
   1.292 +                                          const IntSize &aSourceSize, uint8_t *aDest,
   1.293 +                                          uint32_t aDestStride)
   1.294 +{
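          +  // Horizontal-only halving: each output pixel is the truncating average of
          +  // two horizontally adjacent source pixels; the height is preserved.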
   1.295 +  for (int y = 0; y < aSourceSize.height; y++) {
   1.296 +    __m128i *storage = (__m128i*)(aDest + (y * aDestStride));
   1.297 +    int x = 0;
   1.298 +    // Run a loop depending on alignment.
   1.299 +    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
   1.300 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.301 +        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
   1.302 +
   1.303 +        __m128i a = _mm_load_si128(pixels);
   1.304 +        __m128i b = _mm_load_si128(pixels + 1);
   1.305 +
   1.306 +        *storage++ = avg_sse2_8x1_4x1(a, b);
   1.307 +      }
   1.308 +    } else {
   1.309 +      for (; x < (aSourceSize.width - 7); x += 8) {
   1.310 +        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));
   1.311 +
   1.312 +        __m128i a = loadUnaligned128(pixels);
   1.313 +        __m128i b = loadUnaligned128(pixels + 1);
   1.314 +
   1.315 +        *storage++ = avg_sse2_8x1_4x1(a, b);
   1.316 +      }
   1.317 +    }
   1.318 +
   1.319 +    uint32_t *unalignedStorage = (uint32_t*)storage;
    1.320 +    // Take care of the final pixels; we know there's an even number of
    1.321 +    // pixels in the source rectangle.
    1.322 +    //
    1.323 +    // The same overrun considerations as in the previous function apply here.
   1.324 +    for (; x < aSourceSize.width; x += 2) {
   1.325 +      uint32_t *pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));
   1.326 +
   1.327 +      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
   1.328 +    }
   1.329 +  }
   1.330 +}
   1.331 +
    1.332 +} // namespace gfx
    1.333 +} // namespace mozilla
